In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [50]:
DATASET = DatasetPath('DBLP-V1')

In [51]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

In [52]:
df = (
    spark.read.text(DATASET.raw_str('outputacm.txt'), wholetext=False, lineSep='\n\n')
        .withColumn('value', F.split(F.col('value'), '\n'))
)

df.head(10)

[Row(value=['629814', '#*Automated Deduction in Geometry: 5th International Workshop, ADG 2004, Gainesville, FL, USA, September 16-18, 2004, Revised Papers (Lecture Notes in Computer ... / Lecture Notes in Artificial Intelligence)', '#@Hoon Hong,Dongming Wang', '#t2006', '#c', '#index0']),
 Row(value=['#*A+ Certification Core Hardware (Text & Lab Manual)', '#@Charles J. Brooks', '#t2003', '#c', '#index1']),
 Row(value=['#*Performance engineering in industry: current practices and adoption challenges', '#@Ahmed E. Hassan,Parminder Flora', '#t2007', '#cProceedings of the 6th international workshop on Software and performance', '#index2', '#!This panel session discusses performance engineering practices in industry. Presentations in the session will explore the use of lightweight techniques and approaches in order to permit the cost effective and rapid adoption of performance modeling research by large industrial software systems.']),
 Row(value=['#*Dude, You Can Do It! How to Build a Swe

In [53]:
schema = T.StructType([
    T.StructField('title', T.StringType(), True),
    T.StructField('authors', T.ArrayType(T.StringType(), False), False),
    T.StructField('year', T.IntegerType(), True),
    T.StructField('venue', T.StringType(), True),
    T.StructField('index', T.IntegerType(), True),
    T.StructField('references', T.ArrayType(T.StringType(), False), False),
    T.StructField('abstract', T.StringType(), True),
])

@F.udf(returnType=schema)
def parse_citation(lines):
    result = {
        'title': None,
        'authors': [],
        'year': None,
        'venue': None,
        'index': None,
        'references': [],
        'abstract': None,
    }
    for line in lines:
        if line.startswith('#*'):
            result['title'] = line[2:].strip()
        elif line.startswith('#@'):
            result['authors'].extend(line[2:].strip().split(','))
        elif line.startswith('#t'):
            result['year'] = int(line[2:].strip())
        elif line.startswith('#c'):
            result['venue'] = line[2:].strip()
        elif line.startswith('#index'):
            result['index'] = int(line[6:].strip())
        elif line.startswith('#%'):
            result['references'].extend(line[2:].strip().split(','))
        elif line.startswith('#!'):
            result['abstract'] = line[2:].strip()
    return result

df_papers = df.select(
    parse_citation(F.col('value')).alias('parsed_citation')
).select('parsed_citation.*').cache()
df_papers.head(5)

                                                                                

[Row(title='Automated Deduction in Geometry: 5th International Workshop, ADG 2004, Gainesville, FL, USA, September 16-18, 2004, Revised Papers (Lecture Notes in Computer ... / Lecture Notes in Artificial Intelligence)', authors=['Hoon Hong', 'Dongming Wang'], year=2006, venue='', index=0, references=[], abstract=None),
 Row(title='A+ Certification Core Hardware (Text & Lab Manual)', authors=['Charles J. Brooks'], year=2003, venue='', index=1, references=[], abstract=None),
 Row(title='Performance engineering in industry: current practices and adoption challenges', authors=['Ahmed E. Hassan', 'Parminder Flora'], year=2007, venue='Proceedings of the 6th international workshop on Software and performance', index=2, references=[], abstract='This panel session discusses performance engineering practices in industry. Presentations in the session will explore the use of lightweight techniques and approaches in order to permit the cost effective and rapid adoption of performance modeling resea

In [54]:
df_nodes_authors = (
    df_papers.select(
        F.explode(F.col('authors')).alias('name'),
    ).withColumn('id', F.col('name')).distinct()
)
print(df_nodes_authors.count())
df_nodes_authors.head(5)

                                                                                

595775


[Row(name='Jayne Weisblatt', id='Jayne Weisblatt'),
 Row(name='Maryann Barber', id='Maryann Barber'),
 Row(name='Vincent Mooney', id='Vincent Mooney'),
 Row(name='V.K. Jain', id='V.K. Jain'),
 Row(name='Lawrence C. Metzelaar', id='Lawrence C. Metzelaar')]

In [55]:
df_nodes_venues = (
    df_papers.select(
        F.col('venue').alias('id'),
        F.col('venue').alias('name'),
    ).filter("id != ''").distinct()
)
print(df_nodes_venues.count())
df_nodes_venues.head(5)

12609


[Row(id='Proceedings of the 19th annual symposium on Integrated circuits and systems design', name='Proceedings of the 19th annual symposium on Integrated circuits and systems design'),
 Row(id='Workshop on Wireless Security', name='Workshop on Wireless Security'),
 Row(id='Computers and Industrial Engineering', name='Computers and Industrial Engineering'),
 Row(id='ACM Transactions on Storage (TOS)', name='ACM Transactions on Storage (TOS)'),
 Row(id='Computing', name='Computing')]

In [71]:
df_papers.groupby('year').count().orderBy('year').show()

+----+-----+
|year|count|
+----+-----+
|  -1|    9|
|1900|    1|
|1941|    1|
|1947|    1|
|1949|    1|
|1950|    2|
|1951|   21|
|1952|   35|
|1953|   63|
|1954|    6|
|1955|   14|
|1956|   53|
|1957|    9|
|1958|   45|
|1959|   89|
|1960|  174|
|1961|  343|
|1962|  412|
|1963|  310|
|1964|  359|
+----+-----+
only showing top 20 rows



                                                                                

In [72]:
df_nodes_papers = (
    df_papers.filter('year > 1900').select(
        F.col('index').alias('id'),
        'title',
        'authors',
        'venue',
        'year',
        'abstract',
        F.to_timestamp(F.col('year').cast('string'), 'yyyy').alias('timestamp')
    ).distinct()
)
print(df_nodes_papers.count())
df_nodes_papers.head(5)

                                                                                

629804


                                                                                

[Row(id=14, title='Making the Digital City: The Early Shaping of Urban Internet Space (Design & the Built Environment S.)', authors=['Alessandro Aurigi'], venue='', year=2005, abstract=None, timestamp=datetime.datetime(2005, 1, 1, 0, 0)),
 Row(id=228, title='What the Dormouse Said: How the Sixties Counterculture Shaped the Personal Computer Industry', authors=['John Markoff'], venue='', year=2006, abstract=None, timestamp=datetime.datetime(2006, 1, 1, 0, 0)),
 Row(id=435, title='Microsoft Windows Vista Illustrated Essentials', authors=['Steve Johnson'], venue='', year=2007, abstract=None, timestamp=datetime.datetime(2007, 1, 1, 0, 0)),
 Row(id=564, title="Lemony Snicket's A Series of Unfortunate Events Official Strategy Guide", authors=['Dan Birlew'], venue='', year=2004, abstract=None, timestamp=datetime.datetime(2004, 1, 1, 0, 0)),
 Row(id=1273, title="Type Style Finder: The Busy Designer's Guide to Type", authors=['Timothy Samara'], venue='', year=2006, abstract=None, timestamp=date

In [73]:
df_edges_authored = (
    df_papers.select(
        F.explode(F.col('authors')).alias('src'),
        F.col('index').alias('dst'),
    ).distinct()
)
print(df_edges_authored.count())
df_edges_authored.head(5)

1337710


[Row(src='Dean Bagley', dst=87),
 Row(src='Cheryl Boche', dst=214),
 Row(src='Jon Phillips', dst=242),
 Row(src='Jessica Cutler', dst=368),
 Row(src='Gholam R. Amin', dst=704)]

In [74]:
df_edges_published_in = (
    df_papers.select(
        F.col('index').alias('src'),
        F.col('venue').alias('dst'),
    ).filter("dst != ''").distinct()
)
print(df_edges_published_in.count())
df_edges_published_in.head(5)

531220


[Row(src=745, dst='AI Communications'),
 Row(src=836, dst='ACM SIGMICRO Newsletter'),
 Row(src=3054, dst='Proceedings of the 3rd workshop on Programming languages and operating systems: linguistic support for modern operating systems'),
 Row(src=3408, dst='Proceedings of the ninth ACM symposium on Solid modeling and applications'),
 Row(src=5848, dst='Journal of Multivariate Analysis')]

In [75]:
df_edges_cited = (
    df_papers.select(
        F.col('index').alias('src'),
        F.explode(F.col('references')).alias('dst'),
    )
        .withColumn('dst', F.col('dst').cast('int'))
        .distinct()

)
print(df_edges_cited.count())
df_edges_cited.head(5)

632751


[Row(src=1577, dst=531149),
 Row(src=2204, dst=79175),
 Row(src=2699, dst=231000),
 Row(src=2787, dst=226092),
 Row(src=3461, dst=300136)]

In [76]:
df_nodes_authors.write.parquet(DATASET.processed_str('nodes_Author'), mode='overwrite')
df_nodes_venues.write.parquet(DATASET.processed_str('nodes_Venue'), mode='overwrite')
df_nodes_papers.write.parquet(DATASET.processed_str('nodes_Paper'), mode='overwrite')

df_edges_authored.write.parquet(DATASET.processed_str('edges_AUTHORED'), mode='overwrite')
df_edges_published_in.write.parquet(DATASET.processed_str('edges_PUBLISHED_IN'), mode='overwrite')
df_edges_cited.write.parquet(DATASET.processed_str('edges_CITED'), mode='overwrite')

                                                                                

In [77]:
from datasets.build_schema import build_schema

build_schema(
    spark,
    name=str(DATASET),
    nodes=[
        ('Author', DATASET.processed_str('nodes_Author')),
        ('Venue', DATASET.processed_str('nodes_Venue')),
        ('Paper', DATASET.processed_str('nodes_Paper')),
    ],
    edges=[
        ('Authored', 'Author', 'Paper', DATASET.processed_str('edges_AUTHORED')),
        ('PublishedIn', 'Paper', 'Venue', DATASET.processed_str('edges_PUBLISHED_IN')),
        ('Cited', 'Paper', 'Paper', DATASET.processed_str('edges_CITED')),
    ]
)

[2022-01-03 20:13:24,500][/dd_volume/Development/Python/Thesis/code/datasets/datasets/build_schema.py][DEBUG] Merging old schema for DBLP-V1


DatasetSchema(name='DBLP-V1', prefix='DBLP_V1', database='DBLP-V1', description='None', nodes=[NodeSchema(label='Author', path='data/processed/DBLP-V1/nodes_Author', properties=[Property(name='name', type='string', ignore=False, label=True), Property(name='id', type='string', ignore=False, label=False)]), NodeSchema(label='Venue', path='data/processed/DBLP-V1/nodes_Venue', properties=[Property(name='id', type='string', ignore=False, label=False), Property(name='name', type='string', ignore=False, label=True)]), NodeSchema(label='Paper', path='data/processed/DBLP-V1/nodes_Paper', properties=[Property(name='id', type='int', ignore=False, label=False), Property(name='title', type='string', ignore=False, label=True), Property(name='authors', type='string[]', ignore=False, label=False), Property(name='venue', type='string', ignore=False, label=False), Property(name='year', type='int', ignore=False, label=False), Property(name='abstract', type='string', ignore=False, label=False), Property(n