In [17]:
# Load IPython's autoreload extension. Re-running this cell in a live kernel
# only prints an "already loaded" notice (as in the captured output below).
%load_ext autoreload
# Select autoreload mode 2 — see the IPython autoreload docs for the exact
# reload semantics of each mode.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [19]:
# Handle for the 'DBLP-V3' dataset. Later cells call DATASET.raw_str(...) and
# DATASET.processed_str(...) to build input/output paths, and use str(DATASET)
# as the Spark app name and schema name.
DATASET = DatasetPath('DBLP-V3')

In [20]:
# Session-level Spark settings, applied to the builder in this order.
spark_settings = {
    'spark.sql.legacy.timeParserPolicy': 'LEGACY',
    'spark.executor.memory': '8g',
    'spark.driver.memory': '8g',
    'spark.memory.offHeap.enabled': True,
    'spark.memory.offHeap.size': '16g',
}

# The application is named after the dataset being processed.
session_builder = SparkSession.builder.appName(f'{DATASET}')
for option, setting in spark_settings.items():
    session_builder = session_builder.config(option, setting)

# Reuses an already-running session if one exists, otherwise starts one.
spark = session_builder.getOrCreate()

In [21]:
# Read the raw dump with a blank line ('\n\n') as the record separator, so
# each row holds one whole record, then split every record into its lines.
raw_path = DATASET.raw_str('DBLPOnlyCitationOct19.txt')
raw_records = spark.read.text(raw_path, wholetext=False, lineSep='\n\n')
df = raw_records.withColumn('value', F.split(F.col('value'), '\n'))

df.head(10)

[Row(value=['1632442', '#*OQL[C++]: Extending C++ with an Object Query Capability.', '#@José A. Blakeley', '#t1995', '#cModern Database Systems', '#index0']),
 Row(value=['#*Transaction Management in Multidatabase Systems.', '#@Yuri Breitbart,Hector Garcia-Molina,Abraham Silberschatz', '#t1995', '#cModern Database Systems', '#index1']),
 Row(value=['#*Overview of the ADDS System.', '#@Yuri Breitbart,Tom C. Reyes', '#t1995', '#cModern Database Systems', '#index2']),
 Row(value=['#*Multimedia Information Systems: Issues and Approaches.', '#@Stavros Christodoulakis,Leonidas Koveos', '#t1995', '#cModern Database Systems', '#index3']),
 Row(value=['#*Active Database Systems.', '#@Umeshwar Dayal,Eric N. Hanson,Jennifer Widom', '#t1995', '#cModern Database Systems', '#index4', '#%995520']),
 Row(value=['#*Where Object-Oriented DBMSs Should Do Better: A Critique Based on Early Experiences.', '#@Angelika Kotz Dittrich,Klaus R. Dittrich', '#t1995', '#cModern Database Systems', '#index5']),
 Row(

In [22]:
# Struct returned by the citation parser: one field per record attribute.
# The two array fields are non-nullable and hold non-null strings; every
# scalar field is nullable.
schema = (
    T.StructType()
    .add('title', T.StringType(), True)
    .add('authors', T.ArrayType(T.StringType(), False), False)
    .add('year', T.IntegerType(), True)
    .add('venue', T.StringType(), True)
    .add('index', T.IntegerType(), True)
    .add('references', T.ArrayType(T.StringType(), False), False)
    .add('abstract', T.StringType(), True)
)

@F.udf(returnType=schema)
def parse_citation(lines):
    """Parse the lines of one citation record into a struct matching ``schema``.

    Each line is dispatched on its prefix:

    * ``#*``     -> title
    * ``#@``     -> comma-separated author names
    * ``#t``     -> publication year
    * ``#c``     -> venue
    * ``#index`` -> paper index
    * ``#%``     -> cited paper id(s)
    * ``#!``     -> abstract

    Lines with any other prefix (for example the bare number that leads the
    first record of the file) are ignored.

    Args:
        lines: List of strings making up one record. ``None`` is treated as
            an empty record.

    Returns:
        dict with the keys ``title``, ``authors``, ``year``, ``venue``,
        ``index``, ``references`` and ``abstract``. Fields without a matching
        line keep their defaults (``None`` or ``[]``). ``year`` and ``index``
        are ``None`` when their value is missing or not an integer, instead of
        raising and failing the whole Spark job. Author and reference entries
        are stripped of surrounding whitespace and empty entries are dropped,
        so a bare ``#@`` or ``#%`` line does not produce an empty-string item.
    """
    def to_int(text):
        # A malformed number must not abort the job; the schema allows nulls.
        try:
            return int(text)
        except ValueError:
            return None

    def split_items(text):
        # ''.split(',') is [''], so empty entries are filtered explicitly.
        stripped = (item.strip() for item in text.split(','))
        return [item for item in stripped if item]

    result = {
        'title': None,
        'authors': [],
        'year': None,
        'venue': None,
        'index': None,
        'references': [],
        'abstract': None,
    }
    for line in lines or []:
        if line.startswith('#*'):
            result['title'] = line[2:].strip()
        elif line.startswith('#@'):
            result['authors'].extend(split_items(line[2:]))
        elif line.startswith('#t'):
            result['year'] = to_int(line[2:].strip())
        elif line.startswith('#c'):
            result['venue'] = line[2:].strip()
        elif line.startswith('#index'):
            result['index'] = to_int(line[6:].strip())
        elif line.startswith('#%'):
            result['references'].extend(split_items(line[2:]))
        elif line.startswith('#!'):
            result['abstract'] = line[2:].strip()
    return result

# Parse every record into a struct, then flatten the struct's fields into
# top-level columns. The result is cached because later cells reuse it.
parsed_citation = parse_citation(F.col('value')).alias('parsed_citation')
df_papers = df.select(parsed_citation).select('parsed_citation.*').cache()
df_papers.head(5)

                                                                                

[Row(title='OQL[C++]: Extending C++ with an Object Query Capability.', authors=['José A. Blakeley'], year=1995, venue='Modern Database Systems', index=0, references=[], abstract=None),
 Row(title='Transaction Management in Multidatabase Systems.', authors=['Yuri Breitbart', 'Hector Garcia-Molina', 'Abraham Silberschatz'], year=1995, venue='Modern Database Systems', index=1, references=[], abstract=None),
 Row(title='Overview of the ADDS System.', authors=['Yuri Breitbart', 'Tom C. Reyes'], year=1995, venue='Modern Database Systems', index=2, references=[], abstract=None),
 Row(title='Multimedia Information Systems: Issues and Approaches.', authors=['Stavros Christodoulakis', 'Leonidas Koveos'], year=1995, venue='Modern Database Systems', index=3, references=[], abstract=None),
 Row(title='Active Database Systems.', authors=['Umeshwar Dayal', 'Eric N. Hanson', 'Jennifer Widom'], year=1995, venue='Modern Database Systems', index=4, references=['995520'], abstract=None)]

In [23]:
# Author nodes: one row per distinct author name; the name doubles as the id.
author_names = df_papers.select(F.explode(F.col('authors')).alias('name'))
df_nodes_authors = author_names.withColumn('id', F.col('name')).distinct()
print(df_nodes_authors.count())
df_nodes_authors.head(5)

                                                                                

1036991


[Row(name='Hans Uszkoreit', id='Hans Uszkoreit'),
 Row(name='Marc Lohmann', id='Marc Lohmann'),
 Row(name='Dianne Willis', id='Dianne Willis'),
 Row(name='Suzanne M. Embury', id='Suzanne M. Embury'),
 Row(name='Mary Elizabeth Brabston', id='Mary Elizabeth Brabston')]

In [24]:
# Venue nodes: one row per distinct venue string, used as both id and name.
# Rows with an empty venue are dropped.
venue_column = F.col('venue')
df_nodes_venues = (
    df_papers
    .select(venue_column.alias('id'), venue_column.alias('name'))
    .filter("id != ''")
    .distinct()
)
print(df_nodes_venues.count())
df_nodes_venues.head(5)

7707


[Row(id='Prolog and Databases', name='Prolog and Databases'),
 Row(id='Information Modeling in the New Millennium', name='Information Modeling in the New Millennium'),
 Row(id='Web-Powered Databases', name='Web-Powered Databases'),
 Row(id='AAMAS (Industry Track)', name='AAMAS (Industry Track)'),
 Row(id='Implementations of Prolog', name='Implementations of Prolog')]

In [25]:
# Paper nodes: the parsed fields (with 'index' renamed to 'id') plus a
# 'timestamp' column obtained by parsing the year with the 'yyyy' pattern.
year_as_timestamp = F.to_timestamp(F.col('year').cast('string'), 'yyyy')
paper_columns = [
    F.col('index').alias('id'),
    'title',
    'authors',
    'venue',
    'year',
    'abstract',
    year_as_timestamp.alias('timestamp'),
]
df_nodes_papers = df_papers.select(*paper_columns).distinct()
print(df_nodes_papers.count())
df_nodes_papers.head(5)

                                                                                

1632442


                                                                                

[Row(id=621, title='Machine Learning, Neural and Statistical Classification', authors=['Donald Michie', 'David J. Spiegelhalter', 'C. C. Taylor'], venue='', year=1994, abstract=None, timestamp=datetime.datetime(1994, 1, 1, 0, 0)),
 Row(id=892, title='Diffusion of Innovations in Organisations.', authors=['Davood Askarany'], venue='Encyclopedia of Information Science and Technology (II)', year=2005, abstract=None, timestamp=datetime.datetime(2005, 1, 1, 0, 0)),
 Row(id=902, title='Migrating Legacy Systems to the Web.', authors=['Lerina Aversano', 'Gerardo Canfora', 'Andrea De Lucia'], venue='Encyclopedia of Information Science and Technology (IV)', year=2005, abstract=None, timestamp=datetime.datetime(2005, 1, 1, 0, 0)),
 Row(id=922, title='Web Access by Older Adult Users.', authors=['Shirley Ann Becker'], venue='Encyclopedia of Information Science and Technology (V)', year=2005, abstract=None, timestamp=datetime.datetime(2005, 1, 1, 0, 0)),
 Row(id=1187, title='Hierarchies in Multidimen

In [26]:
# AUTHORED edges: author name (src) -> paper index (dst), one row per pair.
author_column = F.explode(F.col('authors')).alias('src')
paper_column = F.col('index').alias('dst')
df_edges_authored = df_papers.select(author_column, paper_column).distinct()
print(df_edges_authored.count())
df_edges_authored.head(5)

4285548


[Row(src='Daniel H. Fishman', dst=40),
 Row(src='Hartmut Wedekind', dst=221),
 Row(src='Hartmut Wedekind', dst=222),
 Row(src='Rajiv Gupta', dst=240),
 Row(src='Robert W. Brennan', dst=271)]

In [27]:
# PUBLISHED_IN edges: paper index (src) -> venue string (dst).
# Rows with an empty venue are dropped.
paper_to_venue = df_papers.select(
    F.col('index').alias('src'),
    F.col('venue').alias('dst'),
)
df_edges_published_in = paper_to_venue.filter("dst != ''").distinct()
print(df_edges_published_in.count())
df_edges_published_in.head(5)

1630753


[Row(src=16, dst='Modern Database Systems'),
 Row(src=53, dst='Object-Oriented Concepts, Databases, and Applications'),
 Row(src=398, dst='The Computer Science and Engineering Handbook'),
 Row(src=767, dst='Outstanding Dissertations in the Computer Sciences'),
 Row(src=933, dst='Encyclopedia of Information Science and Technology (I)')]

In [28]:
# CITED edges: citing paper index (src) -> referenced paper id (dst).
# References are parsed as strings, so dst is cast to int here.
paper_to_reference = df_papers.select(
    F.col('index').alias('src'),
    F.explode(F.col('references')).alias('dst'),
)
df_edges_cited = (
    paper_to_reference
    .withColumn('dst', F.col('dst').cast('int'))
    .distinct()
)
print(df_edges_cited.count())
df_edges_cited.head(5)

2327450


[Row(src=69, dst=929563),
 Row(src=69, dst=3146),
 Row(src=69, dst=158),
 Row(src=69, dst=605422),
 Row(src=78, dst=949858)]

In [29]:
# Persist every node and edge table as parquet under the processed-data
# location, replacing any previous output. Nodes are written first, then edges.
processed_tables = [
    (df_nodes_authors, 'nodes_Author'),
    (df_nodes_venues, 'nodes_Venue'),
    (df_nodes_papers, 'nodes_Paper'),
    (df_edges_authored, 'edges_AUTHORED'),
    (df_edges_published_in, 'edges_PUBLISHED_IN'),
    (df_edges_cited, 'edges_CITED'),
]
for frame, table_name in processed_tables:
    frame.write.parquet(DATASET.processed_str(table_name), mode='overwrite')

                                                                                

In [30]:
from datasets.build_schema import build_schema

# (label, parquet path) for each node table written earlier.
node_sources = [
    ('Author', DATASET.processed_str('nodes_Author')),
    ('Venue', DATASET.processed_str('nodes_Venue')),
    ('Paper', DATASET.processed_str('nodes_Paper')),
]
# (edge type, source label, target label, parquet path) for each edge table.
edge_sources = [
    ('Authored', 'Author', 'Paper', DATASET.processed_str('edges_AUTHORED')),
    ('PublishedIn', 'Paper', 'Venue', DATASET.processed_str('edges_PUBLISHED_IN')),
    ('Cited', 'Paper', 'Paper', DATASET.processed_str('edges_CITED')),
]

# Kept as the last expression so the returned schema is displayed by the cell.
build_schema(
    spark,
    name=str(DATASET),
    nodes=node_sources,
    edges=edge_sources,
)

[2022-01-03 18:56:01,636][/dd_volume/Development/Python/Thesis/code/datasets/datasets/build_schema.py][DEBUG] Merging old schema for DBLP-V3


DatasetSchema(name='DBLP-V3', prefix='DBLP_V3', database='DBLP-V3', description='None', nodes=[NodeSchema(label='Author', path='data/processed/DBLP-V3/nodes_Author', properties=[Property(name='name', type='string', ignore=False, label=False), Property(name='id', type='string', ignore=False, label=False)]), NodeSchema(label='Venue', path='data/processed/DBLP-V3/nodes_Venue', properties=[Property(name='id', type='string', ignore=False, label=False), Property(name='name', type='string', ignore=False, label=False)]), NodeSchema(label='Paper', path='data/processed/DBLP-V3/nodes_Paper', properties=[Property(name='id', type='int', ignore=False, label=False), Property(name='title', type='string', ignore=False, label=False), Property(name='authors', type='string[]', ignore=False, label=False), Property(name='venue', type='string', ignore=False, label=False), Property(name='year', type='int', ignore=False, label=False), Property(name='abstract', type='string', ignore=False, label=False), Propert