In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [3]:
DATASET = DatasetPath('DBLP-V3')

In [4]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

22/01/22 00:57:31 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/22 00:57:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/22 00:57:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
df = (
    spark.read.text(DATASET.raw_str('DBLPOnlyCitationOct19.txt'), wholetext=False, lineSep='\n\n')
        .withColumn('value', F.split(F.col('value'), '\n'))
)

df.head(10)

[Row(value=['1632442', '#*OQL[C++]: Extending C++ with an Object Query Capability.', '#@José A. Blakeley', '#t1995', '#cModern Database Systems', '#index0']),
 Row(value=['#*Transaction Management in Multidatabase Systems.', '#@Yuri Breitbart,Hector Garcia-Molina,Abraham Silberschatz', '#t1995', '#cModern Database Systems', '#index1']),
 Row(value=['#*Overview of the ADDS System.', '#@Yuri Breitbart,Tom C. Reyes', '#t1995', '#cModern Database Systems', '#index2']),
 Row(value=['#*Multimedia Information Systems: Issues and Approaches.', '#@Stavros Christodoulakis,Leonidas Koveos', '#t1995', '#cModern Database Systems', '#index3']),
 Row(value=['#*Active Database Systems.', '#@Umeshwar Dayal,Eric N. Hanson,Jennifer Widom', '#t1995', '#cModern Database Systems', '#index4', '#%995520']),
 Row(value=['#*Where Object-Oriented DBMSs Should Do Better: A Critique Based on Early Experiences.', '#@Angelika Kotz Dittrich,Klaus R. Dittrich', '#t1995', '#cModern Database Systems', '#index5']),
 Row(

In [6]:
schema = T.StructType([
    T.StructField('title', T.StringType(), True),
    T.StructField('authors', T.ArrayType(T.StringType(), False), False),
    T.StructField('year', T.IntegerType(), True),
    T.StructField('venue', T.StringType(), True),
    T.StructField('index', T.IntegerType(), True),
    T.StructField('references', T.ArrayType(T.StringType(), False), False),
    T.StructField('abstract', T.StringType(), True),
])

@F.udf(returnType=schema)
def parse_citation(lines):
    result = {
        'title': None,
        'authors': [],
        'year': None,
        'venue': None,
        'index': None,
        'references': [],
        'abstract': None,
    }
    for line in lines:
        if line.startswith('#*'):
            result['title'] = line[2:].strip()
        elif line.startswith('#@'):
            result['authors'].extend(line[2:].strip().split(','))
        elif line.startswith('#t'):
            result['year'] = int(line[2:].strip())
        elif line.startswith('#c'):
            result['venue'] = line[2:].strip()
        elif line.startswith('#index'):
            result['index'] = int(line[6:].strip())
        elif line.startswith('#%'):
            result['references'].extend(line[2:].strip().split(','))
        elif line.startswith('#!'):
            result['abstract'] = line[2:].strip()
    return result

df_papers = df.select(
    parse_citation(F.col('value')).alias('parsed_citation')
).select('parsed_citation.*').cache()
df_papers.head(5)

                                                                                

[Row(title='OQL[C++]: Extending C++ with an Object Query Capability.', authors=['José A. Blakeley'], year=1995, venue='Modern Database Systems', index=0, references=[], abstract=None),
 Row(title='Transaction Management in Multidatabase Systems.', authors=['Yuri Breitbart', 'Hector Garcia-Molina', 'Abraham Silberschatz'], year=1995, venue='Modern Database Systems', index=1, references=[], abstract=None),
 Row(title='Overview of the ADDS System.', authors=['Yuri Breitbart', 'Tom C. Reyes'], year=1995, venue='Modern Database Systems', index=2, references=[], abstract=None),
 Row(title='Multimedia Information Systems: Issues and Approaches.', authors=['Stavros Christodoulakis', 'Leonidas Koveos'], year=1995, venue='Modern Database Systems', index=3, references=[], abstract=None),
 Row(title='Active Database Systems.', authors=['Umeshwar Dayal', 'Eric N. Hanson', 'Jennifer Widom'], year=1995, venue='Modern Database Systems', index=4, references=['995520'], abstract=None)]

In [7]:
df_nodes_authors = (
    df_papers.select(
        F.explode(F.col('authors')).alias('name'),
    ).withColumn('id', F.col('name')).filter(F.col('id').isNotNull())
        .dropDuplicates(['id'])
)
print(df_nodes_authors.count())
df_nodes_authors.head(5)

                                                                                

1036991


                                                                                

[Row(name=' Aihua Bao', id=' Aihua Bao'),
 Row(name=' C.D.', id=' C.D.'),
 Row(name=' D.F.', id=' D.F.'),
 Row(name=' F.F.', id=' F.F.'),
 Row(name=' G.A.', id=' G.A.')]

In [8]:
df_nodes_venues = (
    df_papers.select(
        F.col('venue').alias('id'),
        F.col('venue').alias('name'),
    ).filter("id != ''").filter(F.col('id').isNotNull())
        .dropDuplicates(['id'])
)
print(df_nodes_venues.count())
df_nodes_venues.head(5)

7707


[Row(id='10th Anniversary Colloquium of UNU/IIST', name='10th Anniversary Colloquium of UNU/IIST'),
 Row(id='13th Annual Symposium on Switching and Automata Theory', name='13th Annual Symposium on Switching and Automata Theory'),
 Row(id='14th Annual Symposium on Switching and Automata Theory', name='14th Annual Symposium on Switching and Automata Theory'),
 Row(id='15. WLP', name='15. WLP'),
 Row(id='15th Annual Symposium on Switching and Automata Theory', name='15th Annual Symposium on Switching and Automata Theory')]

In [9]:
df_nodes_papers = (
    df_papers.select(
        F.col('index').alias('id'),
        'title',
        'authors',
        'venue',
        'year',
        'abstract',
        F.to_timestamp(F.col('year').cast('string'), 'yyyy').alias('timestamp')
    ).filter(F.col('id').isNotNull())
        .dropDuplicates(['id'])
)
print(df_nodes_papers.count())
df_nodes_papers.head(5)

1632442


                                                                                

[Row(id=31, title='The Changing Database Standards Landscape.', authors=['Craig W. Thompson'], venue='Modern Database Systems', year=1995, abstract=None, timestamp=datetime.datetime(1995, 1, 1, 0, 0)),
 Row(id=34, title='Version Control in an Object-Oriented Architecture.', authors=['Anders Björnerstedt', 'Christer Hulten'], venue='Object-Oriented Concepts, Databases, and Applications', year=1989, abstract=None, timestamp=datetime.datetime(1989, 1, 1, 0, 0)),
 Row(id=53, title='Pogo: A Declarative Representation System for Graphics.', authors=['Mark A. Tarlton', 'P. Nong Tarlton'], venue='Object-Oriented Concepts, Databases, and Applications', year=1989, abstract=None, timestamp=datetime.datetime(1989, 1, 1, 0, 0)),
 Row(id=65, title='Database Design (Introduction to Section 6).', authors=['Michael Stonebraker'], venue='The INGRES Papers', year=1986, abstract=None, timestamp=datetime.datetime(1986, 1, 1, 0, 0)),
 Row(id=78, title='Algorithms', authors=['Robert Sedgewick'], venue='', ye

In [10]:
df_node_ids = (
    df_nodes_authors.select('id')
        .union(df_nodes_venues.select('id'))
        .union(df_nodes_papers.select('id'))
        .distinct()
)

def filter_node_ids(df):
    return df.join(
        df_node_ids,
        F.col('src') == F.col('id'),
        'inner'
    ).drop(
        'id'
    ).join(
        df_node_ids,
        F.col('dst') == F.col('id'),
        'inner'
    ).drop('id')

In [11]:
df_edges_authored = filter_node_ids(
    df_papers.select(
        F.explode(F.col('authors')).alias('src'),
        F.col('index').alias('dst'),
    ).distinct()
)
print(df_edges_authored.count())
df_edges_authored.head(5)

                                                                                

4285548


                                                                                

[Row(src='Craig W. Thompson', dst=31),
 Row(src='Anders Björnerstedt', dst=34),
 Row(src='Christer Hulten', dst=34),
 Row(src='P. Nong Tarlton', dst=53),
 Row(src='Mark A. Tarlton', dst=53)]

In [12]:
df_edges_published_in = filter_node_ids(
    df_papers.select(
        F.col('index').alias('src'),
        F.col('venue').alias('dst'),
    ).filter("dst != ''").distinct()
)
print(df_edges_published_in.count())
df_edges_published_in.head(5)

                                                                                

1630753


                                                                                

[Row(src=18942, dst='ACISP'),
 Row(src=18944, dst='ACISP'),
 Row(src=18956, dst='ACISP'),
 Row(src=18966, dst='ACISP'),
 Row(src=18979, dst='ACISP')]

In [13]:
df_edges_cited = filter_node_ids(
    df_papers.select(
        F.col('index').alias('src'),
        F.explode(F.col('references')).alias('dst'),
    )
        .withColumn('dst', F.col('dst').cast('int'))
        .distinct()

)
print(df_edges_cited.count())
df_edges_cited.head(5)

                                                                                

2327450


                                                                                

[Row(src=1113552, dst=28),
 Row(src=846493, dst=28),
 Row(src=183947, dst=28),
 Row(src=95913, dst=28),
 Row(src=176164, dst=28)]

In [14]:
df_nodes_authors.write.parquet(DATASET.processed_str('nodes_Author'), mode='overwrite')
df_nodes_venues.write.parquet(DATASET.processed_str('nodes_Venue'), mode='overwrite')
df_nodes_papers.write.parquet(DATASET.processed_str('nodes_Paper'), mode='overwrite')

df_edges_authored.write.parquet(DATASET.processed_str('edges_AUTHORED'), mode='overwrite')
df_edges_published_in.write.parquet(DATASET.processed_str('edges_PUBLISHED_IN'), mode='overwrite')
df_edges_cited.write.parquet(DATASET.processed_str('edges_CITED'), mode='overwrite')

                                                                                

In [15]:
from datasets.build_schema import build_schema

build_schema(
    spark,
    name=str(DATASET),
    nodes=[
        ('Author', DATASET.processed_str('nodes_Author')),
        ('Venue', DATASET.processed_str('nodes_Venue')),
        ('Paper', DATASET.processed_str('nodes_Paper')),
    ],
    edges=[
        ('Authored', 'Author', 'Paper', DATASET.processed_str('edges_AUTHORED')),
        ('PublishedIn', 'Paper', 'Venue', DATASET.processed_str('edges_PUBLISHED_IN')),
        ('Cited', 'Paper', 'Paper', DATASET.processed_str('edges_CITED')),
    ]
)

                                                                                

[2022-01-22 00:59:12,920][/dd_volume/Development/Python/Thesis/code/datasets/datasets/build_schema.py][DEBUG] Merging old schema for DBLP-V3


DatasetSchema(name='DBLP-V3', prefix='DBLP_V3', database='DBLP-V3', description='None', nodes=[NodeSchema(path='data/processed/DBLP-V3/nodes_Author', properties=[Property(name='name', type='string', ignore=False, label=True, timestamp=False), Property(name='id', type='string', ignore=False, label=False, timestamp=False)], label='Author', interaction=False), NodeSchema(path='data/processed/DBLP-V3/nodes_Venue', properties=[Property(name='id', type='string', ignore=False, label=False, timestamp=False), Property(name='name', type='string', ignore=False, label=True, timestamp=False)], label='Venue', interaction=False), NodeSchema(path='data/processed/DBLP-V3/nodes_Paper', properties=[Property(name='id', type='int', ignore=False, label=False, timestamp=False), Property(name='title', type='string', ignore=False, label=True, timestamp=False), Property(name='authors', type='string[]', ignore=False, label=False, timestamp=False), Property(name='venue', type='string', ignore=False, label=False, 

# Ground truth communities
Using same methodology as in:

J. Yang and J. Leskovec, “Defining and evaluating network communities based on ground-truth,” in Proceedings of the ACM SIGKDD Workshop on Mining Data Semantics, New York, NY, USA, Aug. 2012, pp. 1–8. doi: 10.1145/2350190.2350193.

In [16]:
print(df_nodes_papers.groupby('venue').count().head(100))
print(df_nodes_papers.filter(F.col('venue').isNull()).head(5))

[Row(venue='Advances in Object-Oriented Data Modeling', count=13), Row(venue='Handbook on Ontologies', count=33), Row(venue='CARS', count=949), Row(venue='CSR', count=223), Row(venue='Workshop on Database Issues for Data Visualization', count=34), Row(venue='EASSS', count=21), Row(venue='KWEPSY', count=31), Row(venue='GIS', count=368), Row(venue='International Conference on Internet Computing', count=847), Row(venue='ICECCS', count=570), Row(venue='Designing Smart Homes', count=11), Row(venue='ICPR (3)', count=796), Row(venue='IHIS', count=13), Row(venue='BIOKDD', count=52), Row(venue='Multimedia Information Retrieval', count=341), Row(venue='WiOpt', count=169), Row(venue='QoSA', count=76), Row(venue='GRAPP (GM/R)', count=46), Row(venue='SCW', count=27), Row(venue='ITSL', count=20), Row(venue='Empirical Software Engineering', count=302), Row(venue='Formal Asp. Comput.', count=573), Row(venue='Int. J. Cooperative Inf. Syst.', count=344), Row(venue='Parts, Hybrids, and Packaging, IEEE Tr

                                                                                

[]


In [17]:
df_nodes_papers.groupby('venue').count().filter('count < 10').count()

2705

In [None]:
df_nodes_papers.groupby('venue').count().filter('count > 100').head(5)

## Cant compute ground truth communities. Venue is actually a journal