In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Exception in thread "main" java.nio.file.NoSuchFileException: /tmp/tmpx511epyb/connection5286146727792970879.info
	at sun.nio.fs.UnixException.translateToIOException(UnixException.java:86)
	at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:102)
	at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:107)
	at sun.nio.fs.UnixFileSystemProvider.newByteChannel(UnixFileSystemProvider.java:214)
	at java.nio.file.Files.newByteChannel(Files.java:361)
	at java.nio.file.Files.createFile(Files.java:632)
	at java.nio.file.TempFileHelper.create(TempFileHelper.java:138)
	at java.nio.file.TempFileHelper.createTempFile(TempFileHelper.java:161)
	at java.nio.file.Files.createTempFile(Files.java:852)
	at org.apache.spark.api.python.PythonGatewayServer$.main(PythonGate

In [7]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('DBLP-V1')
DATASET.save_schema()

In [8]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

22/01/23 01:42:56 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/23 01:42:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/23 01:42:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/23 01:42:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/23 01:42:58 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [9]:
df = (
    spark.read.text(DATASET.raw_str('outputacm.txt'), wholetext=False, lineSep='\n\n')
        .withColumn('value', F.split(F.col('value'), '\n'))
)

df.head(10)

[Row(value=['629814', '#*Automated Deduction in Geometry: 5th International Workshop, ADG 2004, Gainesville, FL, USA, September 16-18, 2004, Revised Papers (Lecture Notes in Computer ... / Lecture Notes in Artificial Intelligence)', '#@Hoon Hong,Dongming Wang', '#t2006', '#c', '#index0']),
 Row(value=['#*A+ Certification Core Hardware (Text & Lab Manual)', '#@Charles J. Brooks', '#t2003', '#c', '#index1']),
 Row(value=['#*Performance engineering in industry: current practices and adoption challenges', '#@Ahmed E. Hassan,Parminder Flora', '#t2007', '#cProceedings of the 6th international workshop on Software and performance', '#index2', '#!This panel session discusses performance engineering practices in industry. Presentations in the session will explore the use of lightweight techniques and approaches in order to permit the cost effective and rapid adoption of performance modeling research by large industrial software systems.']),
 Row(value=['#*Dude, You Can Do It! How to Build a Swe

In [10]:
schema = T.StructType([
    T.StructField('title', T.StringType(), True),
    T.StructField('authors', T.ArrayType(T.StringType(), False), False),
    T.StructField('year', T.IntegerType(), True),
    T.StructField('venue', T.StringType(), True),
    T.StructField('index', T.StringType(), True),
    T.StructField('references', T.ArrayType(T.StringType(), False), False),
    T.StructField('abstract', T.StringType(), True),
])

@F.udf(returnType=schema)
def parse_citation(lines):
    result = {
        'title': None,
        'authors': [],
        'year': None,
        'venue': None,
        'index': None,
        'references': [],
        'abstract': None,
    }
    for line in lines:
        if line.startswith('#*'):
            result['title'] = line[2:].strip()
        elif line.startswith('#@'):
            result['authors'].extend(line[2:].strip().split(','))
        elif line.startswith('#t'):
            result['year'] = int(line[2:].strip())
        elif line.startswith('#c'):
            result['venue'] = line[2:].strip()
        elif line.startswith('#index'):
            result['index'] = int(line[6:].strip())
        elif line.startswith('#%'):
            result['references'].extend(line[2:].strip().split(','))
        elif line.startswith('#!'):
            result['abstract'] = line[2:].strip()
    return result

df_papers = df.select(
    parse_citation(F.col('value')).alias('parsed_citation')
).select('parsed_citation.*').cache()
df_papers.head(5)

                                                                                

[Row(title='Automated Deduction in Geometry: 5th International Workshop, ADG 2004, Gainesville, FL, USA, September 16-18, 2004, Revised Papers (Lecture Notes in Computer ... / Lecture Notes in Artificial Intelligence)', authors=['Hoon Hong', 'Dongming Wang'], year=2006, venue='', index='0', references=[], abstract=None),
 Row(title='A+ Certification Core Hardware (Text & Lab Manual)', authors=['Charles J. Brooks'], year=2003, venue='', index='1', references=[], abstract=None),
 Row(title='Performance engineering in industry: current practices and adoption challenges', authors=['Ahmed E. Hassan', 'Parminder Flora'], year=2007, venue='Proceedings of the 6th international workshop on Software and performance', index='2', references=[], abstract='This panel session discusses performance engineering practices in industry. Presentations in the session will explore the use of lightweight techniques and approaches in order to permit the cost effective and rapid adoption of performance modeling

In [11]:
df_nodes_authors = (
    df_papers.select(
        F.explode(F.col('authors')).alias('name'),
    ).withColumn('id', F.col('name'))
     .filter(F.col('id').isNotNull())
        .dropDuplicates(['id'])
)
print(df_nodes_authors.count())
df_nodes_authors.head(5)

                                                                                

595775


                                                                                

[Row(name=' B. West', id=' B. West'),
 Row(name=' Consumer Electronics Society Staff', id=' Consumer Electronics Society Staff'),
 Row(name=' D. Crookes', id=' D. Crookes'),
 Row(name=' H', id=' H'),
 Row(name=' H. Hsu', id=' H. Hsu')]

In [12]:
df_nodes_venues = (
    df_papers.select(
        F.col('venue').alias('id'),
        F.col('venue').alias('name'),
    ).filter("id != ''")
     .filter(F.col('id').isNotNull())
        .dropDuplicates(['id'])
)
print(df_nodes_venues.count())
df_nodes_venues.head(5)

12609


[Row(id='(1987)', name='(1987)'),
 Row(id='(1992&ndash;1993)', name='(1992&ndash;1993)'),
 Row(id='(1993&ndash;1994)', name='(1993&ndash;1994)'),
 Row(id='(1994 Supplement)', name='(1994 Supplement)'),
 Row(id='(Fall 1991)', name='(Fall 1991)')]

In [13]:
df_papers.groupby('year').count().orderBy('year').show()

+----+-----+
|year|count|
+----+-----+
|  -1|    9|
|1900|    1|
|1941|    1|
|1947|    1|
|1949|    1|
|1950|    2|
|1951|   21|
|1952|   35|
|1953|   63|
|1954|    6|
|1955|   14|
|1956|   53|
|1957|    9|
|1958|   45|
|1959|   89|
|1960|  174|
|1961|  343|
|1962|  412|
|1963|  310|
|1964|  359|
+----+-----+
only showing top 20 rows



In [14]:
df_nodes_papers = (
    df_papers.filter('year > 1900').select(
        F.col('index').alias('id'),
        'title',
        'authors',
        'venue',
        'year',
        'abstract',
        F.to_timestamp(F.col('year').cast('string'), 'yyyy').alias('timestamp')
    ) .filter(F.col('id').isNotNull())
        .dropDuplicates(['id'])
)
print(df_nodes_papers.count())
df_nodes_papers.head(5)

629804


                                                                                

[Row(id='100010', title='MPICH-V2: a Fault Tolerant MPI for Volatile Nodes based on Pessimistic Sender Based Message Logging', authors=['Aurélien Bouteiller', 'Franck Cappello', 'Thomas Herault', 'Géraud Krawezik', 'Pierre Lemarinier', 'Frédéric Magniette'], venue='Proceedings of the 2003 ACM/IEEE conference on Supercomputing', year=2003, abstract='Execution of MPI applications on clusters and Grid deployments suffering from node and network failures motivates the use of fault tolerant MPI implementations. We present MPICH-V2 (the second protocol of MPICH-V project), an automatic fault tolerant MPI implementation using an innovative protocol that removes the most limiting factor of the pessimistic message logging approach: reliable logging of in transit messages. MPICH-V2 relies on uncoordinated checkpointing, sender based message logging and remote reliable logging of message logical clocks. This paper presents the architecture of MPICH-V2, its theoretical foundation and the performan

In [15]:
df_node_ids = (
    df_nodes_authors.select('id')
        .union(df_nodes_venues.select('id'))
        .union(df_nodes_papers.select('id'))
        .distinct()
)

def filter_node_ids(df):
    return df.join(
        df_node_ids,
        F.col('src') == F.col('id'),
        'inner'
    ).drop(
        'id'
    ).join(
        df_node_ids,
        F.col('dst') == F.col('id'),
        'inner'
    ).drop('id')

In [16]:
df_edges_authored = filter_node_ids(
    df_papers.select(
        F.explode(F.col('authors')).alias('src'),
        F.col('index').alias('dst'),
    ).distinct()
)
print(df_edges_authored.count())
df_edges_authored.head(5)

                                                                                

1337700


                                                                                

[Row(src='Frédéric Magniette', dst='100010'),
 Row(src='Pierre Lemarinier', dst='100010'),
 Row(src='Thomas Herault', dst='100010'),
 Row(src='Géraud Krawezik', dst='100010'),
 Row(src='Aurélien Bouteiller', dst='100010')]

In [17]:
df_edges_published_in = filter_node_ids(
    df_papers.select(
        F.col('index').alias('src'),
        F.col('venue').alias('dst'),
    ).filter("dst != ''").distinct()
)
print(df_edges_published_in.count())
df_edges_published_in.head(5)

                                                                                

531218


                                                                                

[Row(src='164367', dst='(March 1987)'),
 Row(src='515847', dst='(May 1991)'),
 Row(src='528823', dst='(May 1991)'),
 Row(src='517467', dst='(May 1991)'),
 Row(src='600054', dst='3C ON-LINE')]

In [18]:
df_edges_cited = filter_node_ids(
    df_papers.select(
        F.col('index').alias('src'),
        F.explode(F.col('references')).alias('dst'),
    )
        .withColumn('dst', F.col('dst').cast('long').cast('string'))
        .distinct()

)
print(df_edges_cited.count())
df_edges_cited.head(5)

                                                                                

632751


                                                                                

[Row(src='581674', dst='100010'),
 Row(src='410794', dst='100010'),
 Row(src='109295', dst='100010'),
 Row(src='50186', dst='100010'),
 Row(src='415704', dst='100090')]

In [19]:
df_nodes_authors.write.parquet(DATASET.processed_str('nodes_Author'), mode='overwrite')
df_nodes_venues.write.parquet(DATASET.processed_str('nodes_Venue'), mode='overwrite')
df_nodes_papers.write.parquet(DATASET.processed_str('nodes_Paper'), mode='overwrite')

df_edges_authored.write.parquet(DATASET.processed_str('edges_AUTHORED'), mode='overwrite')
df_edges_published_in.write.parquet(DATASET.processed_str('edges_PUBLISHED_IN'), mode='overwrite')
df_edges_cited.write.parquet(DATASET.processed_str('edges_CITED'), mode='overwrite')

                                                                                

In [20]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema('Author', NodeSchema.from_spark(df_nodes_authors.schema, label='name'))
        .add_node_schema('Venue', NodeSchema.from_spark(df_nodes_venues.schema, label='name'))
        .add_node_schema('Paper', NodeSchema.from_spark(df_nodes_papers.schema, label='title', timestamp='timestamp', interaction=False))
        .add_edge_schema('AUTHORED', EdgeSchema.from_spark(df_edges_authored.schema, source_type='Author', target_type='Paper', directed=True))
        .add_edge_schema('PUBLISHED_IN', EdgeSchema.from_spark(df_edges_published_in.schema, source_type='Paper', target_type='Venue', directed=True))
        .add_edge_schema('CITED', EdgeSchema.from_spark(df_edges_cited.schema, source_type='Paper', target_type='Paper', directed=True))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/datasets/data/processed/DBLP-V1'), nodes={'Author': NodeSchema(_type='Author', _schema=..., label='name', properties={'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False))}, dynamic=None), 'Venue': NodeSchema(_type='Venue', _schema=..., label='name', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False))}, dynamic=None), 'Paper': NodeSchema(_type='Paper', _schema=..., label='title', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'title': GraphProperty(_name='title', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'authors': GraphProperty(_na

# Ground truth communities
Using same methodology as in:

J. Yang and J. Leskovec, “Defining and evaluating network communities based on ground-truth,” in Proceedings of the ACM SIGKDD Workshop on Mining Data Semantics, New York, NY, USA, Aug. 2012, pp. 1–8. doi: 10.1145/2350190.2350193.

In [21]:
print(df_nodes_papers.groupby('venue').count().head(100))
print(df_nodes_papers.filter(F.col('venue').isNull()).head(5))

[Row(venue='Proceedings of the 2009 ICSE Workshop on Comparison and Versioning of Software Models', count=12), Row(venue='Empirical Software Engineering', count=45), Row(venue='Computers in the Schools', count=351), Row(venue='The IFIP TC2/WG 2.1 Working Conference on Program specification and transformation', count=24), Row(venue='Library Hi Tech', count=315), Row(venue="International Workshop All '86 on Analogical and inductive inference", count=16), Row(venue='Proceedings of the Third International Conference on Medical Image Computing and Computer-Assisted Intervention', count=133), Row(venue='Proceedings of the IFIP TC8/WG8.1 Working Conference on Information Systems in the WWW Environment', count=21), Row(venue='Workshop on Evolutionary Models and Strategies, Workshop on Parallel Processing: Logic, Organization, and Technology: Parallelism, Learning, Evolution', count=27), Row(venue='Proceedings of the 14th Annual International Cryptology Conference on Advances in Cryptology', co

                                                                                

[]


In [22]:
df_nodes_papers.groupby('venue').count().filter('count > 100').count()

928

In [23]:
df_nodes_papers.groupby('venue').count().filter('count > 100').head(5)

[Row(venue='Computers in the Schools', count=351),
 Row(venue='Library Hi Tech', count=315),
 Row(venue='Proceedings of the Third International Conference on Medical Image Computing and Computer-Assisted Intervention', count=133),
 Row(venue='Proceedings of the 29th International Conference on Software Engineering Workshops', count=127),
 Row(venue='Proceedings of the 32nd IEEE Conference on Local Computer Networks', count=159)]

## Cant compute ground truth communities. Venue is actually a journal