In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [3]:
from shared.schema import DatasetSchema

# Name of the dataset whose schema drives every raw/processed path below.
DATASET_NAME = 'DBLP-HCN'

DATASET = DatasetSchema.load_schema(DATASET_NAME)
# NOTE(review): presumably persists the loaded schema back to disk — confirm
# against shared.schema.DatasetSchema.
DATASET.save_schema()

In [4]:
# Spark settings, applied to the builder in the order listed.
SPARK_SETTINGS = [
    # NOTE(review): presumably needed for the 'yyyy' timestamp pattern used
    # when building the paper nodes — confirm.
    ('spark.sql.legacy.timeParserPolicy', 'LEGACY'),
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
    ("spark.memory.offHeap.enabled", True),
    ("spark.memory.offHeap.size", "16g"),
]

session_builder = SparkSession.builder.appName(f'{DATASET}')
for setting_name, setting_value in SPARK_SETTINGS:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

22/01/23 01:31:24 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/23 01:31:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/23 01:31:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/23 01:31:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/23 01:31:26 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/01/23 01:31:26 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/01/23 01:31:26 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/01/23 01:31:26

In [5]:
# Column layout of the raw file, in file order; every field is nullable.
schema = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('id', T.StringType()),
        ('title', T.StringType()),
        ('venue', T.StringType()),
        ('year', T.IntegerType()),
        ('authors', T.StringType()),
        ('abstract', T.StringType()),
    ]
])

# Fields in the raw file are separated by three consecutive tabs.
raw_papers = spark.read.csv(DATASET.raw_str('author.txt'), schema=schema, sep='\t\t\t')

# `authors` is read as a single ';'-separated string; convert it into an
# array of whitespace-trimmed names.
# NOTE(review): sample outputs contain names ending in a bare HTML entity
# fragment (e.g. 'K. Sel&ccedil'), which may come from splitting on the ';'
# that terminates an entity — verify against the raw file.
authors_array = F.transform(F.split(F.col('authors'), ';'), F.trim)
df = raw_papers.withColumn('authors', authors_array)

df.head(10)

                                                                                

[Row(id='104', title='SWORD: workload-aware data placement and replica selection for cloud data management systems.', venue='vldb', year=2014, authors=['K. Ashwin Kumar', 'Abdul Quamar'], abstract='Cloud computing is increasingly being seen as a way to reduce infrastructure costs and add elasticity, and is being used by a wide range of organizations. Cloud data management systems today need to serve a range of different workloads, from analytical read-heavy workloads to transactional (OLTP) workloads. For both the service providers and the users, it is critical to minimize the consumption of resources like CPU, memory, communication bandwidth, and energy, without compromising on service-level agreements if any. In this article, we develop a workload-aware data placement and replication approach, called SWORD, for minimizing resource consumption in such an environment. Specifically, we monitor and model the expected workload as a hypergraph and develop partitioning techniques that minim

In [6]:
# One Author node per distinct author name; the name doubles as the node id.
exploded_author_names = df.select(F.explode(F.col('authors')).alias('name'))
df_nodes_authors = (
    exploded_author_names
    .select('name', F.col('name').alias('id'))
    .distinct()
)
print(df_nodes_authors.count())
df_nodes_authors.head(5)

                                                                                

5162


[Row(name='Abdul Quamar', id='Abdul Quamar'),
 Row(name='Karen Stepanyan', id='Karen Stepanyan'),
 Row(name='Wang-Pin Hsiung', id='Wang-Pin Hsiung'),
 Row(name='Hailong Sun', id='Hailong Sun'),
 Row(name='James Bailey', id='James Bailey')]

In [7]:
# One Venue node per distinct, non-empty venue string; the venue string is
# used both as node id and as display name.
venue_column = F.col('venue')
df_nodes_venues = (
    df.select(venue_column.alias('id'), venue_column.alias('name'))
    .filter("id != ''")
    .distinct()
)
print(df_nodes_venues.count())
df_nodes_venues.head(5)

14


[Row(id='vldb', name='vldb'),
 Row(id='www', name='www'),
 Row(id='icde', name='icde'),
 Row(id='nips', name='nips'),
 Row(id='icdm', name='icdm')]

In [8]:
# Number of raw rows per publication year, in ascending year order.
(
    df.groupby('year')
    .count()
    .orderBy('year')
    .show()
)

+----+-----+
|year|count|
+----+-----+
|2012|  844|
|2013| 1370|
|2014| 1336|
|2015| 1562|
|2016|  404|
+----+-----+



In [9]:
# Timestamp of a paper: its publication year parsed with the 'yyyy' pattern.
paper_timestamp = F.to_timestamp(F.col('year').cast('string'), 'yyyy').alias('timestamp')

# Paper node columns, in output order.
paper_columns = [
    F.col('id').alias('id'),
    'title',
    'authors',
    'venue',
    'year',
    'abstract',
    paper_timestamp,
]

# Only rows with a year after 1900 become Paper nodes.
df_nodes_papers = df.filter('year > 1900').select(*paper_columns).distinct()
print(df_nodes_papers.count())
df_nodes_papers.head(5)

                                                                                

5511


                                                                                

[Row(id='840', title='Layered processing of skyline-window-join (SWJ) queries using iteration-fabric.', authors=['Mithila Nagendra', 'K. Sel&ccedil'], venue='icde', year=2013, abstract='The problem of finding interesting tuples in a data set, more commonly known as the skyline problem, has been extensively studied in scenarios where the data is static. More recently, skyline research has moved towards data streaming environments, where tuples arrive/expire in a continuous manner. Several algorithms have been developed to track skyline changes over sliding windows; however, existing methods focus on skyline analysis in which all required skyline attributes belong to a single incoming data stream. This constraint renders current algorithms unsuitable for applications that require a real-time “join” operation to be carried out between multiple incoming data streams, arriving from different sources, before the skyline query can be answered. Based on this motivation, in this paper, we addre

In [10]:
# Ids of every node (authors, venues and papers) collected into a single
# duplicate-free frame; used to validate edge endpoints.
author_ids = df_nodes_authors.select('id')
venue_ids = df_nodes_venues.select('id')
paper_ids = df_nodes_papers.select('id')

df_node_ids = author_ids.union(venue_ids).union(paper_ids).distinct()


def filter_node_ids(df):
    """Keep only the edges whose both endpoints are known node ids.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Edge frame with `src` and `dst` columns. It must not have an `id`
        column of its own, since the join conditions refer to `id` by name.

    Returns
    -------
    pyspark.sql.DataFrame
        The rows of `df`, with the same columns, for which both `src` and
        `dst` occur in the `id` column of the notebook-level `df_node_ids`.
    """
    # A left-semi join only filters the left side: it adds no columns and can
    # never duplicate an edge, so no helper `id` column has to be dropped and
    # the result does not rely on `df_node_ids` being duplicate-free.
    with_known_src = df.join(
        df_node_ids,
        F.col('src') == F.col('id'),
        'left_semi'
    )
    return with_known_src.join(
        df_node_ids,
        F.col('dst') == F.col('id'),
        'left_semi'
    )

In [11]:
# AUTHORED edges: one (author name -> paper id) pair per author of a paper.
author_paper_pairs = df.select(
    F.explode(F.col('authors')).alias('src'),
    F.col('id').alias('dst'),
).distinct()

# Discard pairs that reference an endpoint that is not an exported node.
df_edges_authored = filter_node_ids(author_paper_pairs)
print(df_edges_authored.count())
df_edges_authored.head(5)

                                                                                

11022


                                                                                

[Row(src='Ashish Sabharwal', dst='16892'),
 Row(src='Ashish Sabharwal', dst='5431'),
 Row(src='Carsten Lutz', dst='17169'),
 Row(src='Carsten Lutz', dst='16733'),
 Row(src='Carsten Lutz', dst='16751')]

In [12]:
# PUBLISHED_IN edges: one (paper id -> venue) pair per paper with a
# non-empty venue.
paper_venue_pairs = (
    df.select(
        F.col('id').alias('src'),
        F.col('venue').alias('dst'),
    )
    .filter("dst != ''")
    .distinct()
)

# Discard pairs that reference an endpoint that is not an exported node.
df_edges_published_in = filter_node_ids(paper_venue_pairs)
print(df_edges_published_in.count())
df_edges_published_in.head(5)

                                                                                

5511


[Row(src='11332', dst='icml'),
 Row(src='4032', dst='aaai'),
 Row(src='9586', dst='nips'),
 Row(src='9993', dst='nips'),
 Row(src='11078', dst='sdm')]

In [13]:
# Node and edge frames paired with their output names, written in this order.
# Existing output is overwritten.
parquet_outputs = [
    (df_nodes_authors, 'nodes_Author'),
    (df_nodes_venues, 'nodes_Venue'),
    (df_nodes_papers, 'nodes_Paper'),
    (df_edges_authored, 'edges_AUTHORED'),
    (df_edges_published_in, 'edges_PUBLISHED_IN'),
]

for output_frame, output_name in parquet_outputs:
    output_frame.write.parquet(DATASET.processed_str(output_name), mode='overwrite')

                                                                                

In [14]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

# Describe the exported graph (three node types, two directed edge types)
# and save the description next to the processed data.
(
    GraphSchema()
    # Authors and venues are labelled by their name.
    .add_node_schema(
        'Author',
        NodeSchema.from_spark(df_nodes_authors.schema, label='name'),
    )
    .add_node_schema(
        'Venue',
        NodeSchema.from_spark(df_nodes_venues.schema, label='name'),
    )
    # Papers are labelled by title and carry the only timestamp column.
    .add_node_schema(
        'Paper',
        NodeSchema.from_spark(
            df_nodes_papers.schema,
            label='title',
            timestamp='timestamp',
            interaction=False,
        ),
    )
    .add_edge_schema(
        'AUTHORED',
        EdgeSchema.from_spark(
            df_edges_authored.schema,
            source_type='Author',
            target_type='Paper',
            directed=True,
        ),
    )
    .add_edge_schema(
        'PUBLISHED_IN',
        EdgeSchema.from_spark(
            df_edges_published_in.schema,
            source_type='Paper',
            target_type='Venue',
            directed=True,
        ),
    )
    .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/datasets/data/processed/DBLP-HCN'), nodes={'Author': NodeSchema(_type='Author', _schema=..., label='name', properties={'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False))}, dynamic=None), 'Venue': NodeSchema(_type='Venue', _schema=..., label='name', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False))}, dynamic=None), 'Paper': NodeSchema(_type='Paper', _schema=..., label='title', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'title': GraphProperty(_name='title', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'authors': GraphProperty(_n

# Ground truth communities
Ground-truth communities are derived using the same methodology as in:

J. Yang and J. Leskovec, “Defining and evaluating network communities based on ground-truth,” in Proceedings of the ACM SIGKDD Workshop on Mining Data Semantics, New York, NY, USA, Aug. 2012, pp. 1–8. doi: 10.1145/2350190.2350193.


In [15]:
# Sanity check before deriving communities: paper count per venue, and any
# papers that have no venue at all.
papers_per_venue = df_nodes_papers.groupby('venue').count()
papers_without_venue = df_nodes_papers.filter(F.col('venue').isNull())

print(papers_per_venue.head(100))
print(papers_without_venue.head(5))

[Row(venue='icde', count=299), Row(venue='pakdd', count=210), Row(venue='aaai', count=1041), Row(venue='ecir', count=198), Row(venue='dasfaa', count=189), Row(venue='icdm', count=474), Row(venue='cvpr', count=898), Row(venue='vldb', count=94), Row(venue='icml', count=462), Row(venue='nips', count=689), Row(venue='pkdd', count=240), Row(venue='sdm', count=192), Row(venue='ijcai', count=461), Row(venue='www', count=64)]
[]


In [16]:
# Every venue forms its own community: the community id is the venue id.
venue_id = F.col('id')
df_nodes_venues_comm = (
    df_nodes_venues
    .select(venue_id.alias('id'), venue_id.alias('cid'))
    .distinct()
)
df_nodes_venues_comm.head(5)

[Row(id='vldb', cid='vldb'),
 Row(id='www', cid='www'),
 Row(id='icde', cid='icde'),
 Row(id='nips', cid='nips'),
 Row(id='icdm', cid='icdm')]

In [17]:
# A paper belongs to the community of the venue it is listed under.
paper_membership = df_nodes_papers.select('id', F.col('venue').alias('cid'))
df_nodes_papers_comm = paper_membership.distinct()
df_nodes_papers_comm.head(5)

[Row(id='498', cid='icde'),
 Row(id='880', cid='icde'),
 Row(id='1060', cid='icde'),
 Row(id='1431', cid='dasfaa'),
 Row(id='1579', cid='pkdd')]

In [18]:
# Step 1: attach to every author the ids of the papers they authored.
# Left join, so authors without an AUTHORED edge are kept with a null paper_id.
author_papers = (
    df_nodes_authors
    .join(df_edges_authored, F.col('id') == F.col('src'), 'left')
    .drop('src')
    .withColumnRenamed('dst', 'paper_id')
)

# Step 2: attach to every (author, paper) pair the venue of that paper.
# Left join again, so pairs without a PUBLISHED_IN edge get a null cid.
author_venues = (
    author_papers
    .join(df_edges_published_in, F.col('paper_id') == F.col('src'), 'left')
    .drop('src')
    .withColumnRenamed('dst', 'cid')
)

# An author belongs to the community of every venue they published in.
df_nodes_authors_comm = author_venues.select('id', 'cid').distinct()
df_nodes_authors_comm.head(5)

                                                                                

[Row(id='Volkan Cevher', cid='nips'),
 Row(id='Viliam Lis&yacute', cid='nips'),
 Row(id='Jacob R. Gardner', cid='nips'),
 Row(id='Vuk Ercegovac', cid='icde'),
 Row(id='Fredrik Lindsten', cid='nips')]

In [19]:
# Full ground truth: the (node id, community id) pairs of all three node
# types, with duplicates removed.
community_frames = [df_nodes_papers_comm, df_nodes_venues_comm]

df_nodes_comm = df_nodes_authors_comm
for community_frame in community_frames:
    df_nodes_comm = df_nodes_comm.union(community_frame)
df_nodes_comm = df_nodes_comm.distinct()

df_nodes_comm.head(5)

[Row(id='Volkan Cevher', cid='nips'),
 Row(id='Viliam Lis&yacute', cid='nips'),
 Row(id='Jacob R. Gardner', cid='nips'),
 Row(id='Vuk Ercegovac', cid='icde'),
 Row(id='Fredrik Lindsten', cid='nips')]

In [20]:
# Write the ground truth to a temporary directory as tab-separated text with
# every value quoted and no header row. coalesce(1) collapses the frame to
# one partition before writing.
ground_truth_writer = (
    df_nodes_comm.coalesce(1).write
    .mode('overwrite')
    .option('sep', '\t')
    .option('quoteAll', True)
)
ground_truth_writer.csv(DATASET.processed_str('ground_truth.comlist.tmp'))

                                                                                

In [21]:
import shutil

# Move the first '*.csv' file found in the temporary output directory to its
# final location.
ground_truth_part = next(DATASET.processed('ground_truth.comlist.tmp').glob('*.csv'))
shutil.move(
    str(ground_truth_part),
    DATASET.processed_str('ground_truth.ncomlist')
)

# Remove the temporary directory; errors during removal are ignored.
shutil.rmtree(DATASET.processed_str('ground_truth.comlist.tmp'), ignore_errors=True)