In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [4]:
# Handle for the DBLP-HCN dataset; the cells below use it to name the Spark
# application and to build raw/processed file paths.
DATASET = DatasetPath('DBLP-HCN')

In [5]:
# Spark session for this notebook. Options are applied in the listed order:
# legacy datetime parsing, 8g executor/driver memory, 16g of off-heap memory.
_spark_builder = SparkSession.builder.appName(f'{DATASET}')
for _option, _setting in [
    ('spark.sql.legacy.timeParserPolicy', 'LEGACY'),
    ('spark.executor.memory', '8g'),
    ('spark.driver.memory', '8g'),
    ('spark.memory.offHeap.enabled', True),
    ('spark.memory.offHeap.size', '16g'),
]:
    _spark_builder = _spark_builder.config(_option, _setting)
spark = _spark_builder.getOrCreate()

22/01/21 23:58:07 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/21 23:58:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/21 23:58:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/21 23:58:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/21 23:58:08 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/01/21 23:58:08 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [6]:
# Column layout of the raw `author.txt` file; every field is nullable.
schema = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('id', T.IntegerType()),
        ('title', T.StringType()),
        ('venue', T.StringType()),
        ('year', T.IntegerType()),
        ('authors', T.StringType()),
        ('abstract', T.StringType()),
    )
])

# Fields are separated by three tabs. The `authors` field is a ';'-separated
# string, which is split into an array of whitespace-trimmed names.
df = (
    spark.read
    .schema(schema)
    .option('sep', '\t\t\t')
    .csv(DATASET.raw_str('author.txt'))
    .withColumn(
        'authors',
        F.transform(F.split('authors', ';'), lambda author: F.trim(author)),
    )
)

df.head(10)

                                                                                

[Row(id=104, title='SWORD: workload-aware data placement and replica selection for cloud data management systems.', venue='vldb', year=2014, authors=['K. Ashwin Kumar', 'Abdul Quamar'], abstract='Cloud computing is increasingly being seen as a way to reduce infrastructure costs and add elasticity, and is being used by a wide range of organizations. Cloud data management systems today need to serve a range of different workloads, from analytical read-heavy workloads to transactional (OLTP) workloads. For both the service providers and the users, it is critical to minimize the consumption of resources like CPU, memory, communication bandwidth, and energy, without compromising on service-level agreements if any. In this article, we develop a workload-aware data placement and replication approach, called SWORD, for minimizing resource consumption in such an environment. Specifically, we monitor and model the expected workload as a hypergraph and develop partitioning techniques that minimiz

In [7]:
# Author nodes: one row per distinct author name; the name is also the id.
df_nodes_authors = (
    df
    .select(F.explode('authors').alias('name'))
    .select('name', F.col('name').alias('id'))
    .distinct()
)
print(df_nodes_authors.count())
df_nodes_authors.head(5)

                                                                                

5162


[Row(name='Abdul Quamar', id='Abdul Quamar'),
 Row(name='Karen Stepanyan', id='Karen Stepanyan'),
 Row(name='Wang-Pin Hsiung', id='Wang-Pin Hsiung'),
 Row(name='Hailong Sun', id='Hailong Sun'),
 Row(name='James Bailey', id='James Bailey')]

In [8]:
# Venue nodes: one row per distinct non-empty venue string, which serves as
# both the id and the display name.
df_nodes_venues = (
    df
    .select(F.col('venue').alias('id'), F.col('venue').alias('name'))
    .where(F.col('id') != '')
    .distinct()
)
print(df_nodes_venues.count())
df_nodes_venues.head(5)

14


[Row(id='vldb', name='vldb'),
 Row(id='www', name='www'),
 Row(id='icde', name='icde'),
 Row(id='nips', name='nips'),
 Row(id='icdm', name='icdm')]

In [9]:
# Number of raw rows per publication year, in ascending year order.
(
    df
    .groupBy('year')
    .count()
    .sort('year')
    .show()
)

+----+-----+
|year|count|
+----+-----+
|2012|  844|
|2013| 1370|
|2014| 1336|
|2015| 1562|
|2016|  404|
+----+-----+



In [10]:
# Paper nodes: rows with a year after 1900, plus a `timestamp` column parsed
# from the year (pattern 'yyyy').
df_nodes_papers = (
    df
    .where(F.col('year') > 1900)
    .select(
        'id',
        'title',
        'authors',
        'venue',
        'year',
        'abstract',
        F.to_timestamp(F.col('year').cast('string'), 'yyyy').alias('timestamp'),
    )
    .distinct()
)
print(df_nodes_papers.count())
df_nodes_papers.head(5)

5511


[Row(id=841, title='Metadata-as-a-Service.', authors=['Akon Dey', 'Gajanan S. Chinchwadkar'], venue='icde', year=2015, abstract='We present a vision of a technology and domain agnostic service that will store metadata that describes properties of the diverse data sets in an enterprise (or across several enterprises), and spread among heterogenous stores, such as relational databases, data warehouses, NoSQL or NewSQL cloud storage platforms, etc. The Metadata-as-a-Service will allow search over the metadata, so users and applications can find useful data sets, whether those are raw data or derived data. We make a preliminary proposal for the high-level architecture and API of such a service.', timestamp=datetime.datetime(2015, 1, 1, 0, 0)),
 Row(id=855, title='On random walk based graph sampling.', authors=['Rong-Hua Li', 'Jeffrey Xu Yu'], venue='icde', year=2015, abstract='Random walk based graph sampling has been recognized as a fundamental technique to collect uniform node samples fr

In [11]:
# Every known node id (authors, then venues, then papers) stacked into a
# single de-duplicated `id` column.
df_node_ids = df_nodes_authors.select('id')
for _node_table in (df_nodes_venues, df_nodes_papers):
    df_node_ids = df_node_ids.union(_node_table.select('id'))
df_node_ids = df_node_ids.distinct()

def filter_node_ids(df, node_ids=None):
    """Keep only the edges whose two endpoints are known node ids.

    Args:
        df: Edge DataFrame with `src` and `dst` columns.
        node_ids: Optional DataFrame with an `id` column listing the valid
            node ids. Defaults to the notebook-level `df_node_ids`.

    Returns:
        The rows of `df` whose `src` and `dst` values both appear in
        `node_ids`, with exactly the columns of `df`.
    """
    if node_ids is None:
        node_ids = df_node_ids
    # Rename the lookup column so the by-name join condition cannot collide
    # with an `id` column that the edge frame may carry itself.
    lookup = node_ids.select(F.col('id').alias('_node_id'))
    # A semi join only tests for existence: it adds no lookup columns (so
    # nothing has to be dropped afterwards) and duplicate ids in the lookup
    # cannot multiply edge rows.
    return (
        df
        .join(lookup, F.col('src') == F.col('_node_id'), 'left_semi')
        .join(lookup, F.col('dst') == F.col('_node_id'), 'left_semi')
    )

In [12]:
# AUTHORED edges: (author name -> paper id), one row per distinct pair,
# restricted to endpoints that exist as nodes.
_authored_pairs = df.select(
    F.explode('authors').alias('src'),
    F.col('id').alias('dst'),
).distinct()
df_edges_authored = filter_node_ids(_authored_pairs)
print(df_edges_authored.count())
df_edges_authored.head(5)

11022


[Row(src='Ashish Sabharwal', dst=16892),
 Row(src='Ashish Sabharwal', dst=5431),
 Row(src='Carsten Lutz', dst=17169),
 Row(src='Carsten Lutz', dst=16751),
 Row(src='Carsten Lutz', dst=16733)]

In [13]:
# PUBLISHED_IN edges: (paper id -> venue), skipping empty venue strings and
# restricted to endpoints that exist as nodes.
_published_pairs = (
    df
    .select(F.col('id').alias('src'), F.col('venue').alias('dst'))
    .where(F.col('dst') != '')
    .distinct()
)
df_edges_published_in = filter_node_ids(_published_pairs)
print(df_edges_published_in.count())
df_edges_published_in.head(5)

5511


[Row(src=1436, dst='dasfaa'),
 Row(src=11078, dst='sdm'),
 Row(src=1090, dst='icde'),
 Row(src=9993, dst='nips'),
 Row(src=3210, dst='icdm')]

In [14]:
# Persist node tables as parquet, replacing any previous output.
df_nodes_authors.write.mode('overwrite').parquet(DATASET.processed_str('nodes_Author'))
df_nodes_venues.write.mode('overwrite').parquet(DATASET.processed_str('nodes_Venue'))
df_nodes_papers.write.mode('overwrite').parquet(DATASET.processed_str('nodes_Paper'))

# Persist edge tables the same way.
df_edges_authored.write.mode('overwrite').parquet(DATASET.processed_str('edges_AUTHORED'))
df_edges_published_in.write.mode('overwrite').parquet(DATASET.processed_str('edges_PUBLISHED_IN'))

                                                                                

In [15]:
from datasets.build_schema import build_schema

# (label, parquet path) for each node table written above.
node_tables = [
    ('Author', DATASET.processed_str('nodes_Author')),
    ('Venue', DATASET.processed_str('nodes_Venue')),
    ('Paper', DATASET.processed_str('nodes_Paper')),
]
# (edge type, source label, target label, parquet path) for each edge table.
edge_tables = [
    ('Authored', 'Author', 'Paper', DATASET.processed_str('edges_AUTHORED')),
    ('PublishedIn', 'Paper', 'Venue', DATASET.processed_str('edges_PUBLISHED_IN')),
]

build_schema(spark, name=str(DATASET), nodes=node_tables, edges=edge_tables)

[2022-01-21 23:58:31,397][/dd_volume/Development/Python/Thesis/code/datasets/datasets/build_schema.py][DEBUG] Merging old schema for DBLP-HCN


DatasetSchema(name='DBLP-HCN', prefix='DBLP_HCN', database='DBLP-HCN', description='None', nodes=[NodeSchema(path='data/processed/DBLP-HCN/nodes_Author', properties=[Property(name='name', type='string', ignore=False, label=True, timestamp=False), Property(name='id', type='string', ignore=False, label=True, timestamp=False)], label='Author', interaction=False), NodeSchema(path='data/processed/DBLP-HCN/nodes_Venue', properties=[Property(name='id', type='string', ignore=False, label=False, timestamp=False), Property(name='name', type='string', ignore=False, label=True, timestamp=False)], label='Venue', interaction=False), NodeSchema(path='data/processed/DBLP-HCN/nodes_Paper', properties=[Property(name='id', type='int', ignore=False, label=False, timestamp=False), Property(name='title', type='string', ignore=False, label=True, timestamp=False), Property(name='authors', type='string[]', ignore=False, label=False, timestamp=False), Property(name='venue', type='string', ignore=False, label=Fa

# Ground truth communities
Using the same methodology as in:

J. Yang and J. Leskovec, “Defining and evaluating network communities based on ground-truth,” in Proceedings of the ACM SIGKDD Workshop on Mining Data Semantics, New York, NY, USA, Aug. 2012, pp. 1–8. doi: 10.1145/2350190.2350193.


In [18]:
# Sanity checks: paper count per venue, then any papers with a null venue.
print(df_nodes_papers.groupBy('venue').count().head(100))
print(df_nodes_papers.where(F.col('venue').isNull()).head(5))

[]

In [30]:
# Each venue is its own community: community id == venue id.
df_nodes_venues_comm = (
    df_nodes_venues
    .select('id', F.col('id').alias('cid'))
    .distinct()
)
df_nodes_venues_comm.head(5)

[Row(id='vldb', cid='vldb'),
 Row(id='www', cid='www'),
 Row(id='icde', cid='icde'),
 Row(id='nips', cid='nips'),
 Row(id='icdm', cid='icdm')]

In [31]:
# A paper's community is the venue it appeared in.
df_nodes_papers_comm = (
    df_nodes_papers
    .select('id', F.col('venue').alias('cid'))
    .distinct()
)
df_nodes_papers_comm.head(5)

[Row(id=107, cid='vldb'),
 Row(id=977, cid='icde'),
 Row(id=1179, cid='dasfaa'),
 Row(id=1465, cid='dasfaa'),
 Row(id=1794, cid='pkdd')]

In [34]:
# Author -> paper -> venue: an author belongs to the community of every venue
# they published in. Both joins are left joins, so an author without a
# matching edge is kept with a null `cid`.
_paper_of_author = df_edges_authored.select(
    F.col('src').alias('author_id'),
    F.col('dst').alias('paper_id'),
)
_venue_of_paper = df_edges_published_in.select(
    F.col('src').alias('published_paper_id'),
    F.col('dst').alias('cid'),
)

df_nodes_authors_comm = (
    df_nodes_authors
    .join(_paper_of_author, F.col('id') == F.col('author_id'), 'left')
    .join(_venue_of_paper, F.col('paper_id') == F.col('published_paper_id'), 'left')
    .select('id', 'cid')
    .distinct()
)
df_nodes_authors_comm.head(5)

[Row(id='Bei Xu', cid='ijcai'),
 Row(id='Yue Wu', cid='icml'),
 Row(id='Reshef Meir', cid='aaai'),
 Row(id='Christina Teflioudi', cid='vldb'),
 Row(id='Hong Cheng', cid='icde')]

In [36]:
# Stack the (id, cid) assignments of authors, papers and venues, in that
# order, and de-duplicate them.
df_nodes_comm = df_nodes_authors_comm
for _comm_table in (df_nodes_papers_comm, df_nodes_venues_comm):
    df_nodes_comm = df_nodes_comm.union(_comm_table)
df_nodes_comm = df_nodes_comm.distinct()
df_nodes_comm.head(5)

[Row(id='Bei Xu', cid='ijcai'),
 Row(id='Yue Wu', cid='icml'),
 Row(id='Reshef Meir', cid='aaai'),
 Row(id='Christina Teflioudi', cid='vldb'),
 Row(id='Hong Cheng', cid='icde')]

In [42]:
# Write the community list as one tab-separated, fully quoted, header-less
# CSV part file inside a temporary output directory.
(
    df_nodes_comm
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('sep', '\t')
    .option('quoteAll', True)
    .csv(DATASET.processed_str('ground_truth.comlist.tmp'))
)

In [43]:
import shutil

# Spark wrote a directory of part files. The frame was coalesced to a single
# partition, so exactly one CSV part is expected; promote it to the final
# `ground_truth.comlist` file and then remove the temporary directory.
_comlist_tmp_dir = DATASET.processed('ground_truth.comlist.tmp')
_comlist_parts = sorted(_comlist_tmp_dir.glob('*.csv'))
if len(_comlist_parts) != 1:
    # Fail loudly instead of raising a bare StopIteration (no part file) or
    # silently keeping an arbitrary part and deleting the rest (several).
    raise RuntimeError(
        f'Expected exactly one CSV part file in {_comlist_tmp_dir}, '
        f'found {len(_comlist_parts)}'
    )

shutil.move(
    str(_comlist_parts[0]),
    DATASET.processed_str('ground_truth.comlist')
)

shutil.rmtree(DATASET.processed_str('ground_truth.comlist.tmp'), ignore_errors=True)

In [44]:
from shared.constants import DATASETS_PATH
from datasets.schema import DatasetSchema

# Load the dataset schema by name (presumably the one saved by build_schema
# above -- confirm), point its `ground_truth` field at the community list
# using a path relative to DATASETS_PATH, and save it again.
# NOTE(review): this rebinds `schema`, which earlier in the notebook held the
# Spark StructType used to read the raw file.
schema = DatasetSchema.load_schema(str(DATASET))
schema.ground_truth = DATASET.processed('ground_truth.comlist').relative_to(DATASETS_PATH)
schema.save_schema()