In [216]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [217]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window as W
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer

In [218]:
from shared.paths import DatasetPath

DS = DatasetPath('DBLP-HCN')

In [219]:
spark = (SparkSession.builder
         .appName(f'{DS}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

In [220]:
schema = T.StructType([
    T.StructField('id', T.StringType(), True),
    T.StructField('title', T.StringType(), True),
    T.StructField('venue', T.StringType(), True),
    T.StructField('year', T.IntegerType(), True),
    T.StructField('authors', T.StringType(), True),
    T.StructField('abstract', T.StringType(), True),
])

df = (
    spark.read.csv(DS.raw_str('author.txt'), schema=schema, sep='\t\t\t')
        .withColumn('authors', F.transform(F.split(F.col('authors'), ';'), F.trim))
)

df.head(10)

[Row(id='104', title='SWORD: workload-aware data placement and replica selection for cloud data management systems.', venue='vldb', year=2014, authors=['K. Ashwin Kumar', 'Abdul Quamar'], abstract='Cloud computing is increasingly being seen as a way to reduce infrastructure costs and add elasticity, and is being used by a wide range of organizations. Cloud data management systems today need to serve a range of different workloads, from analytical read-heavy workloads to transactional (OLTP) workloads. For both the service providers and the users, it is critical to minimize the consumption of resources like CPU, memory, communication bandwidth, and energy, without compromising on service-level agreements if any. In this article, we develop a workload-aware data placement and replication approach, called SWORD, for minimizing resource consumption in such an environment. Specifically, we monitor and model the expected workload as a hypergraph and develop partitioning techniques that minim

## Data Extraction

In [221]:
df_raw_nodes_authors = (
    df.select(
        F.explode(F.col('authors')).alias('name'),
    )
        .distinct()
        .sort('name')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_raw_nodes_authors.count())
df_raw_nodes_authors.head(5)

5162


[Row(name='&Aacute', id=0),
 Row(name='&Agrave', id=1),
 Row(name='&Eacute', id=2),
 Row(name='&atilde', id=3),
 Row(name='&ntilde', id=4)]

In [222]:
df_raw_nodes_venues = (
    df.select(
        F.col('venue').alias('name'),
    )
        .filter("name != ''")
        .distinct()
        .sort('name')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_raw_nodes_venues.count())
df_raw_nodes_venues.head(5)

14


[Row(name='aaai', id=0),
 Row(name='cvpr', id=1),
 Row(name='dasfaa', id=2),
 Row(name='ecir', id=3),
 Row(name='icde', id=4)]

In [223]:
df.groupby('year').count().orderBy('year').show()

+----+-----+
|year|count|
+----+-----+
|2012|  844|
|2013| 1370|
|2014| 1336|
|2015| 1562|
|2016|  404|
+----+-----+



In [224]:
df_raw_nodes_papers = (
    df.filter('year > 1900').select(
        F.col('id').alias('pub_id'),
        F.col('title').alias('name'),
        'authors',
        'venue',
        'year',
        'abstract',
        F.col('year').alias('timestamp_from')
    )
        .distinct()
        .sort('pub_id')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_raw_nodes_papers.count())
df_raw_nodes_papers.head(5)

5511


[Row(pub_id='10002', name='Distributed Submodular Maximization: Identifying Representative Elements in Massive Data.', authors=['Baharan Mirzasoleiman', 'Amin Karbasi'], venue='nips', year=2013, abstract='Many large-scale machine learning problems (such as clustering, non-parametric learning, kernel machines, etc.) require selecting, out of a massive data set, a manageable, representative subset. Such problems can often be reduced to maximizing a submodular set function subject to cardinality constraints. Classical approaches require centralized access to the full data set; but for truly large-scale problems, rendering the data centrally is often impractical. In this paper, we consider the problem of submodular function maximization in a distributed fashion. We develop a simple, two-stage protocol GreeDI, that is easily implemented using MapReduce style computations. We theoretically analyze our approach, and show, that under certain natural conditions, performance close to the (imprac

In [225]:
df_node_ids = (
    df_raw_nodes_authors.select('id')
        .union(df_raw_nodes_venues.select('id'))
        .union(df_raw_nodes_papers.select('id'))
        .distinct()
)


def filter_node_ids(df, src_col='id', dst_col='id'):
    return df.join(
        df_node_ids,
        F.col('src') == F.col(src_col),
        'inner'
    ).drop(
        'id'
    ).join(
        df_node_ids,
        F.col('dst') == F.col(dst_col),
        'inner'
    ).drop('id')

In [226]:
df_edges_authored = (
    df.select(
        F.explode(F.col('authors')).alias('author_name'),
        F.col('id').alias('dst_pub_id'),
        F.col('year').alias('timestamp_from'),
    )
    .join(
        df_raw_nodes_authors.select('id', 'name'),
        F.col('author_name') == F.col('name'),
        'inner'
    )
    .withColumnRenamed('id', 'src')
    .join(
        df_raw_nodes_papers.select('id', 'pub_id'),
        F.col('dst_pub_id') == F.col('pub_id'),
        'inner'
    )
    .withColumnRenamed('id', 'dst')
    .select(
            F.col('src'),
            F.col('dst'),
            F.col('timestamp_from'),
    )
    .distinct()
)
print(df_edges_authored.count())
df_edges_authored.head(5)

11022


[Row(src=2826, dst=3457, timestamp_from=2015),
 Row(src=3211, dst=5262, timestamp_from=2013),
 Row(src=3615, dst=5456, timestamp_from=2014),
 Row(src=2367, dst=1548, timestamp_from=2012),
 Row(src=2835, dst=3562, timestamp_from=2013)]

In [227]:
df_edges_published_in = (
    df.select(
        F.col('id').alias('src_pub_id'),
        F.col('venue').alias('dst_venue'),
        F.col('year').alias('timestamp_from'),
    )
        .join(
            df_raw_nodes_papers.select('id', 'pub_id'),
            F.col('src_pub_id') == F.col('pub_id'),
            'inner'
        )
        .withColumnRenamed('id', 'src')
        .join(
            df_raw_nodes_venues.select('id', 'name'),
            F.col('dst_venue') == F.col('name'),
            'inner'
        )
        .withColumnRenamed('id', 'dst')
        .select(
            F.col('src'),
            F.col('dst'),
            F.col('timestamp_from'),
        )
        .distinct()
)
print(df_edges_published_in.count())
df_edges_published_in.head(5)

5511


[Row(src=0, dst=8, timestamp_from=2013),
 Row(src=1, dst=8, timestamp_from=2013),
 Row(src=2, dst=8, timestamp_from=2013),
 Row(src=3, dst=4, timestamp_from=2015),
 Row(src=4, dst=8, timestamp_from=2015)]

# Ground truth communities
Using same methodology as in:

J. Yang and J. Leskovec, “Defining and evaluating network communities based on ground-truth,” in Proceedings of the ACM SIGKDD Workshop on Mining Data Semantics, New York, NY, USA, Aug. 2012, pp. 1–8. doi: 10.1145/2350190.2350193.


In [228]:
print(df_raw_nodes_papers.groupby('venue').count().head(100))
print(df_raw_nodes_papers.filter(F.col('venue').isNull()).head(5))

[Row(venue='vldb', count=94), Row(venue='icde', count=299), Row(venue='pkdd', count=240), Row(venue='icdm', count=474), Row(venue='aaai', count=1041), Row(venue='nips', count=689), Row(venue='sdm', count=192), Row(venue='icml', count=462), Row(venue='dasfaa', count=189), Row(venue='www', count=64), Row(venue='pakdd', count=210), Row(venue='cvpr', count=898), Row(venue='ijcai', count=461), Row(venue='ecir', count=198)]
[]


In [229]:
df_comm_nodes_venues = (
    df_raw_nodes_venues.select(
        F.col('id'),
        F.array(F.col('id')).alias('cids')
    )
)
df_comm_nodes_venues.head(5)

[Row(id=0, cids=[0]),
 Row(id=1, cids=[1]),
 Row(id=2, cids=[2]),
 Row(id=3, cids=[3]),
 Row(id=4, cids=[4])]

In [230]:
df_comm_nodes_papers = (
    df_edges_published_in.select(
        F.col('src').alias('id'),
        'dst'
    ).groupby('id').agg(
        F.array_distinct(F.collect_list(F.col('dst'))).alias('cids')
    )
)
df_comm_nodes_papers.head(5)

[Row(id=0, cids=[8]),
 Row(id=1, cids=[8]),
 Row(id=2, cids=[8]),
 Row(id=3, cids=[4]),
 Row(id=4, cids=[8])]

In [231]:
df_comm_nodes_authors = (
    df_edges_authored.select(
        F.col('src').alias('id'),
        F.col('dst').alias('paper_id')
    ).join(
        df_edges_published_in.alias('paper_rel'),
        F.col('paper_id') == F.col('paper_rel.src'),
        'left'
    ).groupby('id').agg(
        F.array_distinct(F.collect_list(F.col('paper_rel.dst'))).alias('cids')
    )
)
df_comm_nodes_authors.head(5)

[Row(id=0, cids=[1, 7]),
 Row(id=1, cids=[8]),
 Row(id=2, cids=[0, 7, 1, 3]),
 Row(id=3, cids=[7]),
 Row(id=4, cids=[0])]

## Feature Engineering (keyword extraction)

In [232]:
df_kw_clean = df.select('id', (F.lower(F.regexp_replace('title', "[^a-zA-Z\\s]", "")).alias('text')))

tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
df_kw_tokens = tokenizer.transform(df_kw_clean).select('id', 'words_token')

remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
df_kw_nostopw = remover.transform(df_kw_tokens).select('id', 'words_clean')

# Stem text
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], T.ArrayType(T.StringType()))
df_kw_stemmed = df_kw_nostopw.withColumn("words_stemmed", stemmer_udf("words_clean")).select('id', 'words_stemmed')

filter_length_udf = F.udf(lambda row: [x for x in row if len(x) >= 3], T.ArrayType(T.StringType()))
df_kw_doc = df_kw_stemmed.select('id', filter_length_udf(F.col('words_stemmed')).alias('keywords'))

df_kw_doc.head(10)

                                                                                

[Row(id='104', keywords=['sword', 'workloadawar', 'data', 'placement', 'replica', 'select', 'cloud', 'data', 'manag', 'system']),
 Row(id='106', keywords=['survey', 'largescal', 'analyt', 'queri', 'process', 'mapreduc']),
 Row(id='107', keywords=['take', 'big', 'pictur', 'repres', 'skylin', 'base', 'signific', 'divers']),
 Row(id='108', keywords=['calibr', 'trajectori', 'data', 'spatiotempor', 'similar', 'analysi']),
 Row(id='111', keywords=['window', 'pqgram', 'approxim', 'join', 'datacentr', 'xml']),
 Row(id='112', keywords=['sort', 'network', 'fpgas']),
 Row(id='115', keywords=['queri', 'revers', 'engin']),
 Row(id='116', keywords=['outsourc', 'shortest', 'distanc', 'comput', 'privaci', 'protect']),
 Row(id='118', keywords=['high', 'effici', 'qualiti', 'larg', 'graph', 'match']),
 Row(id='120', keywords=['effici', 'process', 'hop', 'reachabl', 'queri'])]

In [233]:
df_kw = df_kw_doc\
    .select(F.explode('keywords').alias('keyword'))\
    .groupBy('keyword')\
    .count()\
    .orderBy('count', ascending=False)

df_kw.show(10)

[Stage 4170:>                                                       (0 + 2) / 2]

+-------+-----+
|keyword|count|
+-------+-----+
|  learn|  901|
|  model|  579|
|   data|  408|
|network|  368|
|    use|  356|
| detect|  287|
|   imag|  247|
|  optim|  241|
|    via|  241|
| social|  237|
+-------+-----+
only showing top 10 rows



                                                                                

In [234]:
k = 80
top_keywords = [x.keyword for x in df_kw.limit(k).select('keyword').collect()]
print(top_keywords)

['learn', 'model', 'data', 'network', 'use', 'detect', 'imag', 'optim', 'via', 'social', 'graph', 'effici', 'cluster', 'classif', 'predict', 'search', 'base', 'analysi', 'queri', 'algorithm', 'featur', 'approach', 'estim', 'structur', 'mine', 'spars', 'onlin', 'inform', 'object', 'select', 'process', 'recommend', 'multipl', 'infer', 'system', 'deep', 'represent', 'robust', 'pattern', 'recognit', 'fast', 'local', 'activ', 'adapt', 'bayesian', 'rank', 'applic', 'dynam', 'kernel', 'distribut', 'factor', 'label', 'topic', 'probabilist', 'matrix', 'framework', 'match', 'approxim', 'method', 'semant', 'stream', 'general', 'user', 'visual', 'retriev', 'neural', 'video', 'segment', 'stochast', 'plan', 'linear', 'time', 'sampl', 'regular', 'comput', 'problem', 'scalabl', 'relat', 'hierarch', 'regress']


In [235]:
df_node_author_keywords = (
    df_edges_authored.select(
        F.col('src').alias('author_id'),
        F.col('dst').alias('paper_id'),
    ).join(
        df_kw_doc,
        F.col('paper_id') == F.col('id'),
        'left'
    ).groupby('author_id').agg(
        F.collect_list('keywords').alias('keywords')
    ).withColumn('keywords', F.array_distinct(F.flatten('keywords')))
)
df_node_author_keywords.head(5)

[Row(author_id=0, keywords=['cluster', 'order', 'sequenc', 'base', 'typic', 'index', 'find', 'clinic', 'pathway', 'candid']),
 Row(author_id=1, keywords=['valu', 'ignor', 'number', 'player']),
 Row(author_id=2, keywords=['price', 'war', 'continu', 'competit', 'multiitem', 'onlin', 'multitask', 'learn', 'via', 'spars', 'dictionari', 'optim', 'queri', 'inconsist', 'descript', 'logic', 'knowledg', 'base', 'prefer', 'repair', 'semant']),
 Row(author_id=3, keywords=['dcbtree', 'spaceeffici', 'delta', 'code', 'cach', 'conscious', 'btree']),
 Row(author_id=4, keywords=['autonom', 'agent', 'futur', 'energi', 'market', 'power', 'trade', 'competit'])]

In [236]:
df_node_author_feats = (
    df_node_author_keywords.select(['author_id'] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
)
df_node_author_feats.head(5)

[Row(author_id=0, feat_learn=False, feat_model=False, feat_data=False, feat_network=False, feat_use=False, feat_detect=False, feat_imag=False, feat_optim=False, feat_via=False, feat_social=False, feat_graph=False, feat_effici=False, feat_cluster=True, feat_classif=False, feat_predict=False, feat_search=False, feat_base=True, feat_analysi=False, feat_queri=False, feat_algorithm=False, feat_featur=False, feat_approach=False, feat_estim=False, feat_structur=False, feat_mine=False, feat_spars=False, feat_onlin=False, feat_inform=False, feat_object=False, feat_select=False, feat_process=False, feat_recommend=False, feat_multipl=False, feat_infer=False, feat_system=False, feat_deep=False, feat_represent=False, feat_robust=False, feat_pattern=False, feat_recognit=False, feat_fast=False, feat_local=False, feat_activ=False, feat_adapt=False, feat_bayesian=False, feat_rank=False, feat_applic=False, feat_dynam=False, feat_kernel=False, feat_distribut=False, feat_factor=False, feat_label=False, fe

In [237]:
df_nodes_authors = (
    df_raw_nodes_authors.alias('main')
        .join(df_node_author_feats, F.col('main.id') == F.col('author_id'), 'left')
        .join(df_comm_nodes_authors.withColumnRenamed('id', 'comm__id'), F.col('main.id') == F.col('comm__id'), 'left')
        .drop('comm__id')
        .drop('author_id')
        .sort('main.id')
)

df_nodes_authors.head(5)

[Row(name='&Aacute', id=0, feat_learn=False, feat_model=False, feat_data=False, feat_network=False, feat_use=False, feat_detect=False, feat_imag=False, feat_optim=False, feat_via=False, feat_social=False, feat_graph=False, feat_effici=False, feat_cluster=True, feat_classif=False, feat_predict=False, feat_search=False, feat_base=True, feat_analysi=False, feat_queri=False, feat_algorithm=False, feat_featur=False, feat_approach=False, feat_estim=False, feat_structur=False, feat_mine=False, feat_spars=False, feat_onlin=False, feat_inform=False, feat_object=False, feat_select=False, feat_process=False, feat_recommend=False, feat_multipl=False, feat_infer=False, feat_system=False, feat_deep=False, feat_represent=False, feat_robust=False, feat_pattern=False, feat_recognit=False, feat_fast=False, feat_local=False, feat_activ=False, feat_adapt=False, feat_bayesian=False, feat_rank=False, feat_applic=False, feat_dynam=False, feat_kernel=False, feat_distribut=False, feat_factor=False, feat_label=

In [238]:
df_node_venue_keywords = (
    df_edges_published_in.select(
        F.col('src').alias('paper_id'),
        F.col('dst').alias('venue_id'),
    ).join(
        df_kw_doc,
        F.col('paper_id') == F.col('id'),
        'left'
    ).groupby('venue_id').agg(
        F.collect_list('keywords').alias('keywords')
    ).withColumn('keywords', F.array_distinct(F.flatten('keywords')))
)
df_node_venue_keywords.head(5)

[Row(venue_id=8, keywords=['sword', 'workloadawar', 'data', 'placement', 'replica', 'select', 'cloud', 'manag', 'system', 'survey', 'largescal', 'analyt', 'queri', 'process', 'mapreduc', 'take', 'big', 'pictur', 'repres', 'skylin', 'base', 'signific', 'divers', 'calibr', 'trajectori', 'spatiotempor', 'similar', 'analysi', 'window', 'pqgram', 'approxim', 'join', 'datacentr', 'xml', 'sort', 'network', 'fpgas', 'revers', 'engin', 'outsourc', 'shortest', 'distanc', 'comput', 'privaci', 'protect', 'high', 'effici', 'qualiti', 'larg', 'graph', 'match', 'hop', 'reachabl', 'hybrid', 'entiti', 'cluster', 'use', 'crowd', 'topk', 'web', 'applic', 'acm', 'scalabl', 'parallel', 'extract', 'frequent', 'pattern', 'long', 'sequenc', 'extrem', 'movingobject', 'updat', 'workload', 'main', 'memori', 'scope', 'databas', 'meet', 'snapshot', 'continu', 'nearest', 'neighbor', 'haloop', 'approach', 'iter', 'andes', 'evalu', 'nottwig', 'relat', 'increment', 'resolut', 'rule', 'conformityawar', 'influenc', 'max

In [239]:
df_node_venue_feats = (
    df_node_venue_keywords.select(['venue_id'] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
)
df_node_venue_feats.head(5)

[Row(venue_id=8, feat_learn=True, feat_model=True, feat_data=True, feat_network=True, feat_use=True, feat_detect=True, feat_imag=True, feat_optim=True, feat_via=True, feat_social=True, feat_graph=True, feat_effici=True, feat_cluster=True, feat_classif=True, feat_predict=True, feat_search=True, feat_base=True, feat_analysi=True, feat_queri=True, feat_algorithm=True, feat_featur=True, feat_approach=True, feat_estim=True, feat_structur=True, feat_mine=True, feat_spars=True, feat_onlin=True, feat_inform=True, feat_object=True, feat_select=True, feat_process=True, feat_recommend=True, feat_multipl=True, feat_infer=True, feat_system=True, feat_deep=True, feat_represent=True, feat_robust=True, feat_pattern=True, feat_recognit=True, feat_fast=True, feat_local=True, feat_activ=True, feat_adapt=True, feat_bayesian=True, feat_rank=True, feat_applic=True, feat_dynam=True, feat_kernel=False, feat_distribut=True, feat_factor=True, feat_label=True, feat_topic=True, feat_probabilist=True, feat_matrix=

In [240]:
df_nodes_venues = (
    df_raw_nodes_venues
        .join(df_node_venue_feats, F.col('id') == F.col('venue_id'), 'left')
        .join(df_comm_nodes_venues.withColumnRenamed('id', 'comm__id'), F.col('id') == F.col('comm__id'), 'left')
        .drop('venue_id', 'comm__id')
        .sort('id')
)

df_nodes_venues.head(5)

[Row(name='aaai', id=0, feat_learn=True, feat_model=True, feat_data=True, feat_network=True, feat_use=True, feat_detect=True, feat_imag=True, feat_optim=True, feat_via=True, feat_social=True, feat_graph=True, feat_effici=True, feat_cluster=True, feat_classif=True, feat_predict=True, feat_search=True, feat_base=True, feat_analysi=True, feat_queri=True, feat_algorithm=True, feat_featur=True, feat_approach=True, feat_estim=True, feat_structur=True, feat_mine=True, feat_spars=True, feat_onlin=True, feat_inform=True, feat_object=True, feat_select=True, feat_process=True, feat_recommend=True, feat_multipl=True, feat_infer=True, feat_system=True, feat_deep=True, feat_represent=True, feat_robust=True, feat_pattern=True, feat_recognit=True, feat_fast=True, feat_local=True, feat_activ=True, feat_adapt=True, feat_bayesian=True, feat_rank=True, feat_applic=True, feat_dynam=True, feat_kernel=True, feat_distribut=True, feat_factor=True, feat_label=True, feat_topic=True, feat_probabilist=True, feat_m

In [241]:
df_nodes_papers = (
    df_raw_nodes_papers
        .join(df_comm_nodes_papers.withColumnRenamed('id', 'comm__id'), F.col('id') == F.col('comm__id'), 'left')
        .drop('comm__id', 'pub_id')
        .sort('id')
)
df_nodes_papers.show(5)

+--------------------+--------------------+-----+----+--------------------+--------------+---+----+
|                name|             authors|venue|year|            abstract|timestamp_from| id|cids|
+--------------------+--------------------+-----+----+--------------------+--------------+---+----+
|Distributed Submo...|[Baharan Mirzasol...| nips|2013|Many large-scale ...|          2013|  0| [8]|
|EDML for Learning...|[Khaled S. Refaat...| nips|2013|EDML is a recentl...|          2013|  1| [8]|
|Bayesian inferenc...|[Mijung Park, Jon...| nips|2013|The receptive fie...|          2013|  2| [8]|
|On crowdsensed da...|[Saket Sathe, Tim...| icde|2015|Crowdsensing appl...|          2015|  3| [4]|
|Fast and Memory O...|[Se-Young Yun, Ma...| nips|2015|In this paper, we...|          2015|  4| [8]|
+--------------------+--------------------+-----+----+--------------------+--------------+---+----+
only showing top 5 rows



## Save data

In [242]:
df_nodes_authors.write.parquet(DS.processed_str('node__Author'), mode='overwrite')
df_nodes_venues.write.parquet(DS.processed_str('node__Venue'), mode='overwrite')
df_nodes_papers.write.parquet(DS.processed_str('node__Paper'), mode='overwrite')

df_edges_authored.write.parquet(DS.processed_str('edge__Author_AUTHORED_Paper'), mode='overwrite')
df_edges_published_in.write.parquet(DS.processed_str('edge__Paper_PUBLISHEDIN_Venue'), mode='overwrite')