In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window as W
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer

In [3]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('DBLP-HCN')
DATASET.save_schema()

In [4]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/12 13:06:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/12 13:06:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
schema = T.StructType([
    T.StructField('id', T.StringType(), True),
    T.StructField('title', T.StringType(), True),
    T.StructField('venue', T.StringType(), True),
    T.StructField('year', T.IntegerType(), True),
    T.StructField('authors', T.StringType(), True),
    T.StructField('abstract', T.StringType(), True),
])

df = (
    spark.read.csv(DATASET.raw_str('author.txt'), schema=schema, sep='\t\t\t')
        .withColumn('authors', F.transform(F.split(F.col('authors'), ';'), F.trim))
)

df.head(10)

                                                                                

[Row(id='104', title='SWORD: workload-aware data placement and replica selection for cloud data management systems.', venue='vldb', year=2014, authors=['K. Ashwin Kumar', 'Abdul Quamar'], abstract='Cloud computing is increasingly being seen as a way to reduce infrastructure costs and add elasticity, and is being used by a wide range of organizations. Cloud data management systems today need to serve a range of different workloads, from analytical read-heavy workloads to transactional (OLTP) workloads. For both the service providers and the users, it is critical to minimize the consumption of resources like CPU, memory, communication bandwidth, and energy, without compromising on service-level agreements if any. In this article, we develop a workload-aware data placement and replication approach, called SWORD, for minimizing resource consumption in such an environment. Specifically, we monitor and model the expected workload as a hypergraph and develop partitioning techniques that minim

## Data Extraction

In [6]:
df_nodes_authors = (
    df.select(
        F.explode(F.col('authors')).alias('name'),
    )
        .distinct()
        .sort('name')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_authors.count())
df_nodes_authors.head(5)

                                                                                

5162


[Row(name='&Aacute', id=0),
 Row(name='&Agrave', id=1),
 Row(name='&Eacute', id=2),
 Row(name='&atilde', id=3),
 Row(name='&ntilde', id=4)]

In [7]:
df_nodes_venues = (
    df.select(
        F.col('venue').alias('name'),
    )
        .filter("name != ''")
        .distinct()
        .sort('name')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_venues.count())
df_nodes_venues.head(5)

14


[Row(name='aaai', id=0),
 Row(name='cvpr', id=1),
 Row(name='dasfaa', id=2),
 Row(name='ecir', id=3),
 Row(name='icde', id=4)]

In [8]:
df.groupby('year').count().orderBy('year').show()

+----+-----+
|year|count|
+----+-----+
|2012|  844|
|2013| 1370|
|2014| 1336|
|2015| 1562|
|2016|  404|
+----+-----+



In [9]:
df_nodes_papers = (
    df.filter('year > 1900').select(
        F.col('id').alias('pub_id'),
        'title',
        'authors',
        'venue',
        'year',
        'abstract',
        F.col('year').alias('timestamp')
    )
        .distinct()
        .sort('pub_id')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_papers.count())
df_nodes_papers.head(5)

5511


[Row(pub_id='10002', title='Distributed Submodular Maximization: Identifying Representative Elements in Massive Data.', authors=['Baharan Mirzasoleiman', 'Amin Karbasi'], venue='nips', year=2013, abstract='Many large-scale machine learning problems (such as clustering, non-parametric learning, kernel machines, etc.) require selecting, out of a massive data set, a manageable, representative subset. Such problems can often be reduced to maximizing a submodular set function subject to cardinality constraints. Classical approaches require centralized access to the full data set; but for truly large-scale problems, rendering the data centrally is often impractical. In this paper, we consider the problem of submodular function maximization in a distributed fashion. We develop a simple, two-stage protocol GreeDI, that is easily implemented using MapReduce style computations. We theoretically analyze our approach, and show, that under certain natural conditions, performance close to the (impra

In [10]:
df_node_ids = (
    df_nodes_authors.select('id')
        .union(df_nodes_venues.select('id'))
        .union(df_nodes_papers.select('id'))
        .distinct()
)


def filter_node_ids(df, src_col='id', dst_col='id'):
    return df.join(
        df_node_ids,
        F.col('src') == F.col(src_col),
        'inner'
    ).drop(
        'id'
    ).join(
        df_node_ids,
        F.col('dst') == F.col(dst_col),
        'inner'
    ).drop('id')

In [11]:
df_edges_authored = (
    df.select(
        F.explode(F.col('authors')).alias('author_name'),
        F.col('id').alias('dst_pub_id'),
        F.col('year').alias('timestamp'),
    )
    .join(
        df_nodes_authors.select('id', 'name'),
        F.col('author_name') == F.col('name'),
        'inner'
    )
    .withColumnRenamed('id', 'src')
    .join(
        df_nodes_papers.select('id', 'pub_id'),
        F.col('dst_pub_id') == F.col('pub_id'),
        'inner'
    )
    .withColumnRenamed('id', 'dst')
    .select(
            F.col('src'),
            F.col('dst'),
            F.col('timestamp'),
    )
    .distinct()
)
print(df_edges_authored.count())
df_edges_authored.head(5)

11022


[Row(src=2826, dst=3457, timestamp=2015),
 Row(src=3211, dst=5262, timestamp=2013),
 Row(src=3615, dst=5456, timestamp=2014),
 Row(src=2367, dst=1548, timestamp=2012),
 Row(src=2835, dst=3562, timestamp=2013)]

In [12]:
df_edges_published_in = (
    df.select(
        F.col('id').alias('src_pub_id'),
        F.col('venue').alias('dst_venue'),
        F.col('year').alias('timestamp'),
    )
        .join(
            df_nodes_papers.select('id', 'pub_id'),
            F.col('src_pub_id') == F.col('pub_id'),
            'inner'
        )
        .withColumnRenamed('id', 'src')
        .join(
            df_nodes_venues.select('id', 'name'),
            F.col('dst_venue') == F.col('name'),
            'inner'
        )
        .withColumnRenamed('id', 'dst')
        .select(
            F.col('src'),
            F.col('dst'),
            F.col('timestamp'),
        )
        .distinct()
)
print(df_edges_published_in.count())
df_edges_published_in.head(5)

5511


[Row(src=0, dst=8, timestamp=2013),
 Row(src=1, dst=8, timestamp=2013),
 Row(src=2, dst=8, timestamp=2013),
 Row(src=3, dst=4, timestamp=2015),
 Row(src=4, dst=8, timestamp=2015)]

## Feature Engineering (keyword extraction)

In [13]:
df_kw_clean = df.select('id', (F.lower(F.regexp_replace('title', "[^a-zA-Z\\s]", "")).alias('text')))

tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
df_kw_tokens = tokenizer.transform(df_kw_clean).select('id', 'words_token')

remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
df_kw_nostopw = remover.transform(df_kw_tokens).select('id', 'words_clean')

# Stem text
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], T.ArrayType(T.StringType()))
df_kw_stemmed = df_kw_nostopw.withColumn("words_stemmed", stemmer_udf("words_clean")).select('id', 'words_stemmed')

filter_length_udf = F.udf(lambda row: [x for x in row if len(x) >= 3], T.ArrayType(T.StringType()))
df_kw_doc = df_kw_stemmed.select('id', filter_length_udf(F.col('words_stemmed')).alias('keywords'))

df_kw_doc.head(10)

                                                                                

[Row(id='104', keywords=['sword', 'workloadawar', 'data', 'placement', 'replica', 'select', 'cloud', 'data', 'manag', 'system']),
 Row(id='106', keywords=['survey', 'largescal', 'analyt', 'queri', 'process', 'mapreduc']),
 Row(id='107', keywords=['take', 'big', 'pictur', 'repres', 'skylin', 'base', 'signific', 'divers']),
 Row(id='108', keywords=['calibr', 'trajectori', 'data', 'spatiotempor', 'similar', 'analysi']),
 Row(id='111', keywords=['window', 'pqgram', 'approxim', 'join', 'datacentr', 'xml']),
 Row(id='112', keywords=['sort', 'network', 'fpgas']),
 Row(id='115', keywords=['queri', 'revers', 'engin']),
 Row(id='116', keywords=['outsourc', 'shortest', 'distanc', 'comput', 'privaci', 'protect']),
 Row(id='118', keywords=['high', 'effici', 'qualiti', 'larg', 'graph', 'match']),
 Row(id='120', keywords=['effici', 'process', 'hop', 'reachabl', 'queri'])]

In [14]:
df_kw = df_kw_doc\
    .select(F.explode('keywords').alias('keyword'))\
    .groupBy('keyword')\
    .count()\
    .orderBy('count', ascending=False)

df_kw.show(10)

[Stage 113:>                                                        (0 + 2) / 2]

+-------+-----+
|keyword|count|
+-------+-----+
|  learn|  901|
|  model|  579|
|   data|  408|
|network|  368|
|    use|  356|
| detect|  287|
|   imag|  247|
|  optim|  241|
|    via|  241|
| social|  237|
+-------+-----+
only showing top 10 rows



                                                                                

In [15]:
k = 200
top_keywords = [x.keyword for x in df_kw.limit(k).select('keyword').collect()]
print(top_keywords)

['learn', 'model', 'data', 'network', 'use', 'detect', 'imag', 'optim', 'via', 'social', 'graph', 'effici', 'cluster', 'classif', 'predict', 'search', 'base', 'analysi', 'queri', 'algorithm', 'featur', 'approach', 'estim', 'structur', 'mine', 'spars', 'onlin', 'inform', 'object', 'select', 'multipl', 'recommend', 'process', 'infer', 'system', 'deep', 'represent', 'pattern', 'robust', 'recognit', 'fast', 'local', 'activ', 'adapt', 'bayesian', 'rank', 'applic', 'distribut', 'kernel', 'dynam', 'factor', 'label', 'topic', 'probabilist', 'matrix', 'framework', 'match', 'approxim', 'method', 'semant', 'stream', 'user', 'visual', 'general', 'retriev', 'neural', 'video', 'segment', 'stochast', 'plan', 'linear', 'time', 'sampl', 'regular', 'problem', 'scalabl', 'comput', 'relat', 'hierarch', 'regress', 'larg', 'track', 'random', 'largescal', 'knowledg', 'evalu', 'transfer', 'generat', 'tempor', 'constraint', 'discrimin', 'filter', 'similar', 'embed', 'game', 'discoveri', 'action', 'event', 'mul

                                                                                

In [16]:
df_node_author_keywords = (
    df_edges_authored.select(
        F.col('src').alias('author_id'),
        F.col('dst').alias('paper_id'),
    ).join(
        df_kw_doc,
        F.col('paper_id') == F.col('id'),
        'left'
    ).groupby('author_id').agg(
        F.collect_list('keywords').alias('keywords')
    ).withColumn('keywords', F.array_distinct(F.flatten('keywords')))
)
df_node_author_keywords.head(5)

[Row(author_id=0, keywords=['cluster', 'order', 'sequenc', 'base', 'typic', 'index', 'find', 'clinic', 'pathway', 'candid']),
 Row(author_id=1, keywords=['valu', 'ignor', 'number', 'player']),
 Row(author_id=2, keywords=['price', 'war', 'continu', 'competit', 'multiitem', 'onlin', 'multitask', 'learn', 'via', 'spars', 'dictionari', 'optim', 'queri', 'inconsist', 'descript', 'logic', 'knowledg', 'base', 'prefer', 'repair', 'semant']),
 Row(author_id=3, keywords=['dcbtree', 'spaceeffici', 'delta', 'code', 'cach', 'conscious', 'btree']),
 Row(author_id=4, keywords=['autonom', 'agent', 'futur', 'energi', 'market', 'power', 'trade', 'competit'])]

In [17]:
df_node_author_feats = (
    df_node_author_keywords.select(['author_id'] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
)
df_node_author_feats.head(5)

22/03/12 13:06:42 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Row(author_id=0, feat_learn=False, feat_model=False, feat_data=False, feat_network=False, feat_use=False, feat_detect=False, feat_imag=False, feat_optim=False, feat_via=False, feat_social=False, feat_graph=False, feat_effici=False, feat_cluster=True, feat_classif=False, feat_predict=False, feat_search=False, feat_base=True, feat_analysi=False, feat_queri=False, feat_algorithm=False, feat_featur=False, feat_approach=False, feat_estim=False, feat_structur=False, feat_mine=False, feat_spars=False, feat_onlin=False, feat_inform=False, feat_object=False, feat_select=False, feat_multipl=False, feat_recommend=False, feat_process=False, feat_infer=False, feat_system=False, feat_deep=False, feat_represent=False, feat_pattern=False, feat_robust=False, feat_recognit=False, feat_fast=False, feat_local=False, feat_activ=False, feat_adapt=False, feat_bayesian=False, feat_rank=False, feat_applic=False, feat_distribut=False, feat_kernel=False, feat_dynam=False, feat_factor=False, feat_label=False, fe

In [18]:
df_nodes_authors_new = df_nodes_authors.join(
    df_node_author_feats,
    F.col('id') == F.col('author_id'),
    'left'
).drop('author_id').sort('id')

df_nodes_authors_new.head(5)

[Row(name='&Aacute', id=0, feat_learn=False, feat_model=False, feat_data=False, feat_network=False, feat_use=False, feat_detect=False, feat_imag=False, feat_optim=False, feat_via=False, feat_social=False, feat_graph=False, feat_effici=False, feat_cluster=True, feat_classif=False, feat_predict=False, feat_search=False, feat_base=True, feat_analysi=False, feat_queri=False, feat_algorithm=False, feat_featur=False, feat_approach=False, feat_estim=False, feat_structur=False, feat_mine=False, feat_spars=False, feat_onlin=False, feat_inform=False, feat_object=False, feat_select=False, feat_multipl=False, feat_recommend=False, feat_process=False, feat_infer=False, feat_system=False, feat_deep=False, feat_represent=False, feat_pattern=False, feat_robust=False, feat_recognit=False, feat_fast=False, feat_local=False, feat_activ=False, feat_adapt=False, feat_bayesian=False, feat_rank=False, feat_applic=False, feat_distribut=False, feat_kernel=False, feat_dynam=False, feat_factor=False, feat_label=

In [19]:
df_node_venue_keywords = (
    df_edges_published_in.select(
        F.col('src').alias('paper_id'),
        F.col('dst').alias('venue_id'),
    ).join(
        df_kw_doc,
        F.col('paper_id') == F.col('id'),
        'left'
    ).groupby('venue_id').agg(
        F.collect_list('keywords').alias('keywords')
    ).withColumn('keywords', F.array_distinct(F.flatten('keywords')))
)
df_node_venue_keywords.head(5)

[Row(venue_id=8, keywords=['sword', 'workloadawar', 'data', 'placement', 'replica', 'select', 'cloud', 'manag', 'system', 'survey', 'largescal', 'analyt', 'queri', 'process', 'mapreduc', 'take', 'big', 'pictur', 'repres', 'skylin', 'base', 'signific', 'divers', 'calibr', 'trajectori', 'spatiotempor', 'similar', 'analysi', 'window', 'pqgram', 'approxim', 'join', 'datacentr', 'xml', 'sort', 'network', 'fpgas', 'revers', 'engin', 'outsourc', 'shortest', 'distanc', 'comput', 'privaci', 'protect', 'high', 'effici', 'qualiti', 'larg', 'graph', 'match', 'hop', 'reachabl', 'hybrid', 'entiti', 'cluster', 'use', 'crowd', 'topk', 'web', 'applic', 'acm', 'scalabl', 'parallel', 'extract', 'frequent', 'pattern', 'long', 'sequenc', 'extrem', 'movingobject', 'updat', 'workload', 'main', 'memori', 'scope', 'databas', 'meet', 'snapshot', 'continu', 'nearest', 'neighbor', 'haloop', 'approach', 'iter', 'andes', 'evalu', 'nottwig', 'relat', 'increment', 'resolut', 'rule', 'conformityawar', 'influenc', 'max

In [20]:
df_node_venue_feats = (
    df_node_venue_keywords.select(['venue_id'] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
)
df_node_venue_feats.head(5)

[Row(venue_id=8, feat_learn=True, feat_model=True, feat_data=True, feat_network=True, feat_use=True, feat_detect=True, feat_imag=True, feat_optim=True, feat_via=True, feat_social=True, feat_graph=True, feat_effici=True, feat_cluster=True, feat_classif=True, feat_predict=True, feat_search=True, feat_base=True, feat_analysi=True, feat_queri=True, feat_algorithm=True, feat_featur=True, feat_approach=True, feat_estim=True, feat_structur=True, feat_mine=True, feat_spars=True, feat_onlin=True, feat_inform=True, feat_object=True, feat_select=True, feat_multipl=True, feat_recommend=True, feat_process=True, feat_infer=True, feat_system=True, feat_deep=True, feat_represent=True, feat_pattern=True, feat_robust=True, feat_recognit=True, feat_fast=True, feat_local=True, feat_activ=True, feat_adapt=True, feat_bayesian=True, feat_rank=True, feat_applic=True, feat_distribut=True, feat_kernel=False, feat_dynam=True, feat_factor=True, feat_label=True, feat_topic=True, feat_probabilist=True, feat_matrix=

In [21]:
df_nodes_venues_new = df_nodes_venues.join(
    df_node_venue_feats,
    F.col('id') == F.col('venue_id'),
    'left'
).drop('venue_id').sort('id')

df_nodes_venues_new.head(5)

[Row(name='aaai', id=0, feat_learn=True, feat_model=True, feat_data=True, feat_network=True, feat_use=True, feat_detect=True, feat_imag=True, feat_optim=True, feat_via=True, feat_social=True, feat_graph=True, feat_effici=True, feat_cluster=True, feat_classif=True, feat_predict=True, feat_search=True, feat_base=True, feat_analysi=True, feat_queri=True, feat_algorithm=True, feat_featur=True, feat_approach=True, feat_estim=True, feat_structur=True, feat_mine=True, feat_spars=True, feat_onlin=True, feat_inform=True, feat_object=True, feat_select=True, feat_multipl=True, feat_recommend=True, feat_process=True, feat_infer=True, feat_system=True, feat_deep=True, feat_represent=True, feat_pattern=True, feat_robust=True, feat_recognit=True, feat_fast=True, feat_local=True, feat_activ=True, feat_adapt=True, feat_bayesian=True, feat_rank=True, feat_applic=True, feat_distribut=True, feat_kernel=True, feat_dynam=True, feat_factor=True, feat_label=True, feat_topic=True, feat_probabilist=True, feat_m

## Save data

In [22]:
df_nodes_authors_new.write.parquet(DATASET.processed_str('nodes_Author'), mode='overwrite')
df_nodes_venues_new.write.parquet(DATASET.processed_str('nodes_Venue'), mode='overwrite')
df_nodes_papers.write.parquet(DATASET.processed_str('nodes_Paper'), mode='overwrite')

df_edges_authored.write.parquet(DATASET.processed_str('edges_AUTHORED'), mode='overwrite')
df_edges_published_in.write.parquet(DATASET.processed_str('edges_PUBLISHED_IN'), mode='overwrite')

                                                                                

In [23]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema('Author', NodeSchema.from_spark(df_nodes_authors_new.schema, label='name'))
        .add_node_schema('Venue', NodeSchema.from_spark(df_nodes_venues_new.schema, label='name'))
        .add_node_schema('Paper', NodeSchema.from_spark(df_nodes_papers.schema, label='title', timestamp='timestamp', interaction=False))
        .add_edge_schema('AUTHORED', EdgeSchema.from_spark(df_edges_authored.schema, source_type='Author', target_type='Paper', directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('PUBLISHED_IN', EdgeSchema.from_spark(df_edges_published_in.schema, source_type='Paper', target_type='Venue', directed=True, timestamp='timestamp', interaction=False))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/data/pella/projects/University/Thesis/Thesis/code/storage/datasets/processed/DBLP-HCN'), nodes={'Author': NodeSchema(_type='Author', _schema=..., label='name', properties={'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'feat_learn': GraphProperty(_name='feat_learn', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_model': GraphProperty(_name='feat_model', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_data': GraphProperty(_name='feat_data', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_network': GraphProperty(_name='feat_network', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_use': GraphProperty(_name='feat_use', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_detect': GraphProperty(_name='feat_

# Ground truth communities
Using same methodology as in:

J. Yang and J. Leskovec, “Defining and evaluating network communities based on ground-truth,” in Proceedings of the ACM SIGKDD Workshop on Mining Data Semantics, New York, NY, USA, Aug. 2012, pp. 1–8. doi: 10.1145/2350190.2350193.


In [24]:
print(df_nodes_papers.groupby('venue').count().head(100))
print(df_nodes_papers.filter(F.col('venue').isNull()).head(5))

[Row(venue='www', count=64), Row(venue='icde', count=299), Row(venue='dasfaa', count=189), Row(venue='pkdd', count=240), Row(venue='icdm', count=474), Row(venue='aaai', count=1041), Row(venue='vldb', count=94), Row(venue='nips', count=689), Row(venue='icml', count=462), Row(venue='sdm', count=192), Row(venue='cvpr', count=898), Row(venue='ijcai', count=461), Row(venue='pakdd', count=210), Row(venue='ecir', count=198)]
[]


In [25]:
df_nodes_venues_comm = (
    df_nodes_venues
        .select(
        F.col('id').alias('id'),
        F.col('id').alias('cid'),
    )
        .distinct()
)
df_nodes_venues_comm.head(5)

[Row(id=0, cid=0),
 Row(id=1, cid=1),
 Row(id=2, cid=2),
 Row(id=3, cid=3),
 Row(id=4, cid=4)]

In [26]:
df_nodes_papers_comm = (
    df_nodes_papers
        .select(
        F.col('id').alias('id'),
        F.col('venue').alias('cid'),
    )
        .distinct()
)
df_nodes_papers_comm.head(5)

[Row(id=0, cid='nips'),
 Row(id=1, cid='nips'),
 Row(id=2, cid='nips'),
 Row(id=3, cid='icde'),
 Row(id=4, cid='nips')]

In [27]:
df_nodes_authors_comm = (
    df_nodes_authors
        .join(df_edges_authored, F.col('id') == F.col('src'), 'left')
        .drop('src')
        .withColumnRenamed('dst', 'paper_id')
        .join(df_edges_published_in, F.col('paper_id') == F.col('src'), 'left')
        .drop('src')
        .withColumnRenamed('dst', 'cid')
        .select(
        F.col('id').alias('id'),
        F.col('cid').alias('cid'),
    )
        .distinct()
)
df_nodes_authors_comm.head(5)

[Row(id=26, cid=1),
 Row(id=29, cid=0),
 Row(id=474, cid=7),
 Row(id=474, cid=5),
 Row(id=964, cid=1)]

In [28]:
df_nodes_comm = (
    df_nodes_authors_comm
        .union(df_nodes_papers_comm)
        .union(df_nodes_venues_comm)
        .distinct()
)
df_nodes_comm.head(5)

                                                                                

[Row(id=4590, cid='1'),
 Row(id=1950, cid='11'),
 Row(id=2040, cid='1'),
 Row(id=2453, cid='0'),
 Row(id=3091, cid='8')]

In [29]:
df_nodes_comm.coalesce(1).write.csv(
    DATASET.processed_str('ground_truth.comlist.tmp'),
    sep='\t',
    mode='overwrite',
    header=None,
    quoteAll=True
)

                                                                                

In [30]:
import shutil

shutil.move(
    str(next(DATASET.processed('ground_truth.comlist.tmp').glob('*.csv'))),
    DATASET.processed_str('ground_truth.ncomlist')
)

shutil.rmtree(DATASET.processed_str('ground_truth.comlist.tmp'), ignore_errors=True)