In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [28]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('social-distancing-student')
DATASET.save_schema()

In [29]:
spark = (SparkSession.builder
         .appName(str(DATASET))
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

In [30]:
DATE_FORMAT = "EEE MMM dd HH:mm:ss '+0000' yyyy"
probs_schema = T.StructType([
    T.StructField('SUPPORTS', T.FloatType(), False),
    T.StructField('IRRELEVANT', T.FloatType(), False),
    T.StructField('REJECTS', T.FloatType(), False),
])

df = (
    spark.read.format('org.apache.spark.sql.json')
        .load(DATASET.raw_str('social-distancing-student.json'))
        .withColumn('created_at', F.to_timestamp(F.col('created_at'), 'EEE MMM dd HH:mm:ss \'+0000\' yyyy'))
        .withColumn('probabilities_social_distancing', F.from_json('probabilities_social_distancing', probs_schema))
).cache()
df.head(1)

22/03/23 00:08:55 WARN CacheManager: Asked to cache already cached data.        


[Row(created_at=datetime.datetime(2020, 9, 30, 22, 1, 2), entities=Row(hashtags=[], media=None, symbols=[], urls=[Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1311425852169744385', indices=[117, 140], url='https://t.co/EA6OO7cWR5')], user_mentions=[Row(id=2493701, id_str='2493701', indices=[0, 5], name='NU.nl', screen_name='NUnl')]), full_text='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op heeft en wel afstand houden. ik ben SH en heb erg last van mensen die te snel praten en te zacht. Wat doe ik hier aan. Gewoon maar vragen denk ik?', id_str='1311425852169744385', in_reply_to_status_id_str='1311352289928581120', in_reply_to_user_id_str='2493701', label_social_distancing='SUPPORTS', probabilities_social_distancing=Row(SUPPORTS=0.9821312427520752, IRRELEVANT=0.014853371307253838, REJECTS=0.003045313758775592), quoted_status_id_str='', sentiment=-0.6, text='@NUnl Waarom ni

In [31]:
users_tweet_df = df.select(
    'user.name', 'user.screen_name', 'user.id_str'
)
user_mentions_df = (
    df.select(F.explode('entities.user_mentions').alias('user'))
        .select(
        'user.name',
        'user.screen_name',
        'user.id_str'
    )
)
df_nodes_users_tmp = (
    users_tweet_df
        .union(user_mentions_df)
        .select(
            'name',
            'screen_name',
            F.col('id_str').alias('id'),
        )
        .dropDuplicates(['id'])
)
df_nodes_users_tmp.head(5)

[Row(name='DLRP-Magic.com', screen_name='DLRP_Magic', id='100002919'),
 Row(name='Russel⁷ ✜ +×', screen_name='HoldOnSoobin', id='1000331230565617666'),
 Row(name='abkuijer❌❌❌', screen_name='abkuijer', id='10003862'),
 Row(name='Koen', screen_name='koen0612', id='1000414316238172163'),
 Row(name='Wouter van Embden', screen_name='EmbdenWouter', id='1000464536082747393')]

In [32]:
df_nodes_hashtags_tmp = (
    df
        .select(F.explode('entities.hashtags').alias('hashtag'))
        .select(
        F.col('hashtag.text').alias('id'),
        F.col('hashtag.text').alias('name')
    )
        .dropDuplicates(['id'])
)
df_nodes_hashtags_tmp.head(5)

[Row(id='040fungi', name='040fungi'),
 Row(id='11stedenMcDrive', name='11stedenMcDrive'),
 Row(id='140ktober', name='140ktober'),
 Row(id='150km', name='150km'),
 Row(id='1dagniet', name='1dagniet')]

In [33]:
df_nodes_tweets = df.select(
    F.col('id_str').alias('id'),
    'text',
    'label_social_distancing',
    F.col('probabilities_social_distancing.SUPPORTS').alias('feat_supports'),
    F.col('probabilities_social_distancing.IRRELEVANT').alias('feat_irrelevant'),
    F.col('probabilities_social_distancing.REJECTS').alias('feat_rejects'),
    F.col('sentiment').alias('feat_sentiment'),
    F.col('created_at').alias('timestamp')
)
df_nodes_tweets.head(5)

[Row(id='1311425852169744385', text='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op h… https://t.co/EA6OO7cWR5', label_social_distancing='SUPPORTS', feat_supports=0.9821312427520752, feat_irrelevant=0.014853371307253838, feat_rejects=0.003045313758775592, feat_sentiment=-0.6, timestamp=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(id='1311426330488180736', text='hoe meer werklozen er komen. Hoe meer bedrijven kapot gaan, en zo kan ik nog uren doorgaan. \n\nBuiten dit om, ik sna… https://t.co/8LGwaSPock', label_social_distancing='SUPPORTS', feat_supports=0.9024366736412048, feat_irrelevant=8.047425944823772e-05, feat_rejects=0.09751284122467041, feat_sentiment=-0.6, timestamp=datetime.datetime(2020, 9, 30, 22, 2, 56)),
 Row(id='1311426383906844672', text='RT @Rijksoverheid: Dringend advies: draag vanaf vandaag een niet-medisch mondkapje in publieke binnenruimtes zoals winkels, musea en benzin…', label_social_distancing

# Edge Extraction

In [34]:
df_node_ids = (
    df_nodes_users_tmp.select('id')
        .union(df_nodes_hashtags_tmp.select('id'))
        .union(df_nodes_tweets.select('id'))
        .dropDuplicates(['id'])
).cache()


def filter_node_ids(df):
    return df.join(
        df_node_ids,
        F.col('src') == F.col('id'),
        'inner'
    ).drop(
        'id'
    ).join(
        df_node_ids,
        F.col('dst') == F.col('id'),
        'inner'
    ).drop('id')

22/03/23 00:08:56 WARN CacheManager: Asked to cache already cached data.


In [35]:
df_edges_tweeted = filter_node_ids(df.select(
    F.col('user.id_str').alias('src'),
    F.col('id_str').alias('dst'),
    F.col('created_at').alias('timestamp')
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter(
    "dst != '' AND src != ''"
).distinct())

df_edges_tweeted.head(5)
# write_relations(tweeted_ref_df, "TWEETED", ":Twitter:User.id_str:user_id", ":Twitter:Tweet.id_str:tweet_id")

[Row(src='122328607', dst='1311428477023903744', timestamp=datetime.datetime(2020, 9, 30, 22, 11, 28)),
 Row(src='3534886816', dst='1311686424907976706', timestamp=datetime.datetime(2020, 10, 1, 15, 16, 28)),
 Row(src='431739163', dst='1311695956472213506', timestamp=datetime.datetime(2020, 10, 1, 15, 54, 20)),
 Row(src='324968756', dst='1311709328769011719', timestamp=datetime.datetime(2020, 10, 1, 16, 47, 29)),
 Row(src='783981240227594240', dst='1311717211363516421', timestamp=datetime.datetime(2020, 10, 1, 17, 18, 48))]

In [36]:
df_edges_reply_to_user = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.col('in_reply_to_user_id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_reply_to_user.head(5)
# write_relations(reply_ref_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")

[Row(src='1311566282228203521', dst='3133164023', timestamp=datetime.datetime(2020, 10, 1, 7, 19, 4)),
 Row(src='1311627809899642887', dst='58302457', timestamp=datetime.datetime(2020, 10, 1, 11, 23, 33)),
 Row(src='1311628351149420545', dst='2288448844', timestamp=datetime.datetime(2020, 10, 1, 11, 25, 42)),
 Row(src='1311764583615467522', dst='22187256', timestamp=datetime.datetime(2020, 10, 1, 20, 27, 2)),
 Row(src='1311779455883505665', dst='3103511', timestamp=datetime.datetime(2020, 10, 1, 21, 26, 8))]

In [37]:
df_edges_quote_tweet = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.col('quoted_status_id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_quote_tweet.head(5)
# write_relations(quote_tweet_df, "QUOTED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:quoted_tweet_id")

[Row(src='1311643833587953665', dst='1311635271650340870', timestamp=datetime.datetime(2020, 10, 1, 12, 27, 13)),
 Row(src='1311764925304496139', dst='1311755749975089167', timestamp=datetime.datetime(2020, 10, 1, 20, 28, 24)),
 Row(src='1311826032094334976', dst='1311726735638114306', timestamp=datetime.datetime(2020, 10, 2, 0, 31, 13)),
 Row(src='1311646892829351937', dst='1311635271650340870', timestamp=datetime.datetime(2020, 10, 1, 12, 39, 23)),
 Row(src='1311953609991614464', dst='1311951954172575744', timestamp=datetime.datetime(2020, 10, 2, 8, 58, 10))]

In [38]:
df_edges_mentioned_user = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.explode('entities.user_mentions.id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_mentioned_user.head(5)
# write_relations(mention_user_ref_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")

[Row(src='1311566282228203521', dst='3133164023', timestamp=datetime.datetime(2020, 10, 1, 7, 19, 4)),
 Row(src='1311579770908545024', dst='1233135510920495109', timestamp=datetime.datetime(2020, 10, 1, 8, 12, 40)),
 Row(src='1311585502215254017', dst='1103569408822267905', timestamp=datetime.datetime(2020, 10, 1, 8, 35, 26)),
 Row(src='1311591131424141312', dst='15200788', timestamp=datetime.datetime(2020, 10, 1, 8, 57, 48)),
 Row(src='1311596859748765701', dst='15595333', timestamp=datetime.datetime(2020, 10, 1, 9, 20, 34))]

In [39]:
df_edges_reply_to_tweet = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.col('in_reply_to_status_id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_reply_to_tweet.head(5)
# write_relations(reply_tweet_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:original_tweet_id")

[Row(src='1311601566256893953', dst='1311594779067797505', timestamp=datetime.datetime(2020, 10, 1, 9, 39, 16)),
 Row(src='1311639847627522050', dst='1311631160720719872', timestamp=datetime.datetime(2020, 10, 1, 12, 11, 23)),
 Row(src='1311943541124407303', dst='1311928390409228289', timestamp=datetime.datetime(2020, 10, 2, 8, 18, 9)),
 Row(src='1311943738189647872', dst='1311936390045855744', timestamp=datetime.datetime(2020, 10, 2, 8, 18, 56)),
 Row(src='1311685445370228744', dst='1311684421511254017', timestamp=datetime.datetime(2020, 10, 1, 15, 12, 34))]

In [40]:
df_edges_mentioned_hashtag = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.explode('entities.hashtags.text').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_mentioned_hashtag.head(5)
# write_relations(hashtag_mention_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Hashtag.id_str:hashtag_id")

[Row(src='1311578039311048705', dst='horecasluiten', timestamp=datetime.datetime(2020, 10, 1, 8, 5, 47)),
 Row(src='1311693208095797248', dst='mondkapjesadvies', timestamp=datetime.datetime(2020, 10, 1, 15, 43, 25)),
 Row(src='1311972926607052801', dst='coroNEE', timestamp=datetime.datetime(2020, 10, 2, 10, 14, 55)),
 Row(src='1311984202750009344', dst='Avondklok', timestamp=datetime.datetime(2020, 10, 2, 10, 59, 44)),
 Row(src='1311575723438682112', dst='coronavirus', timestamp=datetime.datetime(2020, 10, 1, 7, 56, 35))]

In [41]:
df_user_ids = df_nodes_users_tmp.select('id')

df_followers = (
    spark.read.text(DATASET.raw_str('followers'), wholetext=False, pathGlobFilter='*.txt')
        .select(
        F.regexp_extract(F.input_file_name(), r'([0-9]+)%20([A-z0-9%]+).txt$', 1).alias('user_id'),
        F.col('value').alias('follower_id'),
    )
).cache()

df_followers.head(5)

                                                                                

[Row(user_id='56377143', follower_id='2687493770'),
 Row(user_id='56377143', follower_id='1443138398928183298'),
 Row(user_id='56377143', follower_id='480680728'),
 Row(user_id='56377143', follower_id='801927938304315392'),
 Row(user_id='56377143', follower_id='279492619')]

In [42]:
df_edges_follows = filter_node_ids(df_followers.join(
    df_user_ids.alias('a'), F.col('follower_id') == F.col('a.id'), 'inner'
).join(
    df_user_ids.alias('b'), F.col('user_id') == F.col('b.id'), 'inner'
).select(
    F.col('follower_id').alias('src'),
    F.col('user_id').alias('dst')
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_follows.head(5)
# write_relations(df_followers, "FOLLOWS", ":Twitter:User.id_str:follower_id", ":Twitter:User.id_str:user_id")

                                                                                

[Row(src='1034004099950362625', dst='56377143'),
 Row(src='714542887040630784', dst='56377143'),
 Row(src='400296650', dst='56377143'),
 Row(src='2576050703', dst='56377143'),
 Row(src='138394987', dst='56377143')]

# Feature Engineering

In [43]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
import nltk

nltk.download('stopwords')

df_kw_clean = df_nodes_tweets.select('id', (F.lower(F.regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('text')))

tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
df_kw_tokens = tokenizer.transform(df_kw_clean).select('id', 'words_token')

stopwordList = nltk.corpus.stopwords.words('dutch')
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean', stopWords=stopwordList)
df_kw_nostopw = remover.transform(df_kw_tokens).select('id', 'words_clean')

# Stem text
stemmer = SnowballStemmer(language='dutch')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], T.ArrayType(T.StringType()))
df_kw_stemmed = df_kw_nostopw.withColumn("words_stemmed", stemmer_udf("words_clean")).select('id', 'words_stemmed')

filter_length_udf = F.udf(lambda row: [x for x in row if len(x) >= 3], T.ArrayType(T.StringType()))
df_kw_doc = df_kw_stemmed.select('id', F.array_distinct(filter_length_udf(F.col('words_stemmed'))).alias('keywords'))

df_kw_doc.head(10)

[nltk_data] Downloading package stopwords to /home/egordm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                                                                                

[Row(id='1311425852169744385', keywords=['nunl', 'waarom', 'dov', 'men', 'mondkapj', 'deg', 'spreekt', 'monkapj', 'twee', 'httpstcoeaoocwr']),
 Row(id='1311426330488180736', keywords=['werkloz', 'kom', 'bedrijv', 'kapot', 'gan', 'uren', 'doorgan', 'buit', 'sna', 'httpstcolgwaspock']),
 Row(id='1311426383906844672', keywords=['rijksover', 'dringend', 'advies', 'drag', 'vanaf', 'vandag', 'nietmedisch', 'mondkapj', 'publiek', 'binnenruimtes', 'zoal', 'winkel', 'musea', 'benzin']),
 Row(id='1311426584893624322', keywords=['natassavas', 'georg', 'soros', 'bocht', 'pompt', 'even', 'ker', 'rac', 'nieuw', 'ovj', 'los', 'angeles', 'uiteraard']),
 Row(id='1311426630020202498', keywords=['zuidplas', 'dringend', 'advies', 'drag', 'vanaf', 'vandag', 'nietmedisch', 'mondkapj', 'publiek', 'binnenruimtes', 'zoal', 'winkel', 'musea', 'benzinestat']),
 Row(id='1311426978390716417', keywords=['vrgroning', 'dringend', 'advies', 'drag', 'vanaf', 'vandag', 'nietmedisch', 'mondkapj', 'publiek', 'binnenruimte

In [44]:
df_kw = df_kw_doc\
    .select(F.explode('keywords').alias('keyword'))\
    .groupBy('keyword')\
    .count()\
    .orderBy('count', ascending=False)

df_kw.show(10)



+----------+-----+
|   keyword|count|
+----------+-----+
|   afstand|17514|
|     meter|12283|
|      houd|12134|
|  mondkapj| 5150|
|      mens| 4960|
|      drag| 3710|
| anderhalv| 3465|
|       wel| 3335|
|      hand| 3198|
|mondkapjes| 2970|
+----------+-----+
only showing top 10 rows



                                                                                

In [45]:
k = 80
top_keywords = {x.keyword for x in df_kw.limit(k).select('keyword').collect()}
print(top_keywords)



{'echt', 'wel', 'goed', 'drag', 'houdt', 'publiek', 'covid', 'gan', 'lockdown', 'hand', 'masker', 'jij', 'dringend', 'regel', 'mens', 'wek', 'mondmasker', 'houd', 'mondkapj', 'mogelijk', 'wij', 'advies', 'elkar', 'meter', 'stuk', 'binnenruimtes', 'iederen', 'hel', 'horeca', 'blijv', 'allen', 'hou', 'musea', 'binn', 'tuss', 'anderhalv', 'vanaf', 'wer', 'rivm', 'blijf', 'besmet', 'onz', 'gat', 'zorg', 'volgen', 'even', 'nos', 'schol', 'buit', 'ander', 'zit', 'wass', 'zoal', 'zie', 'afstand', 'thuis', 'werk', 'verspreid', 'sted', 'grot', 'mee', 'mag', 'all', 'wet', 'waarom', 'maatregel', 'corona', 'minpres', 'lat', 'mak', 'test', 'denk', 'moet', 'net', 'vandag', 'mondkapjes', 'winkel', 'kom', 'gewon', 'war'}


                                                                                

In [46]:
filter_freq_udf = F.udf(lambda row: [kw for kw in row if kw in top_keywords], T.ArrayType(T.StringType()))
df_freq_kw_doc = df_kw_doc.select('id', F.explode(filter_freq_udf(F.col('keywords'))).alias('keyword')).cache()
df_freq_kw_doc.head(5)

[Row(id='1311425852169744385', keyword='waarom'),
 Row(id='1311425852169744385', keyword='mondkapj'),
 Row(id='1311426330488180736', keyword='kom'),
 Row(id='1311426330488180736', keyword='gan'),
 Row(id='1311426330488180736', keyword='buit')]

In [47]:
df_hashtag_feats = df_edges_mentioned_hashtag.select('src', 'dst')\
    .join(df_freq_kw_doc.alias('a'), F.col('src') == F.col('a.id'), 'inner')\
    .groupby('dst')\
    .agg(F.collect_list('keyword').alias('keywords'))\
    .select([F.col('dst').alias('hashtag_id')] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
df_hashtag_feats.head(5)

                                                                                

[Row(hashtag_id='040fungi', feat_echt=False, feat_wel=False, feat_goed=False, feat_drag=False, feat_houdt=False, feat_publiek=False, feat_covid=False, feat_gan=False, feat_lockdown=False, feat_hand=False, feat_masker=False, feat_jij=False, feat_dringend=False, feat_regel=False, feat_mens=False, feat_wek=False, feat_mondmasker=False, feat_houd=False, feat_mondkapj=False, feat_mogelijk=False, feat_wij=False, feat_advies=False, feat_elkar=False, feat_meter=True, feat_stuk=False, feat_binnenruimtes=False, feat_iederen=False, feat_hel=False, feat_horeca=False, feat_blijv=False, feat_allen=False, feat_hou=False, feat_musea=False, feat_binn=False, feat_tuss=False, feat_anderhalv=False, feat_vanaf=False, feat_wer=False, feat_rivm=False, feat_blijf=False, feat_besmet=False, feat_onz=False, feat_gat=False, feat_zorg=False, feat_volgen=False, feat_even=False, feat_nos=False, feat_schol=False, feat_buit=False, feat_ander=False, feat_zit=False, feat_wass=False, feat_zoal=False, feat_zie=False, feat

In [48]:
df_nodes_hashtags = df_nodes_hashtags_tmp\
    .join(df_hashtag_feats, F.col('id') == F.col('hashtag_id'), 'left')
df_nodes_hashtags.head(5)

[Row(id='040fungi', name='040fungi', hashtag_id='040fungi', feat_echt=False, feat_wel=False, feat_goed=False, feat_drag=False, feat_houdt=False, feat_publiek=False, feat_covid=False, feat_gan=False, feat_lockdown=False, feat_hand=False, feat_masker=False, feat_jij=False, feat_dringend=False, feat_regel=False, feat_mens=False, feat_wek=False, feat_mondmasker=False, feat_houd=False, feat_mondkapj=False, feat_mogelijk=False, feat_wij=False, feat_advies=False, feat_elkar=False, feat_meter=True, feat_stuk=False, feat_binnenruimtes=False, feat_iederen=False, feat_hel=False, feat_horeca=False, feat_blijv=False, feat_allen=False, feat_hou=False, feat_musea=False, feat_binn=False, feat_tuss=False, feat_anderhalv=False, feat_vanaf=False, feat_wer=False, feat_rivm=False, feat_blijf=False, feat_besmet=False, feat_onz=False, feat_gat=False, feat_zorg=False, feat_volgen=False, feat_even=False, feat_nos=False, feat_schol=False, feat_buit=False, feat_ander=False, feat_zit=False, feat_wass=False, feat_

In [49]:
df_user_feats = df_edges_tweeted.select('src', 'dst')\
    .join(df_freq_kw_doc.alias('a'), F.col('dst') == F.col('a.id'), 'inner')\
    .groupby('src')\
    .agg(F.collect_list('keyword').alias('keywords'))\
    .select([F.col('src').alias('user_id')] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
df_user_feats.head(5)

[Row(user_id='100002919', feat_echt=False, feat_wel=False, feat_goed=False, feat_drag=False, feat_houdt=True, feat_publiek=False, feat_covid=False, feat_gan=False, feat_lockdown=False, feat_hand=False, feat_masker=False, feat_jij=False, feat_dringend=False, feat_regel=False, feat_mens=True, feat_wek=False, feat_mondmasker=False, feat_houd=False, feat_mondkapj=False, feat_mogelijk=False, feat_wij=False, feat_advies=False, feat_elkar=False, feat_meter=True, feat_stuk=False, feat_binnenruimtes=False, feat_iederen=False, feat_hel=False, feat_horeca=True, feat_blijv=False, feat_allen=True, feat_hou=False, feat_musea=False, feat_binn=False, feat_tuss=True, feat_anderhalv=False, feat_vanaf=False, feat_wer=False, feat_rivm=False, feat_blijf=False, feat_besmet=False, feat_onz=False, feat_gat=True, feat_zorg=False, feat_volgen=False, feat_even=False, feat_nos=False, feat_schol=False, feat_buit=False, feat_ander=False, feat_zit=False, feat_wass=False, feat_zoal=False, feat_zie=False, feat_afstand

In [50]:
df_nodes_users = df_nodes_users_tmp\
    .join(df_user_feats, F.col('id') == F.col('user_id'), 'left')
df_nodes_users.head(5)

                                                                                

[Row(name='Russel⁷ ✜ +×', screen_name='HoldOnSoobin', id='1000331230565617666', user_id=None, feat_echt=None, feat_wel=None, feat_goed=None, feat_drag=None, feat_houdt=None, feat_publiek=None, feat_covid=None, feat_gan=None, feat_lockdown=None, feat_hand=None, feat_masker=None, feat_jij=None, feat_dringend=None, feat_regel=None, feat_mens=None, feat_wek=None, feat_mondmasker=None, feat_houd=None, feat_mondkapj=None, feat_mogelijk=None, feat_wij=None, feat_advies=None, feat_elkar=None, feat_meter=None, feat_stuk=None, feat_binnenruimtes=None, feat_iederen=None, feat_hel=None, feat_horeca=None, feat_blijv=None, feat_allen=None, feat_hou=None, feat_musea=None, feat_binn=None, feat_tuss=None, feat_anderhalv=None, feat_vanaf=None, feat_wer=None, feat_rivm=None, feat_blijf=None, feat_besmet=None, feat_onz=None, feat_gat=None, feat_zorg=None, feat_volgen=None, feat_even=None, feat_nos=None, feat_schol=None, feat_buit=None, feat_ander=None, feat_zit=None, feat_wass=None, feat_zoal=None, feat_z

# Saving The Data

In [51]:
df_nodes_users.write.parquet(DATASET.processed_str('nodes_User'), mode='overwrite')
df_nodes_hashtags.write.parquet(DATASET.processed_str('nodes_Hashtag'), mode='overwrite')
df_nodes_tweets.write.parquet(DATASET.processed_str('nodes_Tweet'), mode='overwrite')

                                                                                

In [52]:
df_edges_tweeted.write.parquet(DATASET.processed_str('edges_TWEETED'), mode='overwrite')
df_edges_reply_to_user.write.parquet(DATASET.processed_str('edges_REPLIES_TO_USER'), mode='overwrite')
df_edges_reply_to_tweet.write.parquet(DATASET.processed_str('edges_REPLIES_TO_TWEET'), mode='overwrite')
df_edges_quote_tweet.write.parquet(DATASET.processed_str('edges_QUOTES_TWEET'), mode='overwrite')
df_edges_mentioned_user.write.parquet(DATASET.processed_str('edges_MENTIONS_USER'), mode='overwrite')
df_edges_mentioned_hashtag.write.parquet(DATASET.processed_str('edges_MENTIONS_HASHTAG'), mode='overwrite')
df_edges_follows.write.parquet(DATASET.processed_str('edges_FOLLOWS'), mode='overwrite')

                                                                                

In [53]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema('User', NodeSchema.from_spark(df_nodes_users.schema, label='name'))
        .add_node_schema('Hashtag', NodeSchema.from_spark(df_nodes_hashtags.schema, label='name'))
        .add_node_schema('Tweet', NodeSchema.from_spark(df_nodes_tweets.schema, label='text', timestamp='timestamp', interaction=False))
        .add_edge_schema('TWEETED', EdgeSchema.from_spark(df_edges_tweeted.schema, source_type='User', target_type='Tweet', directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('REPLIES_TO_USER', EdgeSchema.from_spark(df_edges_reply_to_user.schema, source_type='Tweet', target_type='User', directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('REPLIES_TO_TWEET', EdgeSchema.from_spark(df_edges_reply_to_tweet.schema, source_type='Tweet', target_type='Tweet', directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('QUOTES_TWEET', EdgeSchema.from_spark(df_edges_quote_tweet.schema, source_type='Tweet', target_type='Tweet', directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('MENTIONS_USER', EdgeSchema.from_spark(df_edges_mentioned_user.schema, source_type='Tweet', target_type='User', directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('MENTIONS_HASHTAG', EdgeSchema.from_spark(df_edges_mentioned_hashtag.schema, source_type='Tweet', target_type='Hashtag', directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('FOLLOWS', EdgeSchema.from_spark(df_edges_follows.schema, source_type='User', target_type='User', directed=True))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/data/pella/projects/University/Thesis/Thesis/code/storage/datasets/processed/social-distancing-student'), nodes={'User': NodeSchema(_type='User', _schema=..., label='name', properties={'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'screen_name': GraphProperty(_name='screen_name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'user_id': GraphProperty(_name='user_id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'feat_echt': GraphProperty(_name='feat_echt', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_wel': GraphProperty(_name='feat_wel', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_goed': GraphProperty(_name='feat_goed', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_drag': GraphProperty(_na