In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all modules
# before each cell executes, so edits to imported modules (e.g. `shared`) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession


In [3]:
from shared.schema import DatasetSchema

# Identifier of the dataset prepared by this notebook.
DATASET_NAME = 'social-distancing-student'

# Dataset handle used by the later cells to resolve raw and processed paths.
DATASET = DatasetSchema.load_schema(DATASET_NAME)
# NOTE(review): presumably writes the loaded schema definition back to disk —
# confirm against shared.schema.DatasetSchema.save_schema.
DATASET.save_schema()

In [4]:
# Session settings, applied to the builder in the order listed here.
# NOTE(review): the LEGACY time parser policy is presumably required by the
# 'EEE MMM dd ...' pattern used to parse `created_at` when loading the tweets.
spark_settings = {
    'spark.sql.legacy.timeParserPolicy': 'LEGACY',
    'spark.executor.memory': '8g',
    'spark.driver.memory': '8g',
    'spark.memory.offHeap.enabled': True,
    'spark.memory.offHeap.size': '16g',
}

# The application is named after the dataset's string representation.
session_builder = SparkSession.builder.appName(str(DATASET))
for setting, value in spark_settings.items():
    session_builder = session_builder.config(setting, value)

spark = session_builder.getOrCreate()

22/01/22 22:35:04 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/22 22:35:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/22 22:35:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/22 22:35:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/22 22:35:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Layout of the raw `created_at` strings; the UTC offset is matched as the
# literal text '+0000' rather than parsed as a zone.
DATE_FORMAT = "EEE MMM dd HH:mm:ss '+0000' yyyy"

# Struct layout used to decode the JSON-encoded `probabilities_social_distancing`
# column: one float per stance label.
probs_schema = T.StructType([
    T.StructField('SUPPORTS', T.FloatType(), False),
    T.StructField('IRRELEVANT', T.FloatType(), False),
    T.StructField('REJECTS', T.FloatType(), False),
])

# Load the raw tweets, parse `created_at` into a timestamp and decode the stance
# probabilities into a struct. The frame is cached because every node/edge
# table below is derived from it.
df = (
    spark.read.format('org.apache.spark.sql.json')
        .load(DATASET.raw_str('social-distancing-student.json'))
        # Use the shared constant instead of repeating the pattern literal.
        .withColumn('created_at', F.to_timestamp(F.col('created_at'), DATE_FORMAT))
        .withColumn('probabilities_social_distancing', F.from_json('probabilities_social_distancing', probs_schema))
).cache()
df.head(1)

                                                                                

[Row(created_at=datetime.datetime(2020, 9, 30, 22, 1, 2), entities=Row(hashtags=[], media=None, symbols=[], urls=[Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1311425852169744385', indices=[117, 140], url='https://t.co/EA6OO7cWR5')], user_mentions=[Row(id=2493701, id_str='2493701', indices=[0, 5], name='NU.nl', screen_name='NUnl')]), full_text='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op heeft en wel afstand houden. ik ben SH en heb erg last van mensen die te snel praten en te zacht. Wat doe ik hier aan. Gewoon maar vragen denk ik?', id_str='1311425852169744385', in_reply_to_status_id_str='1311352289928581120', in_reply_to_user_id_str='2493701', label_social_distancing='SUPPORTS', probabilities_social_distancing=Row(SUPPORTS=0.9821312427520752, IRRELEVANT=0.014853371307253838, REJECTS=0.003045313758775592), quoted_status_id_str='', sentiment=-0.6, text='@NUnl Waarom ni

In [6]:
# Fields kept for every user, whether taken from a tweet's author struct or
# from an exploded mention struct aliased to `user`.
user_fields = ('user.name', 'user.screen_name', 'user.id_str')

# Authors of the tweets.
users_tweet_df = df.select(*user_fields)

# Users mentioned in tweets: one row per mention, exposed under the same
# `user` alias so the same field paths apply.
mentioned_users = df.select(F.explode('entities.user_mentions').alias('user'))
user_mentions_df = mentioned_users.select(*user_fields)

# User nodes: authors and mentioned users combined, one row per user id.
all_users = users_tweet_df.union(user_mentions_df)
df_nodes_users = (
    all_users
        .withColumnRenamed('id_str', 'id')
        .dropDuplicates(['id'])
)
df_nodes_users.head(5)

                                                                                

[Row(name='DLRP-Magic.com', screen_name='DLRP_Magic', id='100002919'),
 Row(name='Russel⁷ ✜ +×', screen_name='HoldOnSoobin', id='1000331230565617666'),
 Row(name='abkuijer❌❌❌', screen_name='abkuijer', id='10003862'),
 Row(name='Koen', screen_name='koen0612', id='1000414316238172163'),
 Row(name='Wouter van Embden', screen_name='EmbdenWouter', id='1000464536082747393')]

In [7]:
# Hashtag nodes: the hashtag text serves as both the node id and its display
# name; one row per distinct text.
hashtag_text = F.col('hashtag.text')
exploded_hashtags = df.select(F.explode('entities.hashtags').alias('hashtag'))

df_nodes_hashtags = (
    exploded_hashtags
        .select(hashtag_text.alias('id'), hashtag_text.alias('name'))
        .dropDuplicates(['id'])
)
df_nodes_hashtags.head(5)

[Row(id='040fungi', name='040fungi'),
 Row(id='11stedenMcDrive', name='11stedenMcDrive'),
 Row(id='140ktober', name='140ktober'),
 Row(id='150km', name='150km'),
 Row(id='1dagniet', name='1dagniet')]

In [8]:
# Flatten the stance-probability struct into one `prob_<label>` column per label.
stance_probabilities = [
    F.col(f'probabilities_social_distancing.{stance}').alias(f'prob_{stance.lower()}')
    for stance in ('SUPPORTS', 'IRRELEVANT', 'REJECTS')
]

# Tweet nodes: id, text, predicted stance label with its probabilities,
# sentiment and creation time.
df_nodes_tweets = df.select(
    F.col('id_str').alias('id'),
    'text',
    'label_social_distancing',
    *stance_probabilities,
    'sentiment',
    'created_at',
)
df_nodes_tweets.head(5)

[Row(id='1311425852169744385', text='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op h… https://t.co/EA6OO7cWR5', label_social_distancing='SUPPORTS', prob_supports=0.9821312427520752, prob_irrelevant=0.014853371307253838, prob_rejects=0.003045313758775592, sentiment=-0.6, created_at=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(id='1311426330488180736', text='hoe meer werklozen er komen. Hoe meer bedrijven kapot gaan, en zo kan ik nog uren doorgaan. \n\nBuiten dit om, ik sna… https://t.co/8LGwaSPock', label_social_distancing='SUPPORTS', prob_supports=0.9024366736412048, prob_irrelevant=8.047425944823772e-05, prob_rejects=0.09751284122467041, sentiment=-0.6, created_at=datetime.datetime(2020, 9, 30, 22, 2, 56)),
 Row(id='1311426383906844672', text='RT @Rijksoverheid: Dringend advies: draag vanaf vandaag een niet-medisch mondkapje in publieke binnenruimtes zoals winkels, musea en benzin…', label_social_distancing='SUPPOR

In [9]:
# Distinct ids of every node (users, hashtags and tweets) in a single column;
# cached because each edge table is filtered against it twice.
user_ids = df_nodes_users.select('id')
hashtag_ids = df_nodes_hashtags.select('id')
tweet_ids = df_nodes_tweets.select('id')

df_node_ids = (
    user_ids
        .union(hashtag_ids)
        .union(tweet_ids)
        .dropDuplicates(['id'])
        .cache()
)


def filter_node_ids(df):
    """Keep only the edges whose endpoints are both known node ids.

    Args:
        df: Edge DataFrame with `src` and `dst` columns.

    Returns:
        The rows of `df` whose `src` and whose `dst` each appear in the `id`
        column of the module-level `df_node_ids`. The helper `id` column
        introduced by each join is dropped again, so the input columns are
        returned unchanged.
    """
    kept = df
    # Inner-join once per endpoint; the joined `id` column is removed before
    # the next join so the name stays unambiguous.
    for endpoint in ('src', 'dst'):
        kept = kept.join(
            df_node_ids,
            F.col(endpoint) == F.col('id'),
            'inner'
        ).drop('id')
    return kept

In [10]:
# TWEETED edges: author (user id) -> tweet id.
tweeted_pairs = df.select(
    F.col('user.id_str').alias('src'),
    F.col('id_str').alias('dst'),
)

df_edges_tweeted = filter_node_ids(
    tweeted_pairs
        # Both endpoints must be present. The original checked `src` twice and
        # never `dst`; this now matches the other edge cells.
        .filter(F.col('src').isNotNull() & F.col('dst').isNotNull())
        # Empty-string ids are treated as missing as well.
        .filter("dst != '' AND src != ''")
        .distinct()
)

df_edges_tweeted.head(5)
# write_relations(tweeted_ref_df, "TWEETED", ":Twitter:User.id_str:user_id", ":Twitter:Tweet.id_str:tweet_id")

                                                                                

[Row(src='1893682442', dst='1311535089738801152'),
 Row(src='161960540', dst='1311556166057701377'),
 Row(src='54533650', dst='1311575637153533952'),
 Row(src='318940031', dst='1311578096525553664'),
 Row(src='3097256176', dst='1311602723868028928')]

In [11]:
# REPLIES_TO_USER edges: replying tweet id -> id of the user being replied to.
reply_to_user_pairs = df.select(
    F.col('id_str').alias('src'),
    F.col('in_reply_to_user_id_str').alias('dst'),
)
# Drop pairs with a null or empty endpoint, then de-duplicate.
reply_to_user_pairs = (
    reply_to_user_pairs
        .where(F.col('src').isNotNull() & F.col('dst').isNotNull())
        .where("dst != '' AND src != ''")
        .distinct()
)
df_edges_reply_to_user = filter_node_ids(reply_to_user_pairs)

df_edges_reply_to_user.head(5)
# write_relations(reply_ref_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")

[Row(src='1311434188172791809', dst='2359279408'),
 Row(src='1311939840024473600', dst='1271166748264476673'),
 Row(src='1311681370356408320', dst='3243614116'),
 Row(src='1311751664207106052', dst='23929278'),
 Row(src='1311594362019753985', dst='105103709')]

In [12]:
# REPLIES_TO_TWEET edges: replying tweet id -> id of the tweet being replied to.
both_endpoints_present = F.col('src').isNotNull() & F.col('dst').isNotNull()

reply_to_tweet_pairs = (
    df.select(
        F.col('id_str').alias('src'),
        F.col('in_reply_to_status_id_str').alias('dst'),
    )
        .where(both_endpoints_present)
        # Empty strings mark "not a reply" in the raw data shown above.
        .where("dst != '' AND src != ''")
        .distinct()
)
df_edges_reply_to_tweet = filter_node_ids(reply_to_tweet_pairs)

df_edges_reply_to_tweet.head(5)
# write_relations(reply_tweet_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:original_tweet_id")

[Row(src='1311601566256893953', dst='1311594779067797505'),
 Row(src='1311676116520837120', dst='1311662390761586688'),
 Row(src='1311638761516793859', dst='1311603550003232775'),
 Row(src='1311951655676645376', dst='1311951306593112065'),
 Row(src='1311616924879552512', dst='1311577791956168709')]

In [13]:
# QUOTES_TWEET edges: quoting tweet id -> quoted tweet id.
quoting_tweet = F.col('id_str').alias('src')
quoted_tweet = F.col('quoted_status_id_str').alias('dst')

quote_pairs = df.select(quoting_tweet, quoted_tweet)
# Keep only pairs where both ids are non-null and non-empty.
quote_pairs = quote_pairs.where(F.col('src').isNotNull() & F.col('dst').isNotNull())
quote_pairs = quote_pairs.where("dst != '' AND src != ''")

df_edges_quote_tweet = filter_node_ids(quote_pairs.distinct())

df_edges_quote_tweet.head(5)
# write_relations(quote_tweet_df, "QUOTED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:quoted_tweet_id")

[Row(src='1311826032094334976', dst='1311726735638114306'),
 Row(src='1311716018193080325', dst='1311629389654904833'),
 Row(src='1311643317596303368', dst='1311635271650340870'),
 Row(src='1311790813807022081', dst='1311786569234354179'),
 Row(src='1311901502609526784', dst='1311755749975089167')]

In [14]:
# MENTIONS_USER edges: tweet id -> id of each user mentioned in that tweet
# (one row per mention via explode).
user_mention_pairs = (
    df.select(
        F.col('id_str').alias('src'),
        F.explode('entities.user_mentions.id_str').alias('dst'),
    )
        .where(F.col('src').isNotNull() & F.col('dst').isNotNull())
        .where("dst != '' AND src != ''")
        # A tweet mentioning the same user twice yields a single edge.
        .distinct()
)
df_edges_mentioned_user = filter_node_ids(user_mention_pairs)

df_edges_mentioned_user.head(5)
# write_relations(mention_user_ref_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")

[Row(src='1311432097303855104', dst='2647131516'),
 Row(src='1311434188172791809', dst='2359279408'),
 Row(src='1311563553091072001', dst='96970572'),
 Row(src='1311595010089943042', dst='39232421'),
 Row(src='1311619916013940738', dst='171084541')]

In [15]:
# MENTIONS_HASHTAG edges: tweet id -> text of each hashtag used in that tweet.
# The hashtag text doubles as the hashtag node id.
hashtag_pairs = df.select(
    F.col('id_str').alias('src'),
    F.explode('entities.hashtags.text').alias('dst'),
)
endpoints_not_null = F.col('src').isNotNull() & F.col('dst').isNotNull()

df_edges_mentioned_hashtag = filter_node_ids(
    hashtag_pairs
        .where(endpoints_not_null)
        .where("dst != '' AND src != ''")
        .distinct()
)

df_edges_mentioned_hashtag.head(5)
# write_relations(hashtag_mention_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Hashtag.id_str:hashtag_id")

[Row(src='1311578039311048705', dst='mondkapjes'),
 Row(src='1311701292864221184', dst='MONDKAPPENNOU'),
 Row(src='1311902401323044867', dst='leadbyexample'),
 Row(src='1311567232980398081', dst='Influenzavirus'),
 Row(src='1311633023931215874', dst='mondkapjes')]

In [16]:
# Ids of all known user nodes; used to restrict FOLLOWS edges to known users.
df_user_ids = df_nodes_users.select('id')

# Follower files are named "<user id> <name>.txt"; Spark reports the path
# URI-encoded, so the space appears as "%20". Group 1 captures the user id.
# The character class spells out letters, digits, "_" and "%" explicitly
# (the previous `A-z` range also spanned the punctuation between "Z" and "a"),
# and the dot before "txt" is escaped so it only matches a literal ".".
# NOTE(review): assumes the name part never contains a raw `[`, `\`, `]`, `^`
# or backtick — these are expected to be percent-encoded in the reported path.
FOLLOWER_FILE_PATTERN = r'([0-9]+)%20([A-Za-z0-9_%]+)\.txt$'

# One row per (user, follower): each line of a follower file is a follower id,
# and the followed user's id is recovered from the file name.
df_followers = (
    spark.read.text(DATASET.raw_str('followers'), wholetext=False, pathGlobFilter='*.txt')
        .select(
            F.regexp_extract(F.input_file_name(), FOLLOWER_FILE_PATTERN, 1).alias('user_id'),
            F.col('value').alias('follower_id'),
        )
).cache()

df_followers.head(5)

                                                                                

[Row(user_id='56377143', follower_id='2687493770'),
 Row(user_id='56377143', follower_id='1443138398928183298'),
 Row(user_id='56377143', follower_id='480680728'),
 Row(user_id='56377143', follower_id='801927938304315392'),
 Row(user_id='56377143', follower_id='279492619')]

In [17]:
# FOLLOWS edges: follower id -> followed user id, restricted to pairs where
# both sides are known user nodes.
follower_is_known_user = F.col('follower_id') == F.col('a.id')
followed_is_known_user = F.col('user_id') == F.col('b.id')

follow_pairs = (
    df_followers
        .join(df_user_ids.alias('a'), follower_is_known_user, 'inner')
        .join(df_user_ids.alias('b'), followed_is_known_user, 'inner')
        .select(
            F.col('follower_id').alias('src'),
            F.col('user_id').alias('dst'),
        )
        # Drop null or empty endpoints, then de-duplicate.
        .where(F.col('src').isNotNull() & F.col('dst').isNotNull())
        .where("dst != '' AND src != ''")
        .distinct()
)
df_edges_follows = filter_node_ids(follow_pairs)

df_edges_follows.head(5)
# write_relations(df_followers, "FOLLOWS", ":Twitter:User.id_str:follower_id", ":Twitter:User.id_str:user_id")

                                                                                

[Row(src='1034004099950362625', dst='56377143'),
 Row(src='714542887040630784', dst='56377143'),
 Row(src='400296650', dst='56377143'),
 Row(src='2576050703', dst='56377143'),
 Row(src='138394987', dst='56377143')]

In [18]:
# Persist each node table as parquet under "nodes_<Type>", replacing any
# previous output.
node_tables = {
    'User': df_nodes_users,
    'Hashtag': df_nodes_hashtags,
    'Tweet': df_nodes_tweets,
}
for node_type, node_frame in node_tables.items():
    node_frame.write.parquet(DATASET.processed_str(f'nodes_{node_type}'), mode='overwrite')

                                                                                

In [19]:
# Persist each edge table as parquet under "edges_<TYPE>", replacing any
# previous output.
edge_tables = {
    'TWEETED': df_edges_tweeted,
    'REPLIES_TO_USER': df_edges_reply_to_user,
    'REPLIES_TO_TWEET': df_edges_reply_to_tweet,
    'QUOTES_TWEET': df_edges_quote_tweet,
    'MENTIONS_USER': df_edges_mentioned_user,
    'MENTIONS_HASHTAG': df_edges_mentioned_hashtag,
    'FOLLOWS': df_edges_follows,
}
for edge_type, edge_frame in edge_tables.items():
    edge_frame.write.parquet(DATASET.processed_str(f'edges_{edge_type}'), mode='overwrite')

                                                                                

In [21]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

# Describe the graph that was just written: three node types and seven
# directed edge types, each derived from the Spark schema of its table.
graph_schema = GraphSchema()

graph_schema = graph_schema.add_node_schema(
    'User', NodeSchema.from_spark(df_nodes_users.schema, label='name'))
graph_schema = graph_schema.add_node_schema(
    'Hashtag', NodeSchema.from_spark(df_nodes_hashtags.schema, label='name'))
graph_schema = graph_schema.add_node_schema(
    'Tweet', NodeSchema.from_spark(df_nodes_tweets.schema, label='text', timestamp='created_at', interaction=False))

# (edge type, edge table, source node type, target node type), registered in
# this order.
edge_specs = [
    ('TWEETED', df_edges_tweeted, 'User', 'Tweet'),
    ('REPLIES_TO_USER', df_edges_reply_to_user, 'Tweet', 'User'),
    ('REPLIES_TO_TWEET', df_edges_reply_to_tweet, 'Tweet', 'Tweet'),
    ('QUOTES_TWEET', df_edges_quote_tweet, 'Tweet', 'Tweet'),
    ('MENTIONS_USER', df_edges_mentioned_user, 'Tweet', 'User'),
    ('MENTIONS_HASHTAG', df_edges_mentioned_hashtag, 'Tweet', 'Hashtag'),
    ('FOLLOWS', df_edges_follows, 'User', 'User'),
]
for edge_type, edge_frame, source_type, target_type in edge_specs:
    graph_schema = graph_schema.add_edge_schema(
        edge_type,
        EdgeSchema.from_spark(edge_frame.schema, source_type=source_type, target_type=target_type, directed=True),
    )

# Left as the final expression so the saved schema is displayed as the cell output.
graph_schema.save_schema(DATASET.processed())

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/datasets/data/processed/social-distancing-student'), nodes={'User': NodeSchema(_type='User', _schema=..., label='name', properties={'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'screen_name': GraphProperty(_name='screen_name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False))}, dynamic=None), 'Hashtag': NodeSchema(_type='Hashtag', _schema=..., label='name', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False))}, dynamic=None), 'Tweet': NodeSchema(_type='Tweet', _schema=..., label='text', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)),