In [1]:
# Reload imported project modules automatically before each cell runs, so
# edits to local code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [3]:
# Handle for the 'sx-superuser' dataset; later cells call raw_str() and
# processed_str() on it to resolve input and output locations.
DATASET = DatasetPath('sx-superuser')

In [4]:
# Session options applied on top of Spark's defaults, kept as data so they are
# easy to scan and tune in one place.
spark_settings = [
    ('spark.sql.legacy.timeParserPolicy', 'LEGACY'),
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
    ("spark.memory.offHeap.enabled", True),
    ("spark.memory.offHeap.size", "16g"),
]

# Name the application after the dataset, then apply each option in turn.
session_builder = SparkSession.builder.appName(f'{DATASET}')
for setting, value in spark_settings:
    session_builder = session_builder.config(setting, value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

22/01/03 00:46:46 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/03 00:46:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/03 00:46:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/03 00:46:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/03 00:46:48 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Shared layout of the three raw edge files: two integer endpoint columns and
# a long integer timestamp column, all declared non-nullable.
schema = (
    T.StructType()
    .add('src', T.IntegerType(), False)
    .add('dst', T.IntegerType(), False)
    .add('timestamp', T.LongType(), False)
)

In [6]:
# Load the answer-to-question edge list: a space-separated, headerless file
# read with the shared (src, dst, timestamp) schema. Rows that do not parse
# against the schema are dropped (DROPMALFORMED).
df_a2q = (
    spark.read.csv(
        DATASET.raw_str('sx-superuser-a2q.txt'),
        sep=' ',
        header=False,
        schema=schema,
        mode='DROPMALFORMED',
    )
    # Cast the epoch-seconds value straight to a timestamp. Formatting it to a
    # local wall-clock string and parsing it back is lossy: two instants one
    # hour apart inside a DST fall-back window render to the same string and
    # would collapse to a single timestamp. The numeric cast involves no
    # string pattern and no time-zone round trip.
    .withColumn('timestamp', F.col('timestamp').cast(T.TimestampType()))
)
# Peek at the first rows as a sanity check.
df_a2q.head(5)

                                                                                

[Row(src=55, dst=13, timestamp=datetime.datetime(2009, 7, 15, 16, 17, 21)),
 Row(src=8, dst=13, timestamp=datetime.datetime(2009, 7, 15, 16, 17, 32)),
 Row(src=24, dst=13, timestamp=datetime.datetime(2009, 7, 15, 16, 17, 41)),
 Row(src=27, dst=13, timestamp=datetime.datetime(2009, 7, 15, 16, 18, 23)),
 Row(src=46, dst=13, timestamp=datetime.datetime(2009, 7, 15, 16, 19, 14))]

In [7]:
# Load the comment-on-answer edge list: a space-separated, headerless file
# read with the shared (src, dst, timestamp) schema. Rows that do not parse
# against the schema are dropped (DROPMALFORMED).
df_c2a = (
    spark.read.csv(
        DATASET.raw_str('sx-superuser-c2a.txt'),
        sep=' ',
        header=False,
        schema=schema,
        mode='DROPMALFORMED',
    )
    # Cast the epoch-seconds value straight to a timestamp. Formatting it to a
    # local wall-clock string and parsing it back is lossy: two instants one
    # hour apart inside a DST fall-back window render to the same string and
    # would collapse to a single timestamp. The numeric cast involves no
    # string pattern and no time-zone round trip.
    .withColumn('timestamp', F.col('timestamp').cast(T.TimestampType()))
)
# Peek at the first rows as a sanity check.
df_c2a.head(5)

[Row(src=17, dst=45675, timestamp=datetime.datetime(2009, 7, 15, 16, 18, 49)),
 Row(src=55, dst=55, timestamp=datetime.datetime(2009, 7, 15, 16, 31, 43)),
 Row(src=63, dst=76, timestamp=datetime.datetime(2009, 7, 15, 16, 31, 55)),
 Row(src=63, dst=24, timestamp=datetime.datetime(2009, 7, 15, 16, 35, 4)),
 Row(src=63, dst=137, timestamp=datetime.datetime(2009, 7, 15, 16, 36, 11))]

In [8]:
# Load the comment-on-question edge list: a space-separated, headerless file
# read with the shared (src, dst, timestamp) schema. Rows that do not parse
# against the schema are dropped (DROPMALFORMED).
df_c2q = (
    spark.read.csv(
        DATASET.raw_str('sx-superuser-c2q.txt'),
        sep=' ',
        header=False,
        schema=schema,
        mode='DROPMALFORMED',
    )
    # Cast the epoch-seconds value straight to a timestamp. Formatting it to a
    # local wall-clock string and parsing it back is lossy: two instants one
    # hour apart inside a DST fall-back window render to the same string and
    # would collapse to a single timestamp. The numeric cast involves no
    # string pattern and no time-zone round trip.
    .withColumn('timestamp', F.col('timestamp').cast(T.TimestampType()))
)
# Peek at the first rows as a sanity check.
df_c2q.head(5)

[Row(src=14, dst=26, timestamp=datetime.datetime(2009, 7, 15, 16, 20, 40)),
 Row(src=62, dst=49, timestamp=datetime.datetime(2009, 7, 15, 16, 37, 19)),
 Row(src=120, dst=38, timestamp=datetime.datetime(2009, 7, 15, 16, 38, 30)),
 Row(src=101, dst=49, timestamp=datetime.datetime(2009, 7, 15, 16, 39, 21)),
 Row(src=166, dst=172, timestamp=datetime.datetime(2009, 7, 15, 16, 44, 41))]

In [9]:
# One single-column ('id') frame per endpoint column of each edge table,
# in the order a2q, c2a, c2q and src before dst within each table.
endpoint_ids = [
    edges.select(F.col(endpoint).alias('id'))
    for edges in (df_a2q, df_c2a, df_c2q)
    for endpoint in ('src', 'dst')
]

# Stack all endpoint ids, then keep each id once to obtain the node set.
df_nodes = endpoint_ids[0]
for ids in endpoint_ids[1:]:
    df_nodes = df_nodes.union(ids)
df_nodes = df_nodes.distinct()

# Number of distinct ids across all three edge tables.
df_nodes.count()

                                                                                

194085

In [10]:
# Persist the node table followed by the three edge tables as parquet,
# replacing any output left by a previous run.
parquet_outputs = [
    (df_nodes, 'nodes_User'),
    (df_a2q, 'edges_ANSWERED_QUESTION'),
    (df_c2q, 'edges_COMMENTED_ON_QUESTION'),
    (df_c2a, 'edges_COMMENTED_ON_ANSWER'),
]
for frame, folder in parquet_outputs:
    frame.write.parquet(DATASET.processed_str(folder), mode='overwrite')

                                                                                

In [11]:
from datasets.build_schema import build_schema

# Node entries are (label, parquet path) pairs.
node_specs = [
    ('User', DATASET.processed_str('nodes_User')),
]

# Edge entries are (type, source label, target label, parquet path); every
# interaction in this dataset connects a User to a User.
edge_specs = [
    (edge_type, 'User', 'User', DATASET.processed_str(folder))
    for edge_type, folder in (
        ('AnsweredQuestion', 'edges_ANSWERED_QUESTION'),
        ('CommentedOnQuestion', 'edges_COMMENTED_ON_QUESTION'),
        ('CommentedOnAnswer', 'edges_COMMENTED_ON_ANSWER'),
    )
]

# Kept as the cell's last expression so the returned schema is displayed.
build_schema(spark, name=str(DATASET), nodes=node_specs, edges=edge_specs)

DatasetSchema(name='sx-superuser', prefix='SxSuperuser', database='sx-superuser', description=None, nodes=[NodeSchema(label='User', path='data/processed/sx-superuser/nodes_User', properties=[Property(name='id', type='int', ignore=False, label=False)])], edges=[EdgeSchema(type='ANSWERED_QUESTION', source='User', target='User', path='data/processed/sx-superuser/edges_ANSWERED_QUESTION', properties=[Property(name='src', type='int', ignore=False, label=False), Property(name='dst', type='int', ignore=False, label=False), Property(name='timestamp', type='datetime', ignore=False, label=False)]), EdgeSchema(type='COMMENTED_ON_QUESTION', source='User', target='User', path='data/processed/sx-superuser/edges_COMMENTED_ON_QUESTION', properties=[Property(name='src', type='int', ignore=False, label=False), Property(name='dst', type='int', ignore=False, label=False), Property(name='timestamp', type='datetime', ignore=False, label=False)]), EdgeSchema(type='COMMENTED_ON_ANSWER', source='User', target=