In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession


In [3]:
from shared.schema import DatasetSchema

# Dataset descriptor for 'sx-mathoverflow'; later cells use it to resolve raw
# and processed paths (raw_str / processed_str / processed).
DATASET = DatasetSchema.load_schema('sx-mathoverflow')
# NOTE(review): presumably writes the just-loaded descriptor back to its
# canonical location — confirm against shared.schema.DatasetSchema.
DATASET.save_schema()

In [4]:
# Spark session for this notebook. The application name is the string form of
# the dataset descriptor; the remaining options are applied to the builder in
# the order they are listed below.
spark_settings = {
    'spark.sql.legacy.timeParserPolicy': 'LEGACY',
    "spark.executor.memory": "8g",
    "spark.driver.memory": "8g",
    "spark.memory.offHeap.enabled": True,
    "spark.memory.offHeap.size": "16g",
}

session_builder = SparkSession.builder.appName(f'{DATASET}')
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

22/01/22 22:42:11 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/22 22:42:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/22 22:42:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/22 22:42:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/22 22:42:13 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/01/22 22:42:13 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/01/22 22:42:13 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [5]:
# Column layout shared by the three raw edge files: two integer endpoints and
# a long `timestamp`, every field declared non-nullable.
edge_columns = (
    ('src', T.IntegerType()),
    ('dst', T.IntegerType()),
    ('timestamp', T.LongType()),
)
schema = T.StructType([
    T.StructField(column_name, column_type, False)
    for column_name, column_type in edge_columns
])

In [6]:
# Edges later written as ANSWERED_QUESTION: space-separated, header-less rows
# parsed with `schema` (src, dst, epoch-seconds timestamp).
# NOTE: DROPMALFORMED silently discards any row that does not fit `schema`.
df_a2q = (
    spark.read.csv(DATASET.raw_str('sx-mathoverflow-a2q.txt'), sep=' ', header=False, schema=schema,
                   mode='DROPMALFORMED')
        # Cast epoch seconds straight to a timestamp. The previous
        # from_unixtime -> to_timestamp round trip went through a wall-clock
        # string in the session time zone, which is ambiguous during the DST
        # fall-back hour and depended on the legacy time parser.
        .withColumn('timestamp', F.col('timestamp').cast(T.TimestampType()))
)
df_a2q.head(5)

                                                                                

[Row(src=1, dst=4, timestamp=datetime.datetime(2009, 9, 29, 4, 56, 28)),
 Row(src=3, dst=4, timestamp=datetime.datetime(2009, 9, 29, 5, 24, 16)),
 Row(src=1, dst=2, timestamp=datetime.datetime(2009, 9, 29, 7, 36, 52)),
 Row(src=25, dst=1, timestamp=datetime.datetime(2009, 9, 29, 16, 0, 4)),
 Row(src=14, dst=16, timestamp=datetime.datetime(2009, 9, 30, 0, 26, 6))]

In [7]:
# Edges later written as COMMENTED_ON_ANSWER: space-separated, header-less rows
# parsed with `schema` (src, dst, epoch-seconds timestamp).
# NOTE: DROPMALFORMED silently discards any row that does not fit `schema`.
df_c2a = (
    spark.read.csv(DATASET.raw_str('sx-mathoverflow-c2a.txt'), sep=' ', header=False, schema=schema,
                   mode='DROPMALFORMED')
        # Cast epoch seconds straight to a timestamp. The previous
        # from_unixtime -> to_timestamp round trip went through a wall-clock
        # string in the session time zone, which is ambiguous during the DST
        # fall-back hour and depended on the legacy time parser.
        .withColumn('timestamp', F.col('timestamp').cast(T.TimestampType()))
)
df_c2a.head(5)

[Row(src=3, dst=1, timestamp=datetime.datetime(2009, 9, 29, 8, 36, 36)),
 Row(src=1, dst=1, timestamp=datetime.datetime(2009, 9, 29, 9, 0, 2)),
 Row(src=2, dst=1, timestamp=datetime.datetime(2009, 9, 29, 20, 42, 37)),
 Row(src=1, dst=25, timestamp=datetime.datetime(2009, 9, 29, 23, 30, 18)),
 Row(src=1, dst=22, timestamp=datetime.datetime(2009, 9, 30, 3, 12, 32))]

In [8]:
# Edges later written as COMMENTED_ON_QUESTION: space-separated, header-less
# rows parsed with `schema` (src, dst, epoch-seconds timestamp).
# NOTE: DROPMALFORMED silently discards any row that does not fit `schema`.
df_c2q = (
    spark.read.csv(DATASET.raw_str('sx-mathoverflow-c2q.txt'), sep=' ', header=False, schema=schema,
                   mode='DROPMALFORMED')
        # Cast epoch seconds straight to a timestamp. The previous
        # from_unixtime -> to_timestamp round trip went through a wall-clock
        # string in the session time zone, which is ambiguous during the DST
        # fall-back hour and depended on the legacy time parser.
        .withColumn('timestamp', F.col('timestamp').cast(T.TimestampType()))
)
df_c2q.head(5)

[Row(src=1, dst=16, timestamp=datetime.datetime(2009, 9, 30, 2, 43, 41)),
 Row(src=1, dst=2, timestamp=datetime.datetime(2009, 9, 30, 2, 53, 46)),
 Row(src=2, dst=2, timestamp=datetime.datetime(2009, 9, 30, 3, 28, 48)),
 Row(src=1, dst=2, timestamp=datetime.datetime(2009, 9, 30, 4, 4, 30)),
 Row(src=1, dst=28, timestamp=datetime.datetime(2009, 9, 30, 4, 54, 20))]

In [9]:
# Node table: every distinct id that appears as `src` or `dst` in any of the
# three edge frames. Frames and endpoints are visited in a fixed order
# (a2q, c2a, c2q; src before dst) before de-duplicating.
endpoint_ids = [
    edge_frame.select(F.col(endpoint).alias('id'))
    for edge_frame in (df_a2q, df_c2a, df_c2q)
    for endpoint in ('src', 'dst')
]

all_ids = endpoint_ids[0]
for more_ids in endpoint_ids[1:]:
    all_ids = all_ids.union(more_ids)

df_nodes = all_ids.distinct()
df_nodes.count()

                                                                                

24818

In [10]:
# Persist the node table and the three edge tables as parquet under the
# dataset's processed location, replacing any previous output.
processed_tables = (
    ('nodes_User', df_nodes),
    ('edges_ANSWERED_QUESTION', df_a2q),
    ('edges_COMMENTED_ON_QUESTION', df_c2q),
    ('edges_COMMENTED_ON_ANSWER', df_c2a),
)
for table_name, table_frame in processed_tables:
    table_frame.write.parquet(DATASET.processed_str(table_name), mode='overwrite')

                                                                                

In [11]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

# Build the graph schema: one 'User' node type labelled by `id`, plus three
# directed User -> User edge types that all use the `timestamp` column and are
# flagged as interactions. The result is saved next to the processed data, and
# the value returned by save_schema() is the cell's displayed output.
edge_frames = (
    ('ANSWERED_QUESTION', df_a2q),
    ('COMMENTED_ON_QUESTION', df_c2q),
    ('COMMENTED_ON_ANSWER', df_c2a),
)

graph_schema = GraphSchema().add_node_schema('User', NodeSchema.from_spark(df_nodes.schema, label='id'))
for edge_type, edge_frame in edge_frames:
    graph_schema = graph_schema.add_edge_schema(
        edge_type,
        EdgeSchema.from_spark(edge_frame.schema, source_type='User', target_type='User',
                              directed=True, timestamp='timestamp', interaction=True),
    )

graph_schema.save_schema(DATASET.processed())

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/datasets/data/processed/sx-mathoverflow'), nodes={'User': NodeSchema(_type='User', _schema=..., label='id', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False))}, dynamic=None)}, edges={'ANSWERED_QUESTION': EdgeSchema(_type='ANSWERED_QUESTION', _schema=..., label=None, properties={'src': GraphProperty(_name='src', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'dst': GraphProperty(_name='dst', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'timestamp': GraphProperty(_name='timestamp', dtype=DType(atomic=<DTypeAtomic.DATETIME: 'datetime'>, array=False))}, dynamic=DynamicConfig(timestamp='timestamp', interaction=True), source_type='User', target_type='User', directed=True), 'COMMENTED_ON_QUESTION': EdgeSchema(_type='COMMENTED_ON_QUESTION', _schema=..., label=None, properties={'src': GraphProperty(_name='src', dtype=DType(atomic=<DTypeAtomic.IN