In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local packages take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.schema import DatasetSchema

In [3]:
# Load the schema descriptor for the 'ucidata-zachary' dataset. The cells below
# use DATASET to resolve raw and processed file locations.
DATASET = DatasetSchema.load_schema('ucidata-zachary')

In [4]:
# Build (or reuse) the Spark session for this preprocessing run. The app name is
# derived from the dataset, and the legacy time-parser policy is enabled.
builder = SparkSession.builder.appName(f'{DATASET}_preprocess')
builder = builder.config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
spark = builder.getOrCreate()

22/01/23 00:26:41 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/23 00:26:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/23 00:26:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/23 00:26:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/23 00:26:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Explicit schema for the raw edge list: each line holds two integer node ids.
# Passing it to the reader names the columns 'src' and 'dst' directly, so no
# rename from the default '_c0'/'_c1' column names is needed afterwards.
schema = T.StructType([
    T.StructField('src', T.IntegerType(), False),
    T.StructField('dst', T.IntegerType(), False),
])

# The raw file is space-separated and has no header row.
df = spark.read.csv(
    DATASET.raw_str('karate_club.csv'),
    sep=' ',
    header=False,
    schema=schema,
)
df.head(5)

                                                                                

[Row(src=0, dst=1),
 Row(src=0, dst=2),
 Row(src=0, dst=3),
 Row(src=0, dst=4),
 Row(src=0, dst=5)]

In [6]:
# Node table: every id that occurs as either endpoint of an edge, listed once.
src_ids = df.select(F.col('src').alias('id'))
dst_ids = df.select(F.col('dst').alias('id'))
df_nodes = src_ids.union(dst_ids).distinct()
df_nodes.head(5)

[Row(id=31), Row(id=28), Row(id=26), Row(id=27), Row(id=22)]

In [7]:
# Edge table: the (src, dst) pairs with exact duplicate rows removed.
# NOTE(review): distinct() keeps a reversed pair (dst, src) as a separate row;
# confirm the raw file lists each undirected edge only once.
df_edges = df.select('src', 'dst').distinct()
df_edges.head(5)

[Row(src=22, dst=33),
 Row(src=4, dst=10),
 Row(src=5, dst=16),
 Row(src=19, dst=33),
 Row(src=0, dst=13)]

In [8]:
# Persist the node and edge tables as parquet, replacing any previous output.
df_nodes.write.mode('overwrite').parquet(DATASET.processed_str('nodes_Pupil'))
df_edges.write.mode('overwrite').parquet(DATASET.processed_str('edges_INTERACTS'))

                                                                                

In [9]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

# Describe the graph: 'Pupil' nodes labelled by 'id', connected by undirected
# 'INTERACTS' edges. Node and edge schemas are built from the Spark schemas of
# the tables above.
graph_schema = GraphSchema()
graph_schema = graph_schema.add_node_schema(
    'Pupil',
    NodeSchema.from_spark(df_nodes.schema, label='id'),
)
graph_schema = graph_schema.add_edge_schema(
    'INTERACTS',
    EdgeSchema.from_spark(
        df_edges.schema,
        source_type='Pupil',
        target_type='Pupil',
        directed=False,
    ),
)
# Save under the dataset's processed location; the call's return value is left
# as the final expression so the cell displays it.
graph_schema.save_schema(DATASET.processed())

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/datasets/data/processed/ucidata-zachary'), nodes={'Pupil': NodeSchema(_type='Pupil', _schema=..., label='id', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False))}, dynamic=None)}, edges={'INTERACTS': EdgeSchema(_type='INTERACTS', _schema=..., label=None, properties={'src': GraphProperty(_name='src', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'dst': GraphProperty(_name='dst', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False))}, dynamic=None, source_type='Pupil', target_type='Pupil', directed=False)})

In [10]:
from shared.graph import coms_to_comlist, write_comlist
import json

# Read the ground-truth metadata. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default text encoding.
with open(DATASET.raw_str('karate_club.json'), encoding='utf-8') as f:
    meta = json.load(f)

# Key each entry of the 'communities' list by its position in that list.
coms = dict(enumerate(meta['communities']))

# Convert the mapping with the project helper and write it alongside the other
# processed outputs.
comlist = coms_to_comlist(coms)
write_comlist(comlist, DATASET.processed_str('ground_truth.ncomlist'))

In [11]:
# Persist the dataset schema object itself.
# NOTE(review): presumably this records the processed artefacts written above;
# confirm against DatasetSchema.save_schema in shared.schema.
DATASET.save_schema()