In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [50]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('com-youtube')
DATASET.save_schema()

In [51]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

In [52]:
schema = T.StructType([
    T.StructField('src', T.IntegerType(), False),
    T.StructField('dst', T.IntegerType(), False),
])

df = (
    spark.read.csv(DATASET.raw_str('com-youtube.ungraph.txt'), sep='\t', header=False, schema=schema,
                   mode='DROPMALFORMED')
)
print(df.head(5))
print(df.count())

[Row(src=1, dst=2), Row(src=1, dst=3), Row(src=1, dst=4), Row(src=1, dst=5), Row(src=1, dst=6)]
2987628


In [53]:
df_nodes = (
    df.select(F.col('src').alias('id'))
        .union(df.select(F.col('dst').alias('id')))
        .distinct()
        .filter(F.col('id').isNotNull())
)
df_nodes.schema['id'].nullable = False
df_nodes.head(5)

                                                                                

[Row(id=148), Row(id=463), Row(id=471), Row(id=496), Row(id=1088)]

In [54]:
df_edges = (
    df.select(F.col('src'), F.col('dst'))
        .distinct()
)
df_edges.schema['src'].nullable = False
df_edges.schema['dst'].nullable = False
df_edges.head(5)

                                                                                

[Row(src=2, dst=3386),
 Row(src=2, dst=3468),
 Row(src=2, dst=3696),
 Row(src=4, dst=5135),
 Row(src=4, dst=17394)]

In [55]:
df_nodes.write.parquet(DATASET.processed_str('nodes_User'), mode='overwrite')
df_edges.write.parquet(DATASET.processed_str('edges_FRIENDS'), mode='overwrite')

                                                                                

In [56]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema(
        'User', NodeSchema.from_spark(df_nodes.schema, label='id'),
    )
        .add_edge_schema(
        'FRIENDS', EdgeSchema.from_spark(df_edges.schema, source_type='User', target_type='User', directed=False),
    )
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/datasets/data/processed/com-youtube'), nodes={'User': NodeSchema(_type='User', _schema=..., label='id', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False))}, dynamic=None)}, edges={'FRIENDS': EdgeSchema(_type='FRIENDS', _schema=..., label=None, properties={'src': GraphProperty(_name='src', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'dst': GraphProperty(_name='dst', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False))}, dynamic=None, source_type='User', target_type='User', directed=False)})

In [59]:
from shared.graph import CommunityAssignment

coms = CommunityAssignment.load_comms(DATASET.raw_str('com-youtube.top5000.cmty.txt'))
coms.named = False
coms.save_comlist(DATASET.processed_str('ground_truth.ncomlist'))