In [1]:
import shutil
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [3]:
DATASET = DatasetPath('com-youtube')

In [4]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

22/01/21 23:04:08 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/21 23:04:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/21 23:04:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
schema = T.StructType([
    T.StructField('src', T.IntegerType(), False),
    T.StructField('dst', T.IntegerType(), False),
])

df = (
    spark.read.csv(DATASET.raw_str('com-youtube.ungraph.txt'), sep='\t', header=False, schema=schema, mode='DROPMALFORMED')
)
print(df.head(5))
print(df.count())

                                                                                

[Row(src=1, dst=2), Row(src=1, dst=3), Row(src=1, dst=4), Row(src=1, dst=5), Row(src=1, dst=6)]




2987628


                                                                                

In [6]:
df_nodes = (
    df.select(F.col('src').alias('id'))
        .union(df.select(F.col('dst').alias('id')))
        .distinct()
)
df_nodes.head(5)

                                                                                

[Row(id=148), Row(id=463), Row(id=471), Row(id=496), Row(id=1088)]

In [7]:
df_edges = (
    df.select(F.col('src'), F.col('dst'))
        .distinct()
)
df_edges.head(5)

                                                                                

[Row(src=2, dst=3386),
 Row(src=2, dst=3468),
 Row(src=2, dst=3696),
 Row(src=4, dst=5135),
 Row(src=4, dst=17394)]

In [8]:
df_nodes.write.parquet(DATASET.processed_str('nodes_User'), mode='overwrite')
df_edges.write.parquet(DATASET.processed_str('edges_FRIENDS'), mode='overwrite')

                                                                                

In [21]:
from datasets.formats import read_coms, coms_to_comlist, write_comlist

coms = read_coms(DATASET.raw_str('com-youtube.top5000.cmty.txt'))
comlist = coms_to_comlist(coms)

write_comlist(comlist, DATASET.processed_str('ground_truth.comlist'))

In [23]:
from datasets.build_schema import build_schema

build_schema(
    spark,
    name=str(DATASET),
    nodes=[
        ('User', DATASET.processed_str('nodes_User')),
    ],
    edges=[
        ('Friends', 'User', 'User', DATASET.processed_str('edges_FRIENDS')),
    ],
    ground_truth=str(DATASET.processed_str('ground_truth.comlist'))
)

[2022-01-21 23:31:59,797][/dd_volume/Development/Python/Thesis/code/datasets/datasets/build_schema.py][DEBUG] Merging old schema for com-youtube


DatasetSchema(name='com-youtube', prefix='ComYoutube', database='com-youtube', description='None', nodes=[NodeSchema(path='data/processed/com-youtube/nodes_User', properties=[Property(name='id', type='int', ignore=False, label=True, timestamp=False)], label='User', interaction=False)], edges=[EdgeSchema(path='data/processed/com-youtube/edges_FRIENDS', properties=[Property(name='src', type='int', ignore=False, label=False, timestamp=False), Property(name='dst', type='int', ignore=False, label=False, timestamp=False)], type='FRIENDS', source='User', target='User', directed=False, interaction=False)], ground_truth=PosixPath('data/processed/com-youtube/ground_truth.comlist'), versions={})