In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [53]:
DATASET = DatasetPath('star-wars')

In [54]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

In [55]:
import json

all_edges = []
for file in DATASET.raw().glob('starwars-episode-*ns.json'):
    _, _, episode, link_type = file.stem.split('-')
    with file.open('r') as f:
        data = json.load(f)

    nodes = list(data['nodes'])
    edges = data['links']
    for e in edges:
        all_edges.append({
            'source': nodes[e['source']]['name'],
            'target': nodes[e['target']]['name'],
            'time': int(episode),
            'weight': e['value'],
            'type': link_type
        })

In [56]:
df = spark.createDataFrame(all_edges)
df.head(5)

[Row(source='NUTE GUNRAY', target='QUI-GON', time=1, type='interactions', weight=1),
 Row(source='PK-4', target='TC-14', time=1, type='interactions', weight=1),
 Row(source='OBI-WAN', target='TC-14', time=1, type='interactions', weight=1),
 Row(source='QUI-GON', target='TC-14', time=1, type='interactions', weight=1),
 Row(source='OBI-WAN', target='QUI-GON', time=1, type='interactions', weight=26)]

In [57]:
df_nodes = (
    df
        .select(F.col('source').alias('name'))
        .union(df.select(F.col('target').alias('name')))
        .distinct()
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes.count())
df_nodes.head(5)

113


[Row(name='C-3PO', id=0),
 Row(name='MACE WINDU', id=1),
 Row(name='BOSS NASS', id=2),
 Row(name='PADME', id=3),
 Row(name='DOFINE', id=4)]

In [58]:
df_all_edges = (
    df.join(df_nodes.alias('s'), df.source == F.col('s.name'), 'left')
        .join(df_nodes.alias('t'), df.target ==  F.col('t.name'), 'left')
        .withColumn('src', F.col('s.id'))
        .withColumn('dst', F.col('t.id'))
        .select('time', 'type', 'src', 'dst', 'weight')
        .dropDuplicates(['time', 'src', 'dst', 'type'])
)
print(df_all_edges.count())
df_all_edges.head(5)

1599


[Row(time=1, type='mentions', src=21, dst=5, weight=7),
 Row(time=3, type='mentions', src=56, dst=9, weight=1),
 Row(time=3, type='mentions', src=18, dst=9, weight=1),
 Row(time=4, type='mentions', src=60, dst=59, weight=1),
 Row(time=5, type='interactions', src=61, dst=9, weight=1)]

In [59]:
df_edges_interactions = (
    df_all_edges.filter(F.col('type') == 'interactions')
        .drop('type')
)
print(df_edges_interactions.count())
df_edges_interactions.head(5)

479


[Row(time=5, src=61, dst=9, weight=1),
 Row(time=7, src=61, dst=88, weight=17),
 Row(time=2, src=9, dst=94, weight=5),
 Row(time=4, src=56, dst=62, weight=3),
 Row(time=7, src=90, dst=112, weight=2)]

In [60]:
df_edges_mentions = (
    df_all_edges.filter(F.col('type') == 'mentions')
        .drop('type')
        .distinct()
)
print(df_edges_mentions.count())
df_edges_mentions.head(5)

1120


[Row(time=6, src=12, dst=44, weight=1),
 Row(time=6, src=75, dst=58, weight=1),
 Row(time=4, src=53, dst=64, weight=1),
 Row(time=1, src=5, dst=10, weight=9),
 Row(time=3, src=28, dst=94, weight=12)]

In [61]:
df_nodes.write.parquet(DATASET.processed_str('nodes_Characters'), mode='overwrite')

df_edges_interactions.write.parquet(DATASET.processed_str('edges_INTERACTIONS'), mode='overwrite')
df_edges_mentions.write.parquet(DATASET.processed_str('edges_MENTIONS'), mode='overwrite')

In [62]:
from datasets.build_schema import build_schema

build_schema(
    spark,
    name=str(DATASET),
    nodes=[
        ('Character', DATASET.processed_str('nodes_Characters')),
    ],
    edges=[
        ('InteractsWith', 'Character', 'Character', DATASET.processed_str('edges_INTERACTIONS')),
        ('Mentions', 'Character', 'Character', DATASET.processed_str('edges_MENTIONS')),
    ]
)

[2022-01-12 12:33:39,600][/dd_volume/Development/Python/Thesis/code/datasets/datasets/build_schema.py][DEBUG] Merging old schema for star-wars


DatasetSchema(name='star-wars', prefix='StarWars', database='star-wars', description='None', nodes=[NodeSchema(label='Character', path='data/processed/star-wars/nodes_Characters', properties=[Property(name='name', type='string', ignore=False, label=True), Property(name='id', type='long', ignore=False, label=False)])], edges=[EdgeSchema(type='INTERACTS_WITH', source='Character', target='Character', path='data/processed/star-wars/edges_INTERACTIONS', properties=[Property(name='time', type='long', ignore=False, label=False), Property(name='src', type='long', ignore=False, label=False), Property(name='dst', type='long', ignore=False, label=False), Property(name='weight', type='long', ignore=False, label=False)]), EdgeSchema(type='MENTIONS', source='Character', target='Character', path='data/processed/star-wars/edges_MENTIONS', properties=[Property(name='time', type='long', ignore=False, label=False), Property(name='src', type='long', ignore=False, label=False), Property(name='dst', type='l