In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [13]:
# Whether custom features should be added to the dataset.
# For this you need to run datasets/notebooks/development/import-sw-data.ipynb first
ADD_FEATURES=False

In [4]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('star-wars')
DATASET.save_schema()

In [5]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/09 17:56:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
import json

all_edges = []
for file in DATASET.raw().glob('starwars-episode-*ns.json'):
    _, _, episode, link_type = file.stem.split('-')
    with file.open('r') as f:
        data = json.load(f)

    nodes = list(data['nodes'])
    edges = data['links']
    for e in edges:
        all_edges.append({
            'source': nodes[e['source']]['name'],
            'target': nodes[e['target']]['name'],
            'time': int(episode),
            'weight': e['value'],
            'type': link_type
        })

In [7]:
df = spark.createDataFrame(all_edges)
df.head(5)

                                                                                

[Row(source='CAMIE', target='LUKE', time=4, type='interactions', weight=2),
 Row(source='BIGGS', target='CAMIE', time=4, type='interactions', weight=2),
 Row(source='BIGGS', target='LUKE', time=4, type='interactions', weight=4),
 Row(source='DARTH VADER', target='LEIA', time=4, type='interactions', weight=1),
 Row(source='BERU', target='LUKE', time=4, type='interactions', weight=3)]

In [8]:
df_nodes_tmp = (
    df
        .select(F.col('source').alias('name'))
        .union(df.select(F.col('target').alias('name')))
        .distinct()
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_tmp.count())
df_nodes_tmp.head(5)

                                                                                

113


[Row(name='C-3PO', id=0),
 Row(name='JERJERROD', id=1),
 Row(name='BERU', id=2),
 Row(name='LANDO', id=3),
 Row(name='CAMIE', id=4)]

In [9]:
df_all_edges = (
    df.join(df_nodes_tmp.alias('s'), df.source == F.col('s.name'), 'left')
        .join(df_nodes_tmp.alias('t'), df.target ==  F.col('t.name'), 'left')
        .withColumn('src', F.col('s.id'))
        .withColumn('dst', F.col('t.id'))
        .select('time', 'type', 'src', 'dst', 'weight')
        .dropDuplicates(['time', 'src', 'dst', 'type'])
)
print(df_all_edges.count())
df_all_edges.head(5)

1599


[Row(time=2, type='mentions', src=25, dst=24, weight=24),
 Row(time=2, type='mentions', src=25, dst=91, weight=1),
 Row(time=3, type='mentions', src=25, dst=11, weight=2),
 Row(time=3, type='mentions', src=39, dst=22, weight=1),
 Row(time=4, type='mentions', src=8, dst=24, weight=36)]

In [10]:
df_edges_interactions = (
    df_all_edges.filter(F.col('type') == 'interactions')
        .drop('type')
)
print(df_edges_interactions.count())
df_edges_interactions.head(5)

479


[Row(time=1, src=24, dst=45, weight=1),
 Row(time=4, src=0, dst=28, weight=2),
 Row(time=1, src=25, dst=77, weight=2),
 Row(time=2, src=36, dst=99, weight=2),
 Row(time=5, src=8, dst=105, weight=1)]

In [11]:
df_edges_mentions = (
    df_all_edges.filter(F.col('type') == 'mentions')
        .drop('type')
        .distinct()
)
print(df_edges_mentions.count())
df_edges_mentions.head(5)

1120


[Row(time=3, src=12, dst=86, weight=1),
 Row(time=7, src=47, dst=53, weight=1),
 Row(time=3, src=84, dst=27, weight=6),
 Row(time=2, src=21, dst=99, weight=2),
 Row(time=4, src=12, dst=8, weight=28)]

## Feature Engineering

In [14]:
if ADD_FEATURES:
    from pymongo import MongoClient
    import unicodedata
    import pandas as pd

In [15]:
if ADD_FEATURES:
    client = MongoClient("mongodb://root:helloworld@127.0.0.1/wiki.starwars?authSource=admin")
    collection = client.wiki.wookiepedia.characters

In [16]:
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

if ADD_FEATURES:
    db_items_df = pd.DataFrame(collection.find({}, {"title": 1, 'text': 1})).set_index("_id")
    db_items_df['text'] = db_items_df['text'].str.lower().apply(strip_accents)
    db_items_df['title'] = db_items_df['title'].str.lower().apply(strip_accents)
    db_items_df

In [17]:
VOCAB = {
    'HAN': 'Han Solo',
    'KYLO REN': 'Ben Solo',
    'REY': 'Rey Skywalker',
    'EMPEROR': 'Darth Sidious',
    'DARTH VADER': 'Anakin Skywalker',
    'FODE/BEED': 'Fodesinbeed Annodue',
}

In [18]:
if ADD_FEATURES:
    import Levenshtein

def distance(a, b):
    return Levenshtein.jaro_winkler(a.lower(), b.lower())

def find_match(title):
    title = title.lower().replace('count', '').replace('senator', '')\
        .replace('captain', '').replace('admiral', '').replace('general', '')\
        .replace('darth', '').replace('colonel', '').replace('clone', '')

    candidates = db_items_df[db_items_df['title'].apply(lambda x: title in x.lower())]

    # if len(candidates) == 0:
    #     candidate_idx = db_items_df['text'].apply(lambda x: x.count(title.lower())).argsort().head(10)
    #     candidates = db_items_df.iloc[candidate_idx]

    if len(candidates) == 0:
        candidates = db_items_df

    idx = candidates['title'].apply(lambda x: distance(x, title)).argmax()
    match = candidates.iloc[idx]

    return match


In [19]:
if ADD_FEATURES:
    names_df = df_nodes_tmp.toPandas()
    node_details = []
    for index, row in names_df.iterrows():
        name = row['name'] if row['name'] not in VOCAB else VOCAB[row['name']]
        match = find_match(name)
        node_details.append({
            **row.to_dict(),
            'match_title': match['title'],
            'match_id': str(match.name)
        })

    node_details_df = pd.DataFrame(node_details)

In [20]:
if ADD_FEATURES:
    node_details_df

In [21]:
if ADD_FEATURES:
    from bson.objectid import ObjectId
    import numpy as np

    node_props = []
    for item in node_details:
        data = collection.find_one({'_id': ObjectId(item['match_id'])}, {'properties': 1})
        node_props.append({
            'id': item['id'],
            **data['properties']
        })

    def strip_tokens(x):
        if not x or (isinstance(x, float) and np.isnan(x)):
            return []
        return [str(e).strip() for e in x]

    node_details_df = pd.DataFrame(node_props)
    node_details_df.drop(columns=['1', '2', 'kajidic', 'clan', 'armament', 'plating', 'sensor', 'width', 'length', 'cost', 'line', 'manufacturer', 'creator', 'model', 'class', 'image'], inplace=True)
    node_details_df = node_details_df.replace('', np.nan)
    node_details_df['hair'] = node_details_df['hair'].str.split(',|;').apply(strip_tokens)
    node_details_df['eyes'] = node_details_df['skin'].str.split(',|;').apply(strip_tokens)
    node_details_df['cyber'] = node_details_df['cyber'].str.split(',|;').apply(strip_tokens)
    node_details_df['skin'] = node_details_df['skin'].str.split(',|;').apply(strip_tokens)
    node_details_df['masters'] = node_details_df['masters'].str.split(r'\|').apply(strip_tokens)
    node_details_df['apprentices'] = node_details_df['apprentices'].str.split(r'\|').apply(strip_tokens)
    node_details_df['affiliation'] = node_details_df['affiliation'].str.split(r'\|').apply(strip_tokens)
    # node_details_df['height'] = node_details_df['height'].apply(lambda x: x.split()[0] if x and isinstance(x, str) else x)
    # node_details_df['mass'] = node_details_df['mass'].apply(lambda x: x.split()[0] if x and isinstance(x, str) else x)
    node_details_df = node_details_df.astype(object).where(pd.notnull(node_details_df), None)
    node_details_df

In [22]:
if ADD_FEATURES:
    node_details_df['homeworld'] = node_details_df['homeworld'].fillna('Unknown')
    node_details_df['gender'] = node_details_df['gender'].fillna('Unknown')
    node_details_df['species'] = node_details_df['species'].fillna('Unknown')
    node_details_df['type'] = node_details_df['type'].fillna('Unknown')

In [23]:
if ADD_FEATURES:
    from datasets.feature_transform import MultiRareLabelEncoder, MOneHotEncoder

    mrare_encoder = MultiRareLabelEncoder(tol=0.05, n_categories=8)
    mfeature_df = mrare_encoder.fit_transform(node_details_df[['affiliation']])

    oh_encoder = MOneHotEncoder()
    mfeature_df = oh_encoder.fit_transform(mfeature_df[['affiliation']]).add_prefix('feat_')

    mfeature_df

In [24]:
if ADD_FEATURES:
    from feature_engine.encoding import RareLabelEncoder, OneHotEncoder

    rare_encoder = RareLabelEncoder(tol=0.05, n_categories=6)
    feature_df = rare_encoder.fit_transform(node_details_df[['homeworld', 'gender', 'species', 'type']])

    oh_encoder = OneHotEncoder()
    feature_df = oh_encoder.fit_transform(feature_df[['homeworld', 'gender', 'species', 'type']]).add_prefix('feat_')

    feature_df['feat_species_Droid'] = node_details_df['is_droid'].apply(lambda x: 1 if x else 0)
    feature_df['feat_hasMaster'] = node_details_df['masters'].apply(lambda x: 1 if len(x) else 0)
    feature_df['feat_hasApprentices'] = node_details_df['apprentices'].apply(lambda x: 1 if len(x) else 0)
    feature_df['feat_hasCyber'] = node_details_df['cyber'].apply(lambda x: 1 if len(x) else 0)

    feature_df

In [25]:
if ADD_FEATURES:
    node_details_and_features_df = node_details_df.join(feature_df).join(mfeature_df)
    node_details_and_features_df.columns = [c.replace(' ', '') for c in node_details_and_features_df.columns]
    node_details_and_features_df

In [26]:
if ADD_FEATURES:
    df_nodes_feat = spark.createDataFrame(node_details_and_features_df)
    df_nodes_feat = (
        df_nodes_feat
            .withColumn('full_name', F.col('name'))
            .drop('name')
    )
    df_nodes_feat.head(5)

In [28]:
if ADD_FEATURES:
    df_nodes = (
        df_nodes_tmp
            .join(df_nodes_feat, on='id', how='left')
    )
else:
    df_nodes = df_nodes_tmp
df_nodes.head(5)

[Row(name='C-3PO', id=0),
 Row(name='JERJERROD', id=1),
 Row(name='BERU', id=2),
 Row(name='LANDO', id=3),
 Row(name='CAMIE', id=4)]

In [25]:
df_nodes.write.parquet(DATASET.processed_str('nodes_Character'), mode='overwrite')

df_edges_interactions.write.parquet(DATASET.processed_str('edges_INTERACTIONS'), mode='overwrite')
df_edges_mentions.write.parquet(DATASET.processed_str('edges_MENTIONS'), mode='overwrite')

                                                                                

In [26]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema('Character', NodeSchema.from_spark(df_nodes.schema, label='name'))
        .add_edge_schema('INTERACTIONS', EdgeSchema.from_spark(df_edges_interactions.schema, source_type='Character', target_type='Character', directed=False, timestamp='time', interaction=True))
        .add_edge_schema('MENTIONS', EdgeSchema.from_spark(df_edges_mentions.schema, source_type='Character', target_type='Character', directed=True, timestamp='time', interaction=True))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/data/pella/projects/University/Thesis/Thesis/code/storage/datasets/processed/star-wars'), nodes={'Character': NodeSchema(_type='Character', _schema=..., label='name', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'homeworld': GraphProperty(_name='homeworld', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'birth': GraphProperty(_name='birth', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'death': GraphProperty(_name='death', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'height': GraphProperty(_name='height', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'mass': GraphProperty(_name='mass', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'gender': GraphProperty(_name='gender', dtype=DType(atomic=<DTypeAtom