In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [3]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('star-wars')
DATASET.save_schema()

In [4]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/03 22:55:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/03 22:55:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
import json

all_edges = []
for file in DATASET.raw().glob('starwars-episode-*ns.json'):
    _, _, episode, link_type = file.stem.split('-')
    with file.open('r') as f:
        data = json.load(f)

    nodes = list(data['nodes'])
    edges = data['links']
    for e in edges:
        all_edges.append({
            'source': nodes[e['source']]['name'],
            'target': nodes[e['target']]['name'],
            'time': int(episode),
            'weight': e['value'],
            'type': link_type
        })

In [6]:
df = spark.createDataFrame(all_edges)
df.head(5)

                                                                                

[Row(source='CAMIE', target='LUKE', time=4, type='interactions', weight=2),
 Row(source='BIGGS', target='CAMIE', time=4, type='interactions', weight=2),
 Row(source='BIGGS', target='LUKE', time=4, type='interactions', weight=4),
 Row(source='DARTH VADER', target='LEIA', time=4, type='interactions', weight=1),
 Row(source='BERU', target='LUKE', time=4, type='interactions', weight=3)]

In [258]:
df_nodes_tmp = (
    df
        .select(F.col('source').alias('name'))
        .union(df.select(F.col('target').alias('name')))
        .distinct()
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_tmp.count())
df_nodes_tmp.head(5)

113


[Row(name='C-3PO', id=0),
 Row(name='JERJERROD', id=1),
 Row(name='BERU', id=2),
 Row(name='LANDO', id=3),
 Row(name='CAMIE', id=4)]

In [259]:
df_all_edges = (
    df.join(df_nodes_tmp.alias('s'), df.source == F.col('s.name'), 'left')
        .join(df_nodes_tmp.alias('t'), df.target ==  F.col('t.name'), 'left')
        .withColumn('src', F.col('s.id'))
        .withColumn('dst', F.col('t.id'))
        .select('time', 'type', 'src', 'dst', 'weight')
        .dropDuplicates(['time', 'src', 'dst', 'type'])
)
print(df_all_edges.count())
df_all_edges.head(5)

1599


[Row(time=2, type='mentions', src=25, dst=24, weight=24),
 Row(time=2, type='mentions', src=25, dst=91, weight=1),
 Row(time=3, type='mentions', src=25, dst=11, weight=2),
 Row(time=3, type='mentions', src=39, dst=22, weight=1),
 Row(time=4, type='mentions', src=8, dst=24, weight=36)]

In [9]:
df_edges_interactions = (
    df_all_edges.filter(F.col('type') == 'interactions')
        .drop('type')
)
print(df_edges_interactions.count())
df_edges_interactions.head(5)

479


[Row(time=1, src=24, dst=45, weight=1),
 Row(time=4, src=0, dst=28, weight=2),
 Row(time=1, src=25, dst=77, weight=2),
 Row(time=2, src=36, dst=99, weight=2),
 Row(time=5, src=8, dst=105, weight=1)]

In [10]:
df_edges_mentions = (
    df_all_edges.filter(F.col('type') == 'mentions')
        .drop('type')
        .distinct()
)
print(df_edges_mentions.count())
df_edges_mentions.head(5)

1120


[Row(time=3, src=12, dst=86, weight=1),
 Row(time=7, src=47, dst=53, weight=1),
 Row(time=3, src=84, dst=27, weight=6),
 Row(time=2, src=21, dst=99, weight=2),
 Row(time=4, src=12, dst=8, weight=28)]

In [210]:
from pymongo import MongoClient
import unicodedata
import pandas as pd

In [211]:
client = MongoClient("mongodb://root:helloworld@127.0.0.1/wiki.starwars?authSource=admin")
collection = client.wiki.wookiepedia.characters

In [212]:
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

db_items_df = pd.DataFrame(collection.find({}, {"title": 1, 'text': 1})).set_index("_id")
db_items_df['text'] = db_items_df['text'].str.lower().apply(strip_accents)
db_items_df['title'] = db_items_df['title'].str.lower().apply(strip_accents)
db_items_df

Unnamed: 0_level_0,title,text
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
61fc3a9aec14b5022a30fc16,meetra surik,{{top|sprot|ffa|canon=surik}}\n{{otheruses|sur...
61fc3a9aec14b5022a30fc9a,firmus piett,{{top}}\n{{otheruses|title=firmus}}\n{{cleanup...
61fc3a9aec14b5022a30fcff,leia organa solo,{{top|ffa|sprot|canon=leia skywalker organa so...
61fc3a9aec14b5022a30fd2a,kalpana (supreme chancellor),{{top|canon=skor kalpana}}\n{{otheruses|kalpan...
61fc3a9aec14b5022a30fdc7,human replica droid,{{top|canon=replica droid}}\n{{droid_series\n|...
...,...,...
61fc3ab7ec14b5022a3988f7,tal bota,{{top}}\n{{character\n|type=jedi\n|image=[[fil...
61fc3ab7ec14b5022a3988f8,obratuk glii,{{top}}\n{{character\n|type=jedi\n|image=[[fil...
61fc3ab7ec14b5022a398a37,ishnar ti-kharatal,{{top}}\n{{character\n|type=hutt cartel\n|imag...
61fc3ab7ec14b5022a398a89,kalaxo,{{top}}\n{{image}}\n{{character\n|type=galacti...


In [213]:
VOCAB = {
    'HAN': 'Han Solo',
    'KYLO REN': 'Ben Solo',
    'REY': 'Rey Skywalker',
    'EMPEROR': 'Darth Sidious',
    'DARTH VADER': 'Anakin Skywalker',
    'FODE/BEED': 'Fodesinbeed Annodue',
}

In [214]:
import Levenshtein

def distance(a, b):
    return Levenshtein.jaro_winkler(a.lower(), b.lower())

def find_match(title):
    title = title.lower().replace('count', '').replace('senator', '')\
        .replace('captain', '').replace('admiral', '').replace('general', '')\
        .replace('darth', '').replace('colonel', '').replace('clone', '')

    candidates = db_items_df[db_items_df['title'].apply(lambda x: title in x.lower())]

    # if len(candidates) == 0:
    #     candidate_idx = db_items_df['text'].apply(lambda x: x.count(title.lower())).argsort().head(10)
    #     candidates = db_items_df.iloc[candidate_idx]

    if len(candidates) == 0:
        candidates = db_items_df

    idx = candidates['title'].apply(lambda x: distance(x, title)).argmax()
    match = candidates.iloc[idx]

    return match


In [215]:
names_df = df_nodes_tmp.toPandas()
node_details = []
for index, row in names_df.iterrows():
    name = row['name'] if row['name'] not in VOCAB else VOCAB[row['name']]
    match = find_match(name)
    node_details.append({
        **row.to_dict(),
        'match_title': match['title'],
        'match_id': str(match.name)
    })

node_details_df = pd.DataFrame(node_details)

In [216]:
node_details_df

Unnamed: 0,name,id,match_title,match_id
0,C-3PO,0,c-3po,61fc3aabec14b5022a34cab1
1,JERJERROD,1,jax jerjerrod,61fc3ab2ec14b5022a3744a9
2,BERU,2,beru whitesun lars,61fc3aacec14b5022a356134
3,LANDO,3,landonis balthazar calrissian,61fc3aabec14b5022a34cabd
4,CAMIE,4,camie marstrap,61fc3aadec14b5022a35e55f
...,...,...,...,...
108,TEY HOW,108,tey how,61fc3aacec14b5022a352290
109,WALD,109,wald,61fc3aafec14b5022a369bb2
110,VALORUM,110,tarsus valorum,61fc3ab5ec14b5022a3882d7
111,TION MEDON,111,tion medon,61fc3aaeec14b5022a3664bc


In [246]:
from bson.objectid import ObjectId
import numpy as np

node_props = []
for item in node_details:
    data = collection.find_one({'_id': ObjectId(item['match_id'])}, {'properties': 1})
    node_props.append({
        'id': item['id'],
        **data['properties']
    })

node_details_df = pd.DataFrame(node_props)
node_details_df.drop(columns=['1', '2', 'kajidic', 'clan', 'armament', 'plating', 'sensor', 'width', 'length', 'cost', 'line', 'manufacturer', 'creator', 'model', 'class'], inplace=True)
# node_details_df = node_details_df.fillna(value=None, inplace=True)
node_details_df = node_details_df.replace('', None)
node_details_df['height'] = node_details_df['height'].apply(lambda x: x.split()[0] if isinstance(x, str) else x)
node_details_df['mass'] = node_details_df['mass'].apply(lambda x: x.split()[0] if isinstance(x, str) else x)
node_details_df = node_details_df.astype(object).where(pd.notnull(node_details_df), None)
node_details_df

Unnamed: 0,id,image,name,homeworld,birth,death,height,mass,gender,equipment,affiliation,is_droid,type,species,hair,eyes,skin,cyber,masters,apprentices
0,0,File:C-3PO_TLJ_Card_Trader_Award_Card.png,C-3PO,Tatooine,Prior to 32 BBYStar Wars: Galactic Atlas,"3 ABY, Bespin",1.77,75,Masculine programming,TranLang III communication module,"Skywalker family,Confederacy of Independent Sy...",True,,,,,,,,
1,1,File:Jax_Jerjerrod.png,Jax Jerjerrod,Tinnel IVJoin the Resistance: Attack on Starki...,Prior to 32 BBYStar Wars: Galactic Atlas,"3 ABY, Bespin",1.77,75,Male,,"Jerjerrod family,First Order",False,First Order,HumanJoin the Resistance,,,,,,
2,2,File:BeruCardTrader.png,Beru Whitesun Lars,Tatooine,Prior to 32 BBYStar Wars: Galactic Atlas,"0 BBY,Star Wars: Galactic Atlas Tatooine",1.65,75,Female,,Lars family,False,First Order,Human,Brown,Blue,Light,,,
3,3,File:LandoCalrissian-TROSOCE.png,Landonis Balthazar Calrissian,SocorroUltimate Star Wars,"c. 43 BBY,Star Wars: The Rise of Skywalker: Th...","0 BBY,Star Wars: Galactic Atlas Tatooine",1.77,79,Male,,"Calrissian family,Crimson DawnSolo: A Star War...",False,Resistance,Human,Black,Brown,Dark,,,
4,4,File:CamieMarstrap-BoBFCh2.png,Camie Marstrap,Tatooine,"c. 43 BBY,Star Wars: The Rise of Skywalker: Th...","0 BBY,Star Wars: Galactic Atlas Tatooine",1.77,79,Female,,"Calrissian family,Crimson DawnSolo: A Star War...",False,Resistance,Human,Brown,Blue,Fair,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,108,File:Tey_How.png,Tey How,Naboo,"AlderaanStar Wars: Character Encyclopedia, Upd...",32 BBY;Star Wars: Galactic Atlas dates the eve...,2.24,113,Female,,Trade Federation,False,Corporate,Neimoidian,Brown,Orange,Mottled green,Right arm,Darth Sidious,UnidentifiedStar Wars Insider Special Edition ...
109,109,File:WaldFull-SWE.png,Wald,Tatooine,38 BBY,32 BBY;Star Wars: Galactic Atlas dates the eve...,0.69,113,Male,,Trade Federation,False,Corporate,Rodian,Brown,Black,Green,Right arm,Darth Sidious,UnidentifiedStar Wars Insider Special Edition ...
110,110,File:WaldFull-SWE.png,Tarsus Valorum,Tatooine,38 BBY,32 BBY;Star Wars: Galactic Atlas dates the eve...,0.69,113,Male,,"House ValorumRise of the Separatists,Galactic ...",False,Galactic Republic,Rodian,Brown,Black,Green,Right arm,Darth Sidious,UnidentifiedStar Wars Insider Special Edition ...
111,111,File:Tion_Medon.jpg,Tion Medon,UtapauUltimate Star Wars,38 BBY,32 BBY;Star Wars: Galactic Atlas dates the eve...,2.06,113,Male,,Utapaun CommitteeAccording to Tion Medon is a...,False,Galactic Republic,Pau'an,Brown,Black,Gray,Right arm,Darth Sidious,UnidentifiedStar Wars Insider Special Edition ...


In [262]:
df_nodes_feat = spark.createDataFrame(node_details_df)
df_nodes_feat = (
    df_nodes_feat
        .withColumn('height', F.col('height').cast('double'))
        .withColumn('mass', F.col('mass').cast('double'))
        .withColumn('full_name', F.col('name'))
        .drop('image', 'name')
)
df_nodes_feat.head(5)

[Row(id=0, homeworld='Tatooine', birth='Prior to 32 BBYStar Wars: Galactic Atlas', death='3 ABY, Bespin ', height=1.77, mass=75.0, gender='Masculine programming', equipment='TranLang III communication module', affiliation='Skywalker family,Confederacy of Independent Systems,Royal House of Naboo,Galactic Republic,House of Organa,Galactic Empire,Alliance to Restore the Republic,Bright Tree tribe,New RepublicBloodline,ResistanceThe Weapon of a Jedi: A Luke Skywalker Adventure', is_droid=True, type=None, species=None, hair=None, eyes=None, skin=None, cyber=None, masters=None, apprentices=None, full_name='C-3PO'),
 Row(id=1, homeworld='Tinnel IVJoin the Resistance: Attack on Starkiller Base', birth='Prior to 32 BBYStar Wars: Galactic Atlas', death='3 ABY, Bespin ', height=1.77, mass=75.0, gender='Male', equipment=None, affiliation='Jerjerrod family,First Order', is_droid=False, type='First Order', species='HumanJoin the Resistance', hair=None, eyes=None, skin=None, cyber=None, masters=None,

In [263]:
df_nodes = (
    df_nodes_tmp
        .join(df_nodes_feat, on='id', how='left')
)
df_nodes.head(5)

[Row(id=0, name='C-3PO', homeworld='Tatooine', birth='Prior to 32 BBYStar Wars: Galactic Atlas', death='3 ABY, Bespin ', height=1.77, mass=75.0, gender='Masculine programming', equipment='TranLang III communication module', affiliation='Skywalker family,Confederacy of Independent Systems,Royal House of Naboo,Galactic Republic,House of Organa,Galactic Empire,Alliance to Restore the Republic,Bright Tree tribe,New RepublicBloodline,ResistanceThe Weapon of a Jedi: A Luke Skywalker Adventure', is_droid=True, type=None, species=None, hair=None, eyes=None, skin=None, cyber=None, masters=None, apprentices=None, full_name='C-3PO'),
 Row(id=1, name='JERJERROD', homeworld='Tinnel IVJoin the Resistance: Attack on Starkiller Base', birth='Prior to 32 BBYStar Wars: Galactic Atlas', death='3 ABY, Bespin ', height=1.77, mass=75.0, gender='Male', equipment=None, affiliation='Jerjerrod family,First Order', is_droid=False, type='First Order', species='HumanJoin the Resistance', hair=None, eyes=None, skin

In [264]:
df_nodes.write.parquet(DATASET.processed_str('nodes_Character'), mode='overwrite')

df_edges_interactions.write.parquet(DATASET.processed_str('edges_INTERACTIONS'), mode='overwrite')
df_edges_mentions.write.parquet(DATASET.processed_str('edges_MENTIONS'), mode='overwrite')

                                                                                

In [265]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema('Character', NodeSchema.from_spark(df_nodes.schema, label='name'))
        .add_edge_schema('INTERACTIONS', EdgeSchema.from_spark(df_edges_interactions.schema, source_type='Character', target_type='Character', directed=False, timestamp='time', interaction=True))
        .add_edge_schema('MENTIONS', EdgeSchema.from_spark(df_edges_mentions.schema, source_type='Character', target_type='Character', directed=True, timestamp='time', interaction=True))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/data/pella/projects/University/Thesis/Thesis/code/storage/datasets/processed/star-wars'), nodes={'Character': NodeSchema(_type='Character', _schema=..., label='name', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'homeworld': GraphProperty(_name='homeworld', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'birth': GraphProperty(_name='birth', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'death': GraphProperty(_name='death', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'height': GraphProperty(_name='height', dtype=DType(atomic=<DTypeAtomic.FLOAT: 'float'>, array=False)), 'mass': GraphProperty(_name='mass', dtype=DType(atomic=<DTypeAtomic.FLOAT: 'float'>, array=False)), 'gender': GraphProperty(_name='gender', dtype=DType(atomic=<DTypeAtomic.S