In [127]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [128]:
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [129]:
from shared.paths import DatasetPath

DS = DatasetPath('star-wars')

In [130]:
spark = (
    SparkSession.builder
        .appName(f'{DS}')
        .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
        .config("spark.executor.memory", "8g")
        .config("spark.driver.memory", "8g")
        .config("spark.memory.offHeap.enabled", True)
        .config("spark.memory.offHeap.size", "16g")
        .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.13.0")
        .getOrCreate()
)

# Load Graph

In [131]:
import json

all_edges = []
for file in DS.raw().glob('starwars-episode-*ns.json'):
    _, _, episode, link_type = file.stem.split('-')
    with file.open('r') as f:
        data = json.load(f)

    nodes = list(data['nodes'])
    edges = data['links']
    for e in edges:
        all_edges.append({
            'source': nodes[e['source']]['name'],
            'target': nodes[e['target']]['name'],
            'timestamp_from': int(episode),
            'weight': e['value'],
            'type': link_type
        })

In [132]:
df = spark.createDataFrame(all_edges)
df.head(5)

[Row(source='CAMIE', target='LUKE', timestamp_from=4, type='interactions', weight=2),
 Row(source='BIGGS', target='CAMIE', timestamp_from=4, type='interactions', weight=2),
 Row(source='BIGGS', target='LUKE', timestamp_from=4, type='interactions', weight=4),
 Row(source='DARTH VADER', target='LEIA', timestamp_from=4, type='interactions', weight=1),
 Row(source='BERU', target='LUKE', timestamp_from=4, type='interactions', weight=3)]

In [133]:
df_raw_nodes = (
    df
        .select(F.col('source').alias('name'))
        .union(df.select(F.col('target').alias('name')))
        .distinct()
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_raw_nodes.count())
df_raw_nodes.head(5)

113


[Row(name='C-3PO', id=0),
 Row(name='JERJERROD', id=1),
 Row(name='BERU', id=2),
 Row(name='LANDO', id=3),
 Row(name='CAMIE', id=4)]

In [134]:
df_raw_edges = (
    df.join(df_raw_nodes.alias('s'), df.source == F.col('s.name'), 'left')
        .join(df_raw_nodes.alias('t'), df.target ==  F.col('t.name'), 'left')
        .withColumn('src', F.col('s.id'))
        .withColumn('dst', F.col('t.id'))
        .select('timestamp_from', 'type', 'src', 'dst', 'weight')
        .dropDuplicates(['timestamp_from', 'src', 'dst', 'type'])
)
print(df_raw_edges.count())
df_raw_edges.head(5)

1599


[Row(timestamp_from=2, type='mentions', src=25, dst=24, weight=24),
 Row(timestamp_from=2, type='mentions', src=25, dst=91, weight=1),
 Row(timestamp_from=3, type='mentions', src=25, dst=11, weight=2),
 Row(timestamp_from=3, type='mentions', src=39, dst=22, weight=1),
 Row(timestamp_from=4, type='mentions', src=8, dst=24, weight=36)]

In [135]:
df_edges_interactions = (
    df_raw_edges.filter(F.col('type') == 'interactions')
        .drop('type')
)
print(df_edges_interactions.count())
df_edges_interactions.head(5)

479


[Row(timestamp_from=1, src=24, dst=45, weight=1),
 Row(timestamp_from=4, src=0, dst=28, weight=2),
 Row(timestamp_from=1, src=25, dst=77, weight=2),
 Row(timestamp_from=2, src=36, dst=99, weight=2),
 Row(timestamp_from=5, src=8, dst=105, weight=1)]

In [136]:
df_edges_mentions = (
    df_raw_edges.filter(F.col('type') == 'mentions')
        .drop('type')
        .distinct()
)
print(df_edges_mentions.count())
df_edges_mentions.head(5)

1120


[Row(timestamp_from=3, src=12, dst=86, weight=1),
 Row(timestamp_from=7, src=47, dst=53, weight=1),
 Row(timestamp_from=3, src=84, dst=27, weight=6),
 Row(timestamp_from=2, src=21, dst=99, weight=2),
 Row(timestamp_from=4, src=12, dst=8, weight=28)]

# Load WikiData Features

In [137]:
wiki_df = (
    spark.read.format('xml')
        .option("rowTag", "page")
        .load(str(DS.raw('starwars_pages_current.xml')))
        .select('id', 'title', F.col('revision.text._VALUE').alias('text'))
).cache()
wiki_df.head(5)

                                                                                

[Row(id=1, title='File:Wiki.png', text='{{Top|fprot|uprot}}\n==Summary==\n{{Information\n|attention=\n|description=The logo of [[Wookieepedia]]. Cropped and modified from a picture of the [[Death Star II/Legends|second Death Star]].\n|source=Take a guess.\n|artist=*[[Tracy Duncan]] provided original design\n*[[User:Jaden Kenobi|Tyber]] provided higher-resolution version\n|filespecs=\n|licensing={{GFDL}}\n{{Cc-by-sa|3.0}}\n|other versions=[[:File:Wiki-shrinkable.png]]\n|cat artist=skip\n|cat licensee=skip\n|cat subject=skip\n|cat type=[[Category:Wookieepedia icons]]\n}}'),
 Row(id=2, title='User:LouCypher/monobook.css', text='@import url("http://www.wikicities.com/index.php?title=User:LouCypher/monobook.css&action=raw&ctype=text/css");'),
 Row(id=4, title='Template:GFDL', text='{| id="gfdl" class="darkbackground1 messagebox noprint" align="center" border="0" cellpadding="4" cellspacing="4" style="border: 1px solid #CC9; background-color: #F1F1DE"\n|-\n| [[File:Heckert_gnu.svg|70px|cente

In [138]:
wiki_chars_df = (
    wiki_df
        .filter(~F.col('title').rlike('^(Talk:|File:|User:|Forum:)'))
        .filter(F.col('text').rlike('\{\{(Character|Droid)'))
        .filter(~F.col('title').rlike('\/Legends$'))
        .filter(~F.col('text').rlike('^\{\{Top\|leg\}\}'))
        .filter(~F.col('text').rlike('\{\{Noncanon\|'))
        .filter(~F.col('title').rlike('lightsaber'))
        .filter(~F.col('text').rlike('\{\{Top\|.*real.*\}\}'))
        .filter(
            ~(
                F.col('text').rlike('\{\{Top\|.*leg.*\}\}') &
                ~F.col('text').rlike('\{\{Top\|.*legends=.*\}\}') &
                ~F.col('text').rlike('\{\{Top\|.*canon.*\}\}')
            )
        )
        .drop_duplicates(['id'])
).cache()
wiki_chars_df.head(5)

                                                                                

[Row(id=233176, title='Emergency beacon', text='{{Top|canon=Distress beacon}}\n[[File:EmergencyBeacon.jpg|thumb|right|250px|An emergency beacon]]\n{{Quote|Have you located the \'\'source\'\' of the \'\'distress beacon\'\'?\'\'"<br />"\'\'It appears to be an \'\'Imperial escape pod\'\', your \'\'Highness\'\'. \'\'One\'\' lifeform on board.|[[Leia Organa Solo|Leia Organa]] and [[Jan Dodonna/Legends|Jan Dodonna]]|Leia\'s Trust}}\nAn \'\'\'emergency beacon\'\'\' or \'\'\'distress beacon\'\'\' was a type of [[Homing beacon/Legends|homing beacon]] that was designed for use in emergency situations to summon or locate aid. Alliance operatives used them when they were in trouble, signaling other members of the Alliance during times of distress. Sometime before the [[Battle of Endor/Legends|Battle of Endor]], when [[Prince/Legends|Princess]] [[Leia Organa Solo|Leia Organa]] was preparing for a diplomatic mission to the [[Planet/Legends|planet]] of [[Yinchorr/Legends|Yinchorr]], [[Luke Skywalker/

## Data Matching

In [139]:
import unicodedata
import pandas as pd

In [140]:
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

df_wiki_docs = wiki_chars_df.toPandas().set_index("id")
# db_items_df['text'] = db_items_df['text'].str.lower().apply(strip_accents)
df_wiki_docs['title'] = df_wiki_docs['title'].str.lower().apply(strip_accents)
df_wiki_docs

Unnamed: 0_level_0,title,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
233176,emergency beacon,{{Top|canon=Distress beacon}}\n[[File:Emergenc...
255354,housekeeping droid,{{Top|canon=Housekeeper droid}}\n'''Housekeepi...
318541,99,{{Top|canon=Clone 99}}\n{{Character\n|type=Gal...
451699,breha organa,{{Top}}\n{{Twoconflicting|''[[Queen's Shadow]]...
453212,braylen stramm,{{Top}}\n{{Character\n|type=New Republic\n|ima...
...,...,...
683058,nan's father,{{Top}}\n{{Conjecture}}\n{{Character\n|type=Ni...
685347,ghordansk,{{Top}}\n{{Character\n|type=New Republic\n|ima...
686305,unidentified kajain'sa'nikto's ally 2,{{Top}}\n{{Conjecture}}\n{{Character\n|type=Bo...
695853,unidentified co,{{Top}}\n{{Conjecture}}\n{{Character\n|type=Ga...


In [141]:
import Levenshtein

VOCAB = {
    'HAN': 'Han Solo',
    'KYLO REN': 'Ben Solo',
    'REY': 'Rey Skywalker',
    'EMPEROR': 'Darh Sidious',
    'DARTH VADER': 'Anakin Skywalker',
    'FODE/BEED': 'Fodesinbeed Annodue',
    'JERJERROD': 'tiaan jerjerrod',
}


def distance(a, b):
    return Levenshtein.jaro_winkler(a.lower(), b.lower())

def find_match(title):
    title = title.lower().replace('count', '').replace('senator', '')\
        .replace('captain', '').replace('admiral', '').replace('general', '')\
        .replace('darth', '').replace('colonel', '').replace('clone', '')

    candidates = df_wiki_docs[df_wiki_docs['title'].apply(lambda x: title in x.lower())]
    if len(candidates) == 0:
        candidates = df_wiki_docs

    idx = candidates['title'].apply(lambda x: distance(x, title)).argmax()
    match = candidates.iloc[idx]

    return match

In [142]:
names_df = df_raw_nodes.toPandas()
node_details = []
for index, row in names_df.iterrows():
    name = row['name'] if row['name'] not in VOCAB else VOCAB[row['name']]
    match = find_match(name)
    node_details.append({
        **row.to_dict(),
        'match_title': match['title'],
        'match_id': str(match.name),
        'text': match['text'],
    })

df_node_matches = pd.DataFrame(node_details).set_index('id')
df_node_matches

Unnamed: 0_level_0,name,match_title,match_id,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,C-3PO,c-3po,452947,"{{Top}}\n{{Doom}}\n{{Update|[[Trouble Again]],..."
1,JERJERROD,tiaan jerjerrod,269385,{{Top}}\n{{Youmay|the [[Galactic Empire|Imperi...
2,BERU,beru whitesun lars,457342,{{Top}}\n{{Update|[[Star Wars (LINE Webtoon)]]...
3,LANDO,landonis balthazar calrissian,452964,{{Top|legends=Lando Calrissian/Legends}}\n{{Rh...
4,CAMIE,camie marstrap,512525,{{Top|legends=Camie Loneozner}}\n{{Spoiler|Cha...
...,...,...,...,...
108,TEY HOW,tey how,498826,{{Top|ga|audio=https://www.youtube.com/watch?v...
109,WALD,wald,475139,{{Top|legends=W. Wald}}\n{{Youmay|the Rodian|t...
110,VALORUM,tarsus valorum,625117,{{Top}}\n{{Character\n|type=Galactic Republic\...
111,TION MEDON,tion medon,473370,"{{Top}}\n{{Update|[[Force Collector]], [[Star ..."


## Feature Extraction

In [143]:
import wikitextparser as wtp
from lxml import html

def select_character(parsed_item):
    for template in parsed_item.templates:
        if template.name.startswith('Character') or template.name.startswith('Droid'):
            return template
    return None

def clean_value(value):
    tree = html.fromstring(f'<span>{value}</span>')
    refs = tree.find('ref')
    while refs is not None:
        refs.drop_tree()
        refs = tree.find('ref')

    value = tree.text_content().strip()

    value = wtp.remove_markup(value)

    parsed = wtp.parse(value)
    if parsed.get_lists():
        value = '|'.join(map(lambda x: x.strip(), parsed.get_lists()[0].items))

    return value.strip()

def extract_args(template):
    args = {}
    for arg in getattr(template, 'arguments', []):
        args[arg.name.strip()] = wtp.remove_markup(clean_value(arg.value.strip()))
    return args

def extract_props(text):
    parsed = wtp.parse(text)

    character = select_character(parsed)
    properties = extract_args(character)
    properties['is_droid'] = character and character.name.startswith('Droid')
    return properties

In [144]:
node_props = []
for (id, row) in df_node_matches.iterrows():
    node_props.append({
        'id': id,
        **extract_props(row['text']),
    })

df_raw_node_feats = pd.DataFrame(node_props).set_index('id')
df_raw_node_feats

Unnamed: 0_level_0,image,name,homeworld,birth,death,creator,manufacturer,line,model,class,...,species,hair,eyes,skin,cyber,masters,apprentices,clan,1,2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,File:C-3PO_TLJ_Card_Trader_Award_Card.png,C-3PO,Tatooine,Prior to 32 BBY,"3 ABY, Bespin",Anakin Skywalker,Cybot Galactica,,3PO-series protocol droid,Protocol droid,...,,,,,,,,,,
1,File:MoffJerjerrod-SWI187.png,Tiaan Jerjerrod,Tinnel IV,35 BBY,"4 ABY, DS-2 Death Star II Mobile Battle Statio...",,,,,,...,Human,Brown,Green,Light,,,,,,
2,File:BeruCardTrader.png,Beru Whitesun Lars,Tatooine,,"0 BBY, Tatooine",,,,,,...,Human,Brown,Blue,Light,,,,,,
3,File:LandoCalrissian-TROSOCE.png,Landonis Balthazar Calrissian,Socorro,"c. 43 BBY, Socorro",,,,,,,...,Human,Black,Brown,Dark,,,,,,
4,File:CamieMarstrap-BoBFCh2.png,Camie Marstrap,Tatooine,,,,,,,,...,Human,Brown,Blue,Fair,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,File:Tey_How.png,Tey How,,,"32 BBY; Vuutun Palaa, Naboo system",,,,,,...,Neimoidian,,,Mottled green,,,,,,
109,File:WaldFull-SWE.png,Wald,Tatooine,38 BBY,,,,,,,...,Rodian,,Black,Green,,,,,,
110,,Tarsus Valorum,,,,,,,,,...,,,,,,,,,,
111,File:Tion_Medon.jpg,Tion Medon,Utapau,,,,,,,,...,Pau'an,,Black,Gray,,,,,,


In [145]:
import numpy as np

def strip_tokens(x):
    if not x or (isinstance(x, float) and np.isnan(x)):
        return []
    return [str(e).strip() for e in x]

df_raw_node_feats.drop(columns=['1', '2', 'clan', 'armament', 'plating', 'sensor', 'width', 'length', 'cost', 'line', 'manufacturer', 'creator', 'model', 'class', 'image'], inplace=True)
df_cleaned_node_feats = df_raw_node_feats.replace('', np.nan)
df_cleaned_node_feats['hair'] = df_cleaned_node_feats['hair'].str.split(',|;').apply(strip_tokens)
df_cleaned_node_feats['eyes'] = df_cleaned_node_feats['skin'].str.split(',|;').apply(strip_tokens)
df_cleaned_node_feats['cyber'] = df_cleaned_node_feats['cyber'].str.split(',|;').apply(strip_tokens)
df_cleaned_node_feats['skin'] = df_cleaned_node_feats['skin'].str.split(',|;').apply(strip_tokens)
df_cleaned_node_feats['masters'] = df_cleaned_node_feats['masters'].str.split(r'\|').apply(strip_tokens)
df_cleaned_node_feats['apprentices'] = df_cleaned_node_feats['apprentices'].str.split(r'\|').apply(strip_tokens)
df_cleaned_node_feats['affiliation'] = df_cleaned_node_feats['affiliation'].str.split(r'\|').apply(strip_tokens)
# node_details_df['height'] = node_details_df['height'].apply(lambda x: x.split()[0] if x and isinstance(x, str) else x)
# node_details_df['mass'] = node_details_df['mass'].apply(lambda x: x.split()[0] if x and isinstance(x, str) else x)
df_cleaned_node_feats = df_cleaned_node_feats.astype(object).where(pd.notnull(df_cleaned_node_feats), None)

df_cleaned_node_feats['homeworld'] = df_cleaned_node_feats['homeworld'].fillna('Unknown')
df_cleaned_node_feats['gender'] = df_cleaned_node_feats['gender'].fillna('Unknown')
df_cleaned_node_feats['species'] = df_cleaned_node_feats['species'].fillna('Unknown')
df_cleaned_node_feats['type'] = df_cleaned_node_feats['type'].fillna('Unknown')

df_cleaned_node_feats

Unnamed: 0_level_0,name,homeworld,birth,death,height,mass,gender,equipment,affiliation,is_droid,type,species,hair,eyes,skin,cyber,masters,apprentices
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,C-3PO,Tatooine,Prior to 32 BBY,"3 ABY, Bespin",1.77 meters,75 kilograms,Masculine programming,TranLang III communication module,"[Skywalker family, Confederacy of Independent ...",True,Unknown,Unknown,[],[],[],[],[],[]
1,Tiaan Jerjerrod,Tinnel IV,35 BBY,"4 ABY, DS-2 Death Star II Mobile Battle Statio...",1.73 meters,75 kilograms,Male,,"[Taung & Zhell Society, Corellian Engineering ...",False,Galactic Empire,Human,[Brown],[Light],[Light],[],[],[]
2,Beru Whitesun Lars,Tatooine,,"0 BBY, Tatooine",1.65 meters,,Female,,[Lars family],False,Unknown,Human,[Brown],[Light],[Light],[],[],[]
3,Landonis Balthazar Calrissian,Socorro,"c. 43 BBY, Socorro",,1.77 meters (5ft 10in),79 kilograms,Male,,"[Calrissian family, Crimson Dawn, Alliance to ...",False,Resistance,Human,[Black],[Dark],[Dark],[],[],[]
4,Camie Marstrap,Tatooine,,,,,Female,,[],False,Unknown,Human,[Brown],[Fair],[Fair],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,Tey How,Unknown,,"32 BBY; Vuutun Palaa, Naboo system",,,Female,,[Trade Federation],False,Corporate,Neimoidian,[],[Mottled green],[Mottled green],[],[],[]
109,Wald,Tatooine,38 BBY,,0.69 meters,,Male,,[],False,Unknown,Rodian,[],[Green],[Green],[],[],[]
110,Tarsus Valorum,Unknown,,,,,Unknown,,"[House Valorum, Galactic Republic]",False,Galactic Republic,Unknown,[],[],[],[],[],[]
111,Tion Medon,Utapau,,,2.06 meters,,Male,,"[Utapaun Committee, Galactic Republic]",False,Galactic Republic,Pau'an,[],[Gray],[Gray],[],[],[]


In [146]:
from datasets.utils.feature_transform import MultiRareLabelEncoder, MOneHotEncoder

mrare_encoder = MultiRareLabelEncoder(tol=0.05, n_categories=8)
mfeature_df = mrare_encoder.fit_transform(df_cleaned_node_feats[['affiliation']])

oh_encoder = MOneHotEncoder()
mfeature_df = oh_encoder.fit_transform(mfeature_df[['affiliation']]).add_prefix('feat_')

mfeature_df

Unnamed: 0_level_0,feat_affiliation_Jedi Order,feat_affiliation_Galactic Empire,feat_affiliation_New Republic,feat_affiliation_Galactic Republic,feat_affiliation_Resistance,feat_affiliation_Alliance to Restore the Republic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,1,1,1,1,1
1,0,1,0,0,0,0
2,0,0,0,0,0,0
3,0,0,1,0,1,1
4,0,0,0,0,0,0
...,...,...,...,...,...,...
108,0,0,0,0,0,0
109,0,0,0,0,0,0
110,0,0,0,1,0,0
111,0,0,0,1,0,0


In [147]:
from feature_engine.encoding import RareLabelEncoder, OneHotEncoder

rare_encoder = RareLabelEncoder(tol=0.05, n_categories=6)
feature_df = rare_encoder.fit_transform(df_cleaned_node_feats[['homeworld', 'gender', 'species', 'type']])

oh_encoder = OneHotEncoder()
feature_df = oh_encoder.fit_transform(feature_df[['homeworld', 'gender', 'species', 'type']]).add_prefix('feat_')

feature_df['feat_species_Droid'] = df_cleaned_node_feats['is_droid'].apply(lambda x: 1 if x else 0)
feature_df['feat_hasMaster'] = df_cleaned_node_feats['masters'].apply(lambda x: 1 if len(x) else 0)
feature_df['feat_hasApprentices'] = df_cleaned_node_feats['apprentices'].apply(lambda x: 1 if len(x) else 0)
feature_df['feat_hasCyber'] = df_cleaned_node_feats['cyber'].apply(lambda x: 1 if len(x) else 0)

feature_df



Unnamed: 0_level_0,feat_homeworld_Tatooine,feat_homeworld_Rare,feat_homeworld_Naboo,feat_homeworld_Alderaan,feat_homeworld_Unknown,feat_gender_Masculine programming,feat_gender_Male,feat_gender_Female,feat_gender_Unknown,feat_gender_Feminine programming,...,feat_type_Resistance,feat_type_Rare,feat_type_Jedi,feat_type_Criminal,feat_type_Galactic Republic,feat_type_Rebel,feat_species_Droid,feat_hasMaster,feat_hasApprentices,feat_hasCyber
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
109,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
111,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [148]:
node_details_and_features_df = df_node_matches[['name']].join(feature_df).join(mfeature_df)
node_details_and_features_df.columns = [c.replace(' ', '') for c in node_details_and_features_df.columns]
node_details_and_features_df

Unnamed: 0_level_0,name,feat_homeworld_Tatooine,feat_homeworld_Rare,feat_homeworld_Naboo,feat_homeworld_Alderaan,feat_homeworld_Unknown,feat_gender_Masculineprogramming,feat_gender_Male,feat_gender_Female,feat_gender_Unknown,...,feat_species_Droid,feat_hasMaster,feat_hasApprentices,feat_hasCyber,feat_affiliation_JediOrder,feat_affiliation_GalacticEmpire,feat_affiliation_NewRepublic,feat_affiliation_GalacticRepublic,feat_affiliation_Resistance,feat_affiliation_AlliancetoRestoretheRepublic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,C-3PO,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,1,1,1,1
1,JERJERROD,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,BERU,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,LANDO,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,1
4,CAMIE,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,TEY HOW,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
109,WALD,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
110,VALORUM,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
111,TION MEDON,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [149]:
df_nodes_feat = spark.createDataFrame(node_details_and_features_df.reset_index())
df_nodes_feat = (
    df_nodes_feat
        .withColumn('full_name', F.col('name'))
        .drop('name')
)
df_nodes_feat.head(5)

[Row(id=0, feat_homeworld_Tatooine=1, feat_homeworld_Rare=0, feat_homeworld_Naboo=0, feat_homeworld_Alderaan=0, feat_homeworld_Unknown=0, feat_gender_Masculineprogramming=1, feat_gender_Male=0, feat_gender_Female=0, feat_gender_Unknown=0, feat_gender_Feminineprogramming=0, feat_species_Unknown=1, feat_species_Human=0, feat_species_Rare=0, feat_type_Unknown=1, feat_type_GalacticEmpire=0, feat_type_Resistance=0, feat_type_Rare=0, feat_type_Jedi=0, feat_type_Criminal=0, feat_type_GalacticRepublic=0, feat_type_Rebel=0, feat_species_Droid=1, feat_hasMaster=0, feat_hasApprentices=0, feat_hasCyber=0, feat_affiliation_JediOrder=0, feat_affiliation_GalacticEmpire=1, feat_affiliation_NewRepublic=1, feat_affiliation_GalacticRepublic=1, feat_affiliation_Resistance=1, feat_affiliation_AlliancetoRestoretheRepublic=1, full_name='C-3PO'),
 Row(id=1, feat_homeworld_Tatooine=0, feat_homeworld_Rare=1, feat_homeworld_Naboo=0, feat_homeworld_Alderaan=0, feat_homeworld_Unknown=0, feat_gender_Masculineprogra

In [150]:
df_nodes = (
    df_raw_nodes
        .join(df_nodes_feat, on='id', how='left')
)
df_nodes.head(5)

[Row(id=0, name='C-3PO', feat_homeworld_Tatooine=1, feat_homeworld_Rare=0, feat_homeworld_Naboo=0, feat_homeworld_Alderaan=0, feat_homeworld_Unknown=0, feat_gender_Masculineprogramming=1, feat_gender_Male=0, feat_gender_Female=0, feat_gender_Unknown=0, feat_gender_Feminineprogramming=0, feat_species_Unknown=1, feat_species_Human=0, feat_species_Rare=0, feat_type_Unknown=1, feat_type_GalacticEmpire=0, feat_type_Resistance=0, feat_type_Rare=0, feat_type_Jedi=0, feat_type_Criminal=0, feat_type_GalacticRepublic=0, feat_type_Rebel=0, feat_species_Droid=1, feat_hasMaster=0, feat_hasApprentices=0, feat_hasCyber=0, feat_affiliation_JediOrder=0, feat_affiliation_GalacticEmpire=1, feat_affiliation_NewRepublic=1, feat_affiliation_GalacticRepublic=1, feat_affiliation_Resistance=1, feat_affiliation_AlliancetoRestoretheRepublic=1, full_name='C-3PO'),
 Row(id=1, name='JERJERROD', feat_homeworld_Tatooine=0, feat_homeworld_Rare=1, feat_homeworld_Naboo=0, feat_homeworld_Alderaan=0, feat_homeworld_Unknow

In [151]:
df_nodes.write.parquet(DS.processed_str('node__Character'), mode='overwrite')

df_edges_interactions.write.parquet(DS.processed_str('edge__Character_INTERACTIONS_Character'), mode='overwrite')
df_edges_mentions.write.parquet(DS.processed_str('edge__Character_MENTIONS_Character'), mode='overwrite')