# Westworld cleanup

This notebook cleans up the raw character data that comes from the Fandom wiki.


## TODO
* [x] clean html
* [x] clean wiki markup
* [x] gender uppercase
* [x] remove unwanted columns
* [x] drop categories and old movie characters
* [ ] standardize categoricals: species, status, seasons, ethnicity, hair, eye

In [111]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', None)

In [112]:
# Read the raw character table and preview its first rows
chars_raw = pd.read_csv(
    filepath_or_buffer='data/characters.csv',
    encoding='utf-8',
)
chars_raw.head(n=5)

Unnamed: 0,id,name,url,links,is_host,is_human,image,aka,status,species,gender,actor,seasons,firstseen,lastseen,ethnicity,hair,eye,death,occupation,deathdate,deathcause,family,age,images,imagecaption,appearedin,height,weight,title,park,creationdate,birth,origin,headquarter,type,language,leadership,founder,eyes
0,7655,Akane,/wiki/Akane,,True,False,{{PAGENAME}}.jpg,"あかね, アカネ",Active,Host,Female,[[Rinko Kikuchi]],[[Season Two]],"""[[Akane No Mai]]""","""[[Phase Space]]""",Japanese,Black,Brown,,,,,,,,,,,,,,,,,,,,,,
1,7681,Akecheta,/wiki/Akecheta,"['Kohana', 'Maeve']",True,False,<gallery>\nAkecheta.png,Ghost <small>(by the [[Homestead Girl]])</smal...,Decommissioned <small>(conscious mind in the [...,Host,Male,[[Zahn McClarnon]],[[Season Two,"""[[Reunion]]""","""[[The Passenger]]""",Native American,Black,Brown,"""[[Kiksuya]]""<br/>""[[The Passenger]]""",Leader of the [[Ghost Nation]]<br/>Tribesman <...,June 2052 <small>(last death)</small>,Repeatedly stabbed in the gut by a guest <smal...,[[Kohana]] <small>(lover)</small>,,,,,,,,,,,,,,,,,
2,3182,Angela,/wiki/Angela,['Wyatt'],True,False,Angela Reunion.jpg,"""Angela the cult member""",Decommissioned,[[Host]],Female,[[Talulah Riley]],[[Season One,"""[[Chestnut]]""","""[[Les Ecorches]]""",British Caucasian,Blonde,Hazel,,"Townswoman, Guest Greeter, Cult Member",,self-destruction by detonating a grenade,,30s,,,,,,,,,,,,,,,,
3,7615,Antoine Costa,/wiki/Antoine_Costa,,False,True,Antoine Costa.png,,Deceased,,Male,[[Fares Fares]],[[Season Two,"""[[Journey_Into_Night_(episode)","""[[The_Passenger",,Black,Brown,,Technician,,Shot repeatedly by [[Dolores Abernathy]] <smal...,,,[[:Category:Images of {{PAGENAME,,,,,,,,,,,,,,,
4,2165,Armistice,/wiki/Armistice,"['Hector Escaton', 'Wyatt']",True,False,<gallery>\n Armistice Akane No Mai.jpg,,Decommissioned,[[Host]],Female,[[Ingrid Bolsø Berdal]],[[Season One,"""[[The Original]]""",,Caucasian American,Blonde,,,,,,,30s,[[:Category:Images of Armistice,,,,,,,,,,,,,,,


---

In [113]:
# Work on a copy so the raw frame stays available for comparison
chars = chars_raw.copy()

# Drop unused columns:
#   image, images, imagecaption: just filenames of images in the wiki
#   appearedin: infrequently used episode count or list of episode names
drop_cols = ['image', 'images', 'imagecaption', 'appearedin']
chars = chars.drop(columns=drop_cols)

# Drop mostly empty columns (fewer than 15 non-null values).
# Only `thresh` is passed: `how` and `thresh` are mutually exclusive, and
# recent pandas versions raise a TypeError when both are given.
chars = chars.dropna(axis=1, thresh=15)


# Patterns private to remove_html, compiled once at definition time.
_HTML_BR_TAG = re.compile(r'<br\s*/?>', re.IGNORECASE)
# Either a self-closing <ref .../> or one <ref ...>...</ref> pair. The body
# match is non-greedy so two separate references on the same value do not
# swallow the text that sits between them.
_HTML_REF_BLOCK = re.compile(
    r'<ref\b[^>]*/>|<ref\b[^>]*>.*?</ref>',
    re.IGNORECASE | re.DOTALL,
)
_HTML_ANY_TAG = re.compile(r'<.*?>')


def remove_html(s):
    """Turn line-break tags into newlines, drop references, strip other tags.

    Parameters
    ----------
    s : str
        Raw cell text that may contain HTML.

    Returns
    -------
    str
        The text with ``<br>``/``<br/>``/``<br />`` replaced by ``'\\n'``,
        each ``<ref>...</ref>`` block (and self-closing ``<ref/>``) removed
        together with its content, and every remaining tag removed while
        keeping the text it wrapped.
    """
    s = _HTML_BR_TAG.sub('\n', s)
    s = _HTML_REF_BLOCK.sub('', s)
    s = _HTML_ANY_TAG.sub('', s)
    return s


def remove_markdown(s):
    """Strip wiki link brackets (``[[`` and ``]]``) and asterisks from ``s``."""
    cleaned = s
    # Applied in order: first the doubled square brackets, then every '*'.
    for pattern in (r'\[{2}|\]{2}', r'\*'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Compile regex for cleaning
html_remove_r = re.compile('<.*?>')
# Non-greedy, so text sitting between two separate <ref>...</ref> blocks is kept
html_reftag = re.compile(r'<ref.*?</ref>')

# Clean all string columns
for col in chars.select_dtypes(include='object'):
    chars[col] = (chars[col]
                  .map(remove_html, na_action='ignore')
                  .map(remove_markdown, na_action='ignore')
                  # Replace every TBA with null. np.nan is passed explicitly:
                  # on some pandas versions replace(..., value=None) with a
                  # scalar target pads from the previous row instead.
                  .replace('TBA', np.nan)
                 )

# Gender to uppercase
chars['gender'] = chars['gender'].str.upper()

# Drop movie characters (names carrying a 19xx year in parentheses).
# na=False keeps the mask strictly boolean even if a name is missing.
is_movie_char = chars['name'].str.contains(r'\(19[0-9]{2}\)', na=False)
chars = chars.drop(index=chars.index[is_movie_char])

# Drop categories ('Category' is matched as a literal substring of the url)
is_category = chars['url'].str.contains('Category', regex=False, na=False)
chars = chars.drop(index=chars.index[is_category])



def fix_species(df):
    """Normalise irregular ``species`` labels on ``df``.

    The ``species`` column of ``df`` is overwritten; nothing is returned.
    Labels not listed below are left unchanged, and 'Unknown' is replaced
    by ``None``.
    """
    replacements = dict([
        ('Human & Host', 'Both'),
        ('Unknown', None),
        ('Host/Simulated', 'Simulation'),
        ('Human/Simulation', 'Simulation'),
    ])
    normalised = df['species'].replace(replacements)
    df['species'] = normalised

# Rewrites chars['species'] in place; the call itself returns nothing
fix_species(df=chars)

In [114]:
# Preview the first rows after cleaning
chars.head(n=5)

Unnamed: 0,id,name,url,links,is_host,is_human,aka,status,species,gender,actor,seasons,firstseen,lastseen,ethnicity,hair,eye,death,occupation,deathcause,family,age
0,7655,Akane,/wiki/Akane,,True,False,"あかね, アカネ",Active,Host,FEMALE,Rinko Kikuchi,Season Two,"""Akane No Mai""","""Phase Space""",Japanese,Black,Brown,,,,,
1,7681,Akecheta,/wiki/Akecheta,"['Kohana', 'Maeve']",True,False,Ghost (by the Homestead Girl)\nAke (by Kohana),Decommissioned (conscious mind in the Valley B...,Host,MALE,Zahn McClarnon,Season Two,"""Reunion""","""The Passenger""",Native American,Black,Brown,"""Kiksuya""\n""The Passenger""",Leader of the Ghost Nation\nTribesman (formerl...,Repeatedly stabbed in the gut by a guest (repa...,Kohana (lover),
2,3182,Angela,/wiki/Angela,['Wyatt'],True,False,"""Angela the cult member""",Decommissioned,Host,FEMALE,Talulah Riley,Season One,"""Chestnut""","""Les Ecorches""",British Caucasian,Blonde,Hazel,,"Townswoman, Guest Greeter, Cult Member",self-destruction by detonating a grenade,,30s
3,7615,Antoine Costa,/wiki/Antoine_Costa,,False,True,,Deceased,,MALE,Fares Fares,Season Two,"""Journey_Into_Night_(episode)","""The_Passenger",,Black,Brown,,Technician,Shot repeatedly by Dolores Abernathy (as Charl...,,
4,2165,Armistice,/wiki/Armistice,"['Hector Escaton', 'Wyatt']",True,False,,Decommissioned,Host,FEMALE,Ingrid Bolsø Berdal,Season One,"""The Original""",,Caucasian American,Blonde,,,,,,30s


In [115]:
# Non-null values per column, least populated first
chars.count().sort_values()

death          15
aka            21
family         28
deathcause     30
links          33
eye            34
hair           59
age            67
occupation     67
ethnicity      76
lastseen       77
species       105
seasons       112
status        115
gender        115
firstseen     115
actor         116
is_host       160
url           160
name          160
is_human      160
id            160
dtype: int64

In [116]:
# Frequency of each distinct seasons value
chars['seasons'].value_counts()

Season One                62
Season Three              24
Season Two                23
Season One, Season Two     3
Name: seasons, dtype: int64

In [117]:
# Rows whose seasons value lists more than one season (contains a comma).
# na=False treats missing seasons as "no match", so there is no need to
# fill every column of the frame with '' just to build the mask.
chars.loc[chars['seasons'].str.contains(',', regex=False, na=False)]

Unnamed: 0,id,name,url,links,is_host,is_human,aka,status,species,gender,actor,seasons,firstseen,lastseen,ethnicity,hair,eye,death,occupation,deathcause,family,age
11,3552,Barkeep (Las Mudas),/wiki/Barkeep_(Las_Mudas),,True,False,,Decommissioned,Host,MALE,Price Carson,"Season One, Season Two","""Chestnut""","""The Riddle of the Sphinx""",,,,,Barkeep,,,50s
68,7816,Gold Miner Host,/wiki/Gold_Miner_Host,,True,False,,Active,Host,MALE,Micah Fitzgerald,"Season One, Season Two","""Trace Decay""","""Journey Into Night (episode)",Caucasian,Brown,Blue,,,,,40s
128,3378,Revolutionaries,/wiki/Revolutionaries,,True,False,,,Host,,,"Season One, Season Two","""Contrapasso""","""Reunion""",,,,,,,,
