In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.tree import DecisionTreeClassifier #Decision Tree

from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

from sklearn.preprocessing import MinMaxScaler # data scaler

from sklearn.model_selection import GridSearchCV # for performing grid-search


In [2]:
# Load our dataset
df = pd.read_csv("data/character-predictions.csv")

In [3]:
# Show first 5 rows
df.head(5)

Unnamed: 0,S.No,actual,pred,alive,plod,name,title,male,culture,dateOfBirth,...,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive
0,1,0,0,0.054,0.946,Viserys II Targaryen,,1,,,...,0.0,,0,0,,11,1,1,0.605351,0
1,2,1,0,0.387,0.613,Walder Frey,Lord of the Crossing,1,Rivermen,208.0,...,,1.0,1,1,97.0,1,1,1,0.896321,1
2,3,1,0,0.493,0.507,Addison Hill,Ser,1,,,...,,,0,1,,0,0,0,0.267559,1
3,4,0,0,0.076,0.924,Aemma Arryn,Queen,0,,82.0,...,,0.0,1,1,23.0,0,0,0,0.183946,0
4,5,1,1,0.617,0.383,Sylva Santagar,Greenstone,0,Dornish,276.0,...,,1.0,1,1,29.0,0,0,0,0.043478,1


In [4]:
# Lots of Nans
df.isna().sum()

S.No                    0
actual                  0
pred                    0
alive                   0
plod                    0
name                    0
title                1008
male                    0
culture              1269
dateOfBirth          1513
DateoFdeath          1502
mother               1925
father               1920
heir                 1923
house                 427
spouse               1670
book1                   0
book2                   0
book3                   0
book4                   0
book5                   0
isAliveMother        1925
isAliveFather        1920
isAliveHeir          1923
isAliveSpouse        1670
isMarried               0
isNoble                 0
age                  1513
numDeadRelations        0
boolDeadRelations       0
isPopular               0
popularity              0
isAlive                 0
dtype: int64

In [5]:
# weird mean why?
print(df["age"].mean())
# Negative Age
df["age"][df["age"] < 0]
# There are mistakes in the data Doreah is actually 25 and Rhaego was never even born
print(df["name"][1684]) 
print(df["name"][1868])

-1293.5635103926097
Doreah
Rhaego


In [6]:
# Replace negative ages
df.loc[1684, "age"] = 24.0
df.loc[1868, "age"] = 0.0

In [7]:
# Drop columns  # drop popularity
drop = ["S.No", "pred", "alive", "plod", "name", "isAlive", "DateoFdeath" ,"dateOfBirth", 
        "popularity", "numDeadRelations"]
df.drop(drop, inplace=True, axis=1)

In [8]:
df["age"].mean()

36.70207852193995

In [9]:
# Fill the nans we can
df["age"].fillna(df["age"].mean(), inplace=True)
df["culture"].fillna("", inplace=True)

# Some nans values are nan because we dont know them so fill them with -1
df.fillna(value=-1, inplace=True)

In [10]:
# Lots of different names for one culture so lets group them up
set(df["culture"])

{'',
 'Andal',
 'Andals',
 'Asshai',
 "Asshai'i",
 'Astapor',
 'Astapori',
 'Braavos',
 'Braavosi',
 'Crannogmen',
 'Dorne',
 'Dornish',
 'Dornishmen',
 'Dothraki',
 'First Men',
 'Free Folk',
 'Free folk',
 'Ghiscari',
 'Ghiscaricari',
 'Ibbenese',
 'Ironborn',
 'Ironmen',
 'Lhazareen',
 'Lhazarene',
 'Lysene',
 'Lyseni',
 'Meereen',
 'Meereenese',
 'Myrish',
 'Naathi',
 'Northern mountain clans',
 'Northmen',
 'Norvos',
 'Norvoshi',
 'Pentoshi',
 'Qarth',
 'Qartheen',
 'Qohor',
 'Reach',
 'Reachmen',
 'Rhoynar',
 'Riverlands',
 'Rivermen',
 'Sistermen',
 'Stormlander',
 'Stormlands',
 'Summer Islander',
 'Summer Islands',
 'Summer Isles',
 'The Reach',
 'Tyroshi',
 'Vale',
 'Vale mountain clans',
 'Valemen',
 'Valyrian',
 'Westerlands',
 'Westerman',
 'Westermen',
 'Westeros',
 'Wildling',
 'Wildlings',
 'free folk',
 'ironborn',
 'northmen',
 'westermen'}

In [11]:
cult = {
    'Summer Islands': ['summer islands', 'summer islander', 'summer isles'],
    'Ghiscari': ['ghiscari', 'ghiscaricari',  'ghis'],
    'Asshai': ["asshai'i", 'asshai'],
    'Lysene': ['lysene', 'lyseni'],
    'Andal': ['andal', 'andals'],
    'Braavosi': ['braavosi', 'braavos'],
    'Dornish': ['dornishmen', 'dorne', 'dornish'],
    'Myrish': ['myr', 'myrish', 'myrmen'],
    'Westermen': ['westermen', 'westerman', 'westerlands'],
    'Westerosi': ['westeros', 'westerosi'],
    'Stormlander': ['stormlands', 'stormlander'],
    'Norvoshi': ['norvos', 'norvoshi'],
    'Northmen': ['the north', 'northmen'],
    'Free Folk': ['wildling', 'first men', 'free folk'],
    'Qartheen': ['qartheen', 'qarth'],
    'Reach': ['the reach', 'reach', 'reachmen'],
    'Ironborn': ['ironborn', 'ironmen'],
    'Mereen': ['meereen', 'meereenese'],
    'RiverLands': ['riverlands', 'rivermen'],
    'Vale': ['vale', 'valemen', 'vale mountain clans']
}

In [12]:
def get_cult(value):
    value = value.lower()
    v = [k for (k, v) in cult.items() if value in v]
    return v[0] if len(v) > 0 else value.title()

In [13]:
df.loc[:, "culture"] = [get_cult(x) for x in df["culture"]]

In [14]:
# Let's turn our strings features into one-hot encoded variables
df = pd.get_dummies(df)

In [15]:
# Separate our labels from our features
y = df.iloc[:, 0].values
x = df.iloc[:, 1:].values

In [16]:
# Scale our data between 0 and 1
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [17]:
# Let's build many different models to train with our data

kfold = KFold(n_splits=3, shuffle=True) # split data into 3 equal groups for validation
mean=[]
std=[]

models = [LogisticRegression(solver='liblinear'), RandomForestClassifier(n_estimators=100), 
          DecisionTreeClassifier(), svm.SVC(kernel='linear', gamma='scale'), 
          svm.SVC(kernel='rbf', gamma='scale'), KNeighborsClassifier()]

In [18]:
# Validate each model using K-fold cross validation
for model in models:
    result = cross_val_score(model, x, y, cv=kfold, scoring="accuracy", n_jobs=4)
    mean.append(result.mean())
    std.append(result.std())

In [19]:
classifiers=['Logistic Regression', 'Random Forest', 'Decision Tree', 
             'Linear SVM', 'Radial SVM', 'KNN']

df2 = pd.DataFrame({'Mean Acc':mean,'Std Acc':std},index=classifiers) # maybe plot the mean and std ??
df2

Unnamed: 0,Mean Acc,Std Acc
Logistic Regression,0.776972,0.009587
Random Forest,0.787777,0.010408
Decision Tree,0.767201,0.018721
Linear SVM,0.761048,0.005035
Radial SVM,0.75232,0.01083
KNN,0.775952,0.014663


In [20]:
# Random Forest is our best classifier out of the box but perhaps with a little tuning the svms can do much better 

c = [0.1, 0.3, 0.5, 0.7, 0.9]
gamma = [0.1, 0.3, 0.5, 0.7, 0.9]
kernel = ['rbf','linear']

hyper_parameters = {'kernel':kernel,'C':c,'gamma':gamma}

gs = GridSearchCV(estimator=svm.SVC(),param_grid=hyper_parameters,verbose=True, cv=kfold, n_jobs=4)

# Find the best hyperparameters using grid-search
gs.fit(x,y)
print(gs.best_score_)
print(gs.best_estimator_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   31.9s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:  1.9min finished


0.78879753340185
SVC(C=0.9, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.3, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
