In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [2]:
marvel_df = pd.read_csv('marvel-wikia-data.csv')

In [3]:
marvel_df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
0,1678,Spider-Man (Peter Parker),\/Spider-Man_(Peter_Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters,4043.0,Aug-62,1962.0
1,7139,Captain America (Steven Rogers),\/Captain_America_(Steven_Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters,3360.0,Mar-41,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",\/Wolverine_(James_%22Logan%22_Howlett),Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3061.0,Oct-74,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",\/Iron_Man_(Anthony_%22Tony%22_Stark),Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2961.0,Mar-63,1963.0
4,2460,Thor (Thor Odinson),\/Thor_(Thor_Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,2258.0,Nov-50,1950.0


In [4]:
marvel_df = marvel_df.drop(columns=["ID", 'name', 'page_id', 'urlslug', 'FIRST APPEARANCE', 'APPEARANCES', "Year"])
marvel_df.head()

Unnamed: 0,ALIGN,EYE,HAIR,SEX,GSM,ALIVE
0,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters
1,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters
2,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters
3,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters
4,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters


In [5]:
def blankGSM(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Heterosexual'))
    
def blankEye(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def blankHair(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def blankSex(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def blankAlive(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def alterAlign(val):
    
        return(val.replace('Neutral Characters', 'nan'))
    
def blankAlign(val):

        return(val.replace(' ', 'nan'))
    
def replaceNan(val):
    if val == "Neutral Characters":
        val = alterAlign(val)
    if val == " ":
        val = blankAlign(val)
    else:
        return val
    
marvel_df['GSM'] = marvel_df['GSM'].astype('str')
marvel_df['EYE'] = marvel_df['EYE'].astype('str')
marvel_df['HAIR'] = marvel_df['HAIR'].astype('str')
marvel_df['SEX'] = marvel_df['SEX'].astype('str')
marvel_df['ALIVE'] = marvel_df['ALIVE'].astype('str')
marvel_df['ALIGN'] = marvel_df['ALIGN'].astype('str')


In [6]:
marvel_df['GSM'] = marvel_df['GSM'].apply(blankGSM)
marvel_df['EYE'] = marvel_df['EYE'].apply(blankEye)
marvel_df['HAIR'] = marvel_df['HAIR'].apply(blankHair)
marvel_df['SEX'] = marvel_df['SEX'].apply(blankSex)
marvel_df['ALIVE'] = marvel_df['ALIVE'].apply(blankAlive)
marvel_df['ALIGN'] = marvel_df['ALIGN'].apply(replaceNan)

In [7]:
pd.isnull(marvel_df).sum()

ALIGN    0
EYE      0
HAIR     0
SEX      0
GSM      0
ALIVE    0
dtype: int64

In [8]:
marvel_df

Unnamed: 0,ALIGN,EYE,HAIR,SEX,GSM,ALIVE
0,Good Characters,Hazel Eyes,Brown Hair,Male Characters,Heterosexual,Living Characters
1,Good Characters,Blue Eyes,White Hair,Male Characters,Heterosexual,Living Characters
2,,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
3,Good Characters,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
4,Good Characters,Blue Eyes,Blond Hair,Male Characters,Heterosexual,Living Characters
...,...,...,...,...,...,...
16371,Bad Characters,Green Eyes,No Hair,Male Characters,Heterosexual,Living Characters
16372,Good Characters,Blue Eyes,Bald,Male Characters,Heterosexual,Living Characters
16373,Bad Characters,Black Eyes,Bald,Male Characters,Heterosexual,Living Characters
16374,,Unknown,Unknown,Male Characters,Heterosexual,Living Characters


In [9]:
marvel_df['ALIGN'] = marvel_df['ALIGN'].astype('str')
marvel_df = marvel_df[~marvel_df.ALIGN.str.contains("nan")]

In [10]:
def binaryCharacter(val):
    if val == "Good Characters":
        val = val.replace("Good Characters", "1")
        return pd.to_numeric(val)
    else:
        val = val.replace("Bad Characters", "0")
        return pd.to_numeric(val)
    
marvel_df['ALIGN'] = marvel_df['ALIGN'].apply(binaryCharacter)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [11]:
marvel_df

Unnamed: 0,ALIGN,EYE,HAIR,SEX,GSM,ALIVE
0,1,Hazel Eyes,Brown Hair,Male Characters,Heterosexual,Living Characters
1,1,Blue Eyes,White Hair,Male Characters,Heterosexual,Living Characters
3,1,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
4,1,Blue Eyes,Blond Hair,Male Characters,Heterosexual,Living Characters
5,1,Blue Eyes,No Hair,Male Characters,Heterosexual,Living Characters
...,...,...,...,...,...,...
16369,1,Blue Eyes,Black Hair,Female Characters,Heterosexual,Living Characters
16371,0,Green Eyes,No Hair,Male Characters,Heterosexual,Living Characters
16372,1,Blue Eyes,Bald,Male Characters,Heterosexual,Living Characters
16373,0,Black Eyes,Bald,Male Characters,Heterosexual,Living Characters


In [12]:
for c in marvel_df.columns:
    col_type = marvel_df[c].dtype
    if col_type == 'object' :
        a=marvel_df[c].unique()
        keys= range(a.shape[0])
        diction={}
        for idx,val in enumerate(a):
            diction[idx] = a[idx]
        diction = {v: k for k, v in diction.items()}
        print(diction)

{'Hazel Eyes': 0, 'Blue Eyes': 1, 'Brown Eyes': 2, 'Grey Eyes': 3, 'Green Eyes': 4, 'Yellow Eyes': 5, 'Gold Eyes': 6, 'Red Eyes': 7, 'Black Eyeballs': 8, 'Amber Eyes': 9, 'Unknown': 10, 'White Eyes': 11, 'Black Eyes': 12, 'Orange Eyes': 13, 'Variable Eyes': 14, 'Silver Eyes': 15, 'Pink Eyes': 16, 'Purple Eyes': 17, 'One Eye': 18, 'Violet Eyes': 19, 'Multiple Eyes': 20, 'Magenta Eyes': 21, 'Yellow Eyeballs': 22, 'No Eyes': 23, 'Compound Eyes': 24}
{'Brown Hair': 0, 'White Hair': 1, 'Black Hair': 2, 'Blond Hair': 3, 'No Hair': 4, 'Blue Hair': 5, 'Red Hair': 6, 'Bald': 7, 'Auburn Hair': 8, 'Grey Hair': 9, 'Silver Hair': 10, 'Strawberry Blond Hair': 11, 'Green Hair': 12, 'Reddish Blond Hair': 13, 'Gold Hair': 14, 'Unknown': 15, 'Orange Hair': 16, 'Pink Hair': 17, 'Variable Hair': 18, 'Yellow Hair': 19, 'Purple Hair': 20, 'Light Brown Hair': 21, 'Magenta Hair': 22, 'Bronze Hair': 23, 'Orange-brown Hair': 24}
{'Male Characters': 0, 'Female Characters': 1, 'Unknown': 2, 'Agender Characters': 

In [13]:
# Get dummy data for model
marvel_df_ip = pd.get_dummies(marvel_df)

In [14]:
# Create feature list for independent and dependent variables
features = list(marvel_df_ip)
features.remove('ALIGN')
response = ['ALIGN']

In [16]:
# Normalize data for modeling
marvel_df_ip_scaled_ftrs = pd.DataFrame(preprocessing.normalize(marvel_df_ip[features]))
marvel_df_ip_scaled_ftrs.columns=list(marvel_df_ip[features])

In [17]:
marvel_df_ip_scaled_ftrs[:3]

Unnamed: 0,EYE_Amber Eyes,EYE_Black Eyeballs,EYE_Black Eyes,EYE_Blue Eyes,EYE_Brown Eyes,EYE_Compound Eyes,EYE_Gold Eyes,EYE_Green Eyes,EYE_Grey Eyes,EYE_Hazel Eyes,...,SEX_Male Characters,SEX_Unknown,GSM_Bisexual Characters,GSM_Genderfluid Characters,GSM_Heterosexual,GSM_Homosexual Characters,GSM_Pansexual Characters,GSM_Transgender Characters,ALIVE_Deceased Characters,ALIVE_Living Characters
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,...,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214
1,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,...,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214
2,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,...,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214


In [18]:
#train the mdodel
X_train, X_test, y_train, y_test = train_test_split(marvel_df_ip_scaled_ftrs, marvel_df_ip[response], test_size=0.30)

In [19]:
# Create and fir Decision Tree Model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [20]:
# Predicition of model
y_pred = model.predict(X_test)

In [21]:
# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.80      0.73      1996
           1       0.61      0.45      0.52      1411

    accuracy                           0.65      3407
   macro avg       0.64      0.62      0.62      3407
weighted avg       0.65      0.65      0.64      3407



In [22]:
# Accuracy of Model
accuracy = np.mean(cross_val_score(model, X_test, y_test, scoring='accuracy')) * 100
print("Accuracy: {}%".format(accuracy))

Accuracy: 64.80800003099586%




In [23]:
# Confusion Matrix
print('confusion matrix {}'.format(pd.DataFrame(
            confusion_matrix(y_test, y_pred),
            columns=['Predicted Loss', 'Predicted Win'],
            index=['True Loss', 'True Win']
        )))

confusion matrix            Predicted Loss  Predicted Win
True Loss            1591            405
True Win              779            632
