In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [2]:
dc_df = pd.read_csv('dc-wikia-data.csv')

In [3]:
dc_df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0


In [4]:
dc_df = dc_df.drop(columns=["ID", 'name', 'page_id', 'urlslug', 'FIRST APPEARANCE', 'APPEARANCES', "YEAR"])
dc_df.head()

Unnamed: 0,ALIGN,EYE,HAIR,SEX,GSM,ALIVE
0,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters
1,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters
2,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters
3,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters
4,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters


In [5]:
def blankGSM(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Heterosexual'))
    
def blankEye(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def blankHair(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def blankSex(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def blankAlive(val):
    if val != "nan":
        return val
    else:
        return(val.replace('nan', 'Unknown'))
    
def alterAlign(val):
    if val == "Neutral Characters":
        return(val.replace('Neutral Characters', 'nan'))
    else:
        return(val.replace(val, 'nan'))
    
def blankAlign(val):

        return(val.replace(' ', 'nan'))
    
def replaceNan(val):
    if val == "Neutral Characters" or val == "Reformed Criminals":
        val = alterAlign(val)
    if val == " ":
        val = blankAlign(val)
    else:
        return val
    
dc_df['GSM'] = dc_df['GSM'].astype('str')
dc_df['EYE'] = dc_df['EYE'].astype('str')
dc_df['HAIR'] = dc_df['HAIR'].astype('str')
dc_df['SEX'] = dc_df['SEX'].astype('str')
dc_df['ALIVE'] = dc_df['ALIVE'].astype('str')
dc_df['ALIGN'] = dc_df['ALIGN'].astype('str')


In [6]:
dc_df['GSM'] = dc_df['GSM'].apply(blankGSM)
dc_df['EYE'] = dc_df['EYE'].apply(blankEye)
dc_df['HAIR'] = dc_df['HAIR'].apply(blankHair)
dc_df['SEX'] = dc_df['SEX'].apply(blankSex)
dc_df['ALIVE'] = dc_df['ALIVE'].apply(blankAlive)
dc_df['ALIGN'] = dc_df['ALIGN'].apply(replaceNan)

In [7]:
pd.isnull(dc_df).sum()

ALIGN    0
EYE      0
HAIR     0
SEX      0
GSM      0
ALIVE    0
dtype: int64

In [8]:
dc_df

Unnamed: 0,ALIGN,EYE,HAIR,SEX,GSM,ALIVE
0,Good Characters,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
1,Good Characters,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
2,Good Characters,Brown Eyes,Brown Hair,Male Characters,Heterosexual,Living Characters
3,Good Characters,Brown Eyes,White Hair,Male Characters,Heterosexual,Living Characters
4,Good Characters,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
...,...,...,...,...,...,...
6891,Good Characters,Unknown,Unknown,Female Characters,Heterosexual,Living Characters
6892,Good Characters,Unknown,Unknown,Male Characters,Heterosexual,Living Characters
6893,Good Characters,Unknown,Unknown,Male Characters,Heterosexual,Living Characters
6894,Good Characters,Unknown,Unknown,Male Characters,Heterosexual,Living Characters


In [9]:
dc_df['ALIGN'] = dc_df['ALIGN'].astype('str')
dc_df = dc_df[~dc_df.ALIGN.str.contains("nan")]

In [10]:
def binaryCharacter(val):
    if val == "Good Characters":
        val = val.replace("Good Characters", "1")
        return pd.to_numeric(val)
    else:
        val = val.replace("Bad Characters", "0")
        return pd.to_numeric(val)
    
dc_df['ALIGN'] = dc_df['ALIGN'].apply(binaryCharacter)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [11]:
dc_df

Unnamed: 0,ALIGN,EYE,HAIR,SEX,GSM,ALIVE
0,1,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
1,1,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
2,1,Brown Eyes,Brown Hair,Male Characters,Heterosexual,Living Characters
3,1,Brown Eyes,White Hair,Male Characters,Heterosexual,Living Characters
4,1,Blue Eyes,Black Hair,Male Characters,Heterosexual,Living Characters
...,...,...,...,...,...,...
6891,1,Unknown,Unknown,Female Characters,Heterosexual,Living Characters
6892,1,Unknown,Unknown,Male Characters,Heterosexual,Living Characters
6893,1,Unknown,Unknown,Male Characters,Heterosexual,Living Characters
6894,1,Unknown,Unknown,Male Characters,Heterosexual,Living Characters


In [12]:
for c in dc_df.columns:
    col_type = dc_df[c].dtype
    if col_type == 'object' :
        a=dc_df[c].unique()
        keys= range(a.shape[0])
        diction={}
        for idx,val in enumerate(a):
            diction[idx] = a[idx]
        diction = {v: k for k, v in diction.items()}
        print(diction)

{'Blue Eyes': 0, 'Brown Eyes': 1, 'Green Eyes': 2, 'Purple Eyes': 3, 'Red Eyes': 4, 'Hazel Eyes': 5, 'Amber Eyes': 6, 'Unknown': 7, 'Grey Eyes': 8, 'Yellow Eyes': 9, 'Black Eyes': 10, 'Photocellular Eyes': 11, 'White Eyes': 12, 'Pink Eyes': 13, 'Violet Eyes': 14, 'Orange Eyes': 15, 'Auburn Hair': 16, 'Gold Eyes': 17}
{'Black Hair': 0, 'Brown Hair': 1, 'White Hair': 2, 'Blond Hair': 3, 'Red Hair': 4, 'Unknown': 5, 'Green Hair': 6, 'Strawberry Blond Hair': 7, 'Grey Hair': 8, 'Silver Hair': 9, 'Orange Hair': 10, 'Purple Hair': 11, 'Gold Hair': 12, 'Blue Hair': 13, 'Reddish Brown Hair': 14, 'Pink Hair': 15, 'Violet Hair': 16, 'Platinum Blond Hair': 17}
{'Male Characters': 0, 'Female Characters': 1, 'Unknown': 2, 'Genderless Characters': 3, 'Transgender Characters': 4}
{'Heterosexual': 0, 'Bisexual Characters': 1, 'Homosexual Characters': 2}
{'Living Characters': 0, 'Deceased Characters': 1, 'Unknown': 2}


In [13]:
# Get dummy data for model
dc_df_ip = pd.get_dummies(dc_df)

In [14]:
# Create feature list for independent and dependent variables
features = list(dc_df_ip)
features.remove('ALIGN')
response = ['ALIGN']

In [15]:
# Normalize data for modeling
dc_df_ip_scaled_ftrs = pd.DataFrame(preprocessing.normalize(dc_df_ip[features]))
dc_df_ip_scaled_ftrs.columns=list(dc_df_ip[features])

In [16]:
dc_df_ip_scaled_ftrs[:3]

Unnamed: 0,EYE_Amber Eyes,EYE_Auburn Hair,EYE_Black Eyes,EYE_Blue Eyes,EYE_Brown Eyes,EYE_Gold Eyes,EYE_Green Eyes,EYE_Grey Eyes,EYE_Hazel Eyes,EYE_Orange Eyes,...,SEX_Genderless Characters,SEX_Male Characters,SEX_Transgender Characters,SEX_Unknown,GSM_Bisexual Characters,GSM_Heterosexual,GSM_Homosexual Characters,ALIVE_Deceased Characters,ALIVE_Living Characters,ALIVE_Unknown
0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0
1,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0
2,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,...,0.0,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0


In [17]:
#train the mdodel
X_train, X_test, y_train, y_test = train_test_split(dc_df_ip_scaled_ftrs, dc_df_ip[response], test_size=0.30)

In [18]:
# Create and fir Decision Tree Model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [19]:
# Predicition of model
y_pred = model.predict(X_test)

In [20]:
# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.55      0.58       893
           1       0.57      0.63      0.60       826

    accuracy                           0.59      1719
   macro avg       0.59      0.59      0.59      1719
weighted avg       0.59      0.59      0.59      1719



In [21]:
# Accuracy of Model
accuracy = np.mean(cross_val_score(model, X_test, y_test, scoring='accuracy')) * 100
print("Accuracy: {}%".format(accuracy))

Accuracy: 54.27707162798466%




In [22]:
# Confusion Matrix
print('confusion matrix {}'.format(pd.DataFrame(
            confusion_matrix(y_test, y_pred),
            columns=['Predicted Loss', 'Predicted Win'],
            index=['True Loss', 'True Win']
        )))

confusion matrix            Predicted Loss  Predicted Win
True Loss             494            399
True Win              305            521
