# Classification

## Exploratory Data Analysis:

### Step 1: Reading, printing, looking for NaN's

In [2]:
import pandas as pd
import plotly.express as px

titanicdf = pd.read_csv("../../data/train_v1.csv", index_col=0)
titanicdf.sample(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q
88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S
745,1,3,"Stranden, Mr. Juho",male,31.0,0,0,STON/O 2. 3101288,7.925,,S
680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C
698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q
631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S
732,0,3,"Hassan, Mr. Houssein G N",male,11.0,0,0,2699,18.7875,,C
790,0,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,C
543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11.0,4,2,347082,31.275,,S
891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [3]:
# Remark: If you just use .dropna you lose at least (682/891)*100 % of data, bc of Cabin feature!

px.bar(titanicdf.isna().sum())

### Step 2: Wrangling and Plot

In [4]:
titanic_survived = pd.DataFrame(titanicdf["Survived"].value_counts())
titanic_survived = titanic_survived.reset_index().rename(columns={"index": "Survived?", "Survived": "Survival_count"})
titanic_survived = titanic_survived.replace({0: "Died", 1: "Survived"}).set_index("Survived?")

In [5]:
px.bar(titanic_survived, y="Survival_count", labels={"Survival_count": "Count"})

### Step 3: 1st class survivors / Total number of 1st class passengers

In [6]:
first_class_pass_count = titanicdf["Pclass"].value_counts().loc[1]
first_class_survivors = titanicdf[(titanicdf["Survived"] == 1) & (titanicdf["Pclass"] == 1)].shape[0]

fcs = first_class_survivors / first_class_pass_count

print(f"{round(fcs*100,1):0.1f}% of first class passengers survived")

63.0% of first class passengers survived


### Step 4: bar plots passenger count by sex and class

In [7]:
px.histogram(titanicdf, x="Sex", color="Pclass", barmode="group", category_orders={"Pclass": [1, 2, 3]})

### Step 5: Histogram age and survival

In [8]:
px.histogram(titanicdf, x="Age", color="Survived", barmode="stack", nbins=20)

### Step 6: avg age survived vs. drowned

In [9]:
avg_age_sur = titanicdf[titanicdf["Survived"] == 1]["Age"].mean()
avg_age_dro = titanicdf[titanicdf["Survived"] == 0]["Age"].mean()
print(f"Average age of survivors: {avg_age_sur:0.1f} years")
print(f"Average age of drowned: {avg_age_dro:0.1f} years")

Average age of survivors: 28.3 years
Average age of drowned: 30.6 years


### Step 7: Fill NaN-age values with mean of all ages

In [10]:
titanicdf["Age"] = titanicdf["Age"].fillna(round(titanicdf["Age"].mean(),0))

### Step 8: create df counting surv vs. dead seperated by Pclass and sex

In [11]:
titanic_ssp = titanicdf[["Sex","Pclass","Survived"]]

In [12]:
tab1 = titanic_ssp[titanic_ssp["Survived"] == 1].groupby(["Sex","Pclass"]).count()
tab2 = titanic_ssp[titanic_ssp["Survived"] == 0].groupby(["Sex","Pclass"]).count().rename(columns={"Survived": "Drowned/Died"})
pd.concat([tab1, tab2], axis=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Drowned/Died
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
female,1,91,3
female,2,70,6
female,3,72,72
male,1,45,77
male,2,17,91
male,3,47,300


## Build a logistic regression / random forest model:

### Step 0: Further Feature engineering

In [13]:
# Remark: Already filled NaN ages in tatianicdf with mean age

titanicdf["Age"] = titanicdf["Age"].astype(int)
titanicdf["is_Male"] = pd.get_dummies(titanicdf["Sex"], drop_first=True)
titanicdf[["is_Pclass_2", "is_Pclass_3"]] = pd.get_dummies(titanicdf["Pclass"], drop_first=True)
age_bins = pd.cut(titanicdf['Age'], bins=[0, 20, 55, 85], labels=['young', 'middleaged', 'old'])  
titanicdf["AgeClass"] = age_bins
titanicdf[["is_middleaged","is_old"]] = pd.get_dummies(titanicdf["AgeClass"], drop_first=True)
#titanicdf["AgeClass"].value_counts()
titanicdf.sample(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Male,is_Pclass_2,is_Pclass_3,AgeClass,is_middleaged,is_old
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26.0,F2,S,1,1,0,young,0,0
568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S,0,0,1,middleaged,1,0
559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39,1,1,110413,79.65,E67,S,0,0,0,middleaged,1,0
458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,30,1,0,17464,51.8625,D21,S,0,0,0,middleaged,1,0
505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S,0,0,0,young,0,0
606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36,1,0,349910,15.55,,S,1,0,1,middleaged,1,0
820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S,1,0,1,young,0,0
527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S,0,1,0,middleaged,1,0
452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,30,1,0,65303,19.9667,,S,1,0,1,middleaged,1,0
88,0,3,"Slocovski, Mr. Selman Francis",male,30,0,0,SOTON/OQ 392086,8.05,,S,1,0,1,middleaged,1,0


In [14]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [15]:
#titanicdf["Survived"] = titanicdf["Survived"].replace({0: "drowned", 1: "survived"})

### Step 1: Data splitting and model building

In [16]:
y = titanicdf["Survived"]
X = titanicdf[["is_Male","is_middleaged","is_old","is_Pclass_2","is_Pclass_3","SibSp"]]
type(y), type(X)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=666)

In [18]:
"""
# Scaling tested -> doesn't change score for LogReg

sscaler = StandardScaler()
mscaler = MinMaxScaler()
rscaler = RobustScaler()

sscaler.fit(X_train)
mscaler.fit(X_train)
rscaler.fit(X_train)

X_train = rscaler.transform(X_train)
X_test = rscaler.transform(X_test)
"""

"\n# Scaling tested -> doesn't change score for LogReg\n\nsscaler = StandardScaler()\nmscaler = MinMaxScaler()\nrscaler = RobustScaler()\n\nsscaler.fit(X_train)\nmscaler.fit(X_train)\nrscaler.fit(X_train)\n\nX_train = rscaler.transform(X_train)\nX_test = rscaler.transform(X_test)\n"

In [19]:
m_lr = LogisticRegression(class_weight="balanced")
m_rf = RandomForestClassifier(max_depth=3, n_estimators=1000, class_weight="balanced")

In [20]:
m_lr.fit(X_train, y_train)
m_rf.fit(X_train,y_train)

### Step 2: Print model coefficients

In [21]:
X_train.keys(), m_lr.coef_[0], m_lr.intercept_[0]

(Index(['is_Male', 'is_middleaged', 'is_old', 'is_Pclass_2', 'is_Pclass_3',
        'SibSp'],
       dtype='object'),
 array([-2.51147629, -0.77626592, -1.4012375 , -0.8614021 , -1.75603631,
        -0.3314261 ]),
 3.4346491873291103)

### Step 3: Model eval for m_lr:

In [22]:
prob = m_lr.predict_proba(X_test)

In [23]:
evaluate_testdata = pd.DataFrame()
evaluate_testdata['Survived?'] = y_test
evaluate_testdata['prediction'] = m_lr.predict(X_test)
evaluate_testdata['prob for surv'] = m_lr.predict_proba(X_test)[:,1]
evaluate_testdata['prob for drow'] = m_lr.predict_proba(X_test)[:,0]

In [24]:
evaluate_testdata.sample(10)

Unnamed: 0_level_0,Survived?,prediction,prob for surv,prob for drow
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
282,0,0,0.16671,0.83329
769,0,0,0.125587,0.874413
357,1,1,0.934526,0.065474
105,0,0,0.09347,0.90653
352,0,1,0.536661,0.463339
212,1,1,0.857781,0.142219
418,1,1,0.92912,0.07088
72,0,1,0.50537,0.49463
234,1,1,0.587323,0.412677
815,0,0,0.16671,0.83329


In [25]:
trn_scr = m_lr.score(X_train, y_train)
tst_scr = m_lr.score(X_test, y_test)
print(f"{round(trn_scr,3)*100:0.1f}% of survival-statuses were predicted correctly for train data")
print(f"{round(tst_scr,3)*100:0.1f}% of survival-statuses were predicted correctly for test data")

76.4% of survival-statuses were predicted correctly for train data
77.7% of survival-statuses were predicted correctly for test data


### Step 4: Alternative positive thresholds at 90% and 10%

In [26]:
new_thresh_eval90 = evaluate_testdata.copy()
new_thresh_eval10 = evaluate_testdata.copy()

In [27]:
new_thresh_eval90.loc[new_thresh_eval90["prob for surv"] >= 0.9, "prob for surv"] = 1
new_thresh_eval90.loc[new_thresh_eval90["prob for surv"] < 0.9, "prob for surv"] = 0

new_thresh_eval10.loc[new_thresh_eval10["prob for surv"] >= 0.1, "prob for surv"] = 1
new_thresh_eval10.loc[new_thresh_eval10["prob for surv"] < 0.1, "prob for surv"] = 0


#new_thresh_eval

In [28]:
corr_pred90 = (new_thresh_eval90["prob for surv"].astype(int) == new_thresh_eval90["Survived?"]).value_counts()

corr_pred10 = (new_thresh_eval10["prob for surv"].astype(int) == new_thresh_eval10["Survived?"]).value_counts()

th90 = round(corr_pred90[0] / corr_pred90.sum()*100,1)
th10 = round(corr_pred10[0] / corr_pred10.sum()*100,1)

print(f"With threshold at 90%, the percentage of correct predictions is: {th90}%")
print(f"With threshold at 10%, the percentage of correct predictions is: {th10}%")

With threshold at 90%, the percentage of correct predictions is: 73.2%
With threshold at 10%, the percentage of correct predictions is: 58.7%


### Step 5: Model eval for m_rf

In [29]:
prob = m_rf.predict_proba(X_test)

eval_tdata = pd.DataFrame()
eval_tdata['Survived?'] = y_test
eval_tdata['prediction'] = m_rf.predict(X_test)
eval_tdata['prob for surv'] = prob[:,1]
eval_tdata['prob for drow'] = prob[:,0]

In [30]:
eval_tdata.sample(10)

Unnamed: 0_level_0,Survived?,prediction,prob for surv,prob for drow
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
591,0,0,0.266508,0.733492
757,0,0,0.266508,0.733492
617,0,0,0.297435,0.702565
448,1,0,0.412722,0.587278
742,0,0,0.47664,0.52336
694,0,0,0.266508,0.733492
321,0,0,0.266508,0.733492
514,1,1,0.849723,0.150277
471,0,0,0.266508,0.733492
791,0,0,0.266508,0.733492


In [31]:
m_rf.score(X_train,y_train), m_rf.score(X_test,y_test)

(0.8103932584269663, 0.7932960893854749)

In [32]:
feature_label = pd.DataFrame(X_train).columns
feature_importance = m_rf.feature_importances_

print(feature_label, feature_importance)

Index(['is_Male', 'is_middleaged', 'is_old', 'is_Pclass_2', 'is_Pclass_3',
       'SibSp'],
      dtype='object') [0.60486414 0.02160593 0.01254142 0.02517833 0.23143492 0.10437527]
