# Data preprocessing and binary classification

In [1]:
import math
import pandas as pd
import numpy as np
import string

from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression 

In [2]:
# funkce

def edit_oddil(val):
    if type(val) != float:
        return val[:1]
    else: 
        return np.nan
    
def encode_categories(df, mappers, dummies=False):
    le = LabelEncoder()
    for col in df.select_dtypes('object').columns:
        if col not in mappers and df[col].nunique() < 30:
            df[col] = df[col].fillna('NaN')
            df[col] = le.fit_transform(df[col])
            if dummies:
                prefix = 'd_' + col
                df = pd.concat([df.drop(columns=[col]), pd.get_dummies(df[col], prefix=prefix)], axis=1)
        elif col in mappers:
            df[col] = df[col].replace(mappers[col])
        elif df[col].nunique() >= 30:
            df = df.drop(columns=[col])
    return df    

def replace_nans(df, cols_nan):
    for col in cols_nan:
        d1 = df[df[col].isnull()]
        d2 = df[df[col].notnull()]
        
        y = d2[col]
        x = d2.drop(columns=cols_nan)
        x2 = d1.drop(columns=cols_nan)

        if df[col].dtype == 'float64':
            model = KNeighborsRegressor(n_neighbors=5)
        else:
            model = KNeighborsClassifier(n_neighbors=5)
        
        model.fit(x,y)
        y2 = model.predict(x2)
        
        df[col][df[col].isnull()] = y2  
    return df

def cross_val(x, y, folds, model, dummies = False):
    score = 0
    np.random.seed(seed=654) 
    
    if not dummies:
            x = x.loc[:, x.nunique() > 2]   
            
    fold_idx = np.random.randint(folds, size=x.shape[0])
    
    for fold in range(folds):
        xtr = x[ fold_idx != fold ]
        xval   = x[ fold_idx == fold ]
        ytr = y[ fold_idx != fold ]
        yval   = y[ fold_idx == fold ]        

        model.fit(xtr, ytr)
        score += metrics.accuracy_score(ytr, model.predict(xtr))
        
    return score/folds

Pro predikci jsem nejdříve upravila sloupec oddil, kde me zajímá ve které sekci cestující bydlel, dalo by se říct, že pokud někco bydlel na přídi/zádi, mohl by mít lepší přístup k úniku. Oddíl jsem rozdělila pouze podle písmen v abecedě.

Od jmena jsem vyseparovala osloveni, ktere nam mohlo rict, ze napriklad slecny byly vice zachranovany nez pani a dalsi.

Odstranila jsem příliš veliké a nevhodné příznaky jako je cabin,name,ticket,ID,fare,home.dest.

K odstraneni neciselnych priznaku jsem pouzila funkci ze cviceni encode_categories, ktery pouziva Label Encoder a upravila ji tak, ze ty příznaky, které se během celé funkce neupraví na číselný, tak se odstraní.

K odstranění NaN hodnot jsem použila opět funkci ze cvičení replace_nans, která predikuje pomocí KNN algoritmu chybějící hodnoty.

Nakonec jsem float hodnoty, převedla na integery

In [15]:
# načíst a upravit data 

data = pd.read_csv('data.csv')
# rozlisit nominalni a ordinalni neciselne priznaky a prevest je na ciselne
data['oddil'] = data['cabin'].apply(edit_oddil)

n = data['name'].str.rsplit(',',1,expand=True)[1].str.split('.',1,expand=True)
data['osloveni'] = n[0]

to_drop = ['cabin','name','ticket','ID','fare','home.dest']
data = data.drop(columns=to_drop,axis=1)
# display(data['oddil'].unique())

mappers = {}
mappers['oddil'] = {ch: n for n, ch in enumerate(string.ascii_uppercase)}

# label encoding zpusob predzpracovani
data = encode_categories(data,mappers,True)

# odstraneni Nan hodnot pomoci knn predikce
cols_nan = data.loc[:,data.isnull().sum() > 0].columns
# data = replace_nans(data,cols_nan)
data = data.replace(np.nan,-1)
data[['age','oddil']] = data[['age','oddil']].astype(int)

display(data)



Unnamed: 0,survived,pclass,age,sibsp,parch,oddil,d_sex_0,d_sex_1,d_embarked_0,d_embarked_1,...,d_osloveni_7,d_osloveni_8,d_osloveni_9,d_osloveni_10,d_osloveni_11,d_osloveni_12,d_osloveni_13,d_osloveni_14,d_osloveni_15,d_osloveni_16
0,1,3,22,0,0,-1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,3,-1,0,0,-1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,1,19,1,0,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,3,25,0,0,-1,0,1,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0,3,-1,0,0,-1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,1,48,1,1,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
996,1,2,3,1,2,-1,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
997,1,3,-1,0,0,-1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
998,1,3,-1,0,0,-1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


Protoze používám cross validation, data jsem rozdelila pouze na 2 sady: testovaci a trenovaci. Predikovanou promennou je "survived"

In [18]:
# rozdel data na testovaci a trenovaci sadu

ydata = data['survived']
xdata = data.loc[:, data.columns != 'survived']

rd_seed = 333
xtrain, xtest, ytrain, ytest = train_test_split(xdata, ydata, test_size=0.25, random_state=rd_seed)
#   validacni sada
xtrain, xval, ytrain, yval= train_test_split(xtrain, ytrain, test_size=0.25, random_state=rd_seed)

In [None]:
U každého modelu se nejdříve snažím o nalezení optimálních parametrů jejich procházením a počítáním na validačních datech (cross validací). Optimální parametry pak použiju k vytvoření a naučení konečného modelu. Jeho přesnost na testovacích a treénovacích datech si nechám vypsat.

In [25]:
# jednoduchy model stromu

param_grid = {
    "max_depth": range(5,100)
}

# cross validation
param_comb = ParameterGrid(param_grid)
val_acc = 0
for params in param_comb:
    dt = DecisionTreeClassifier(**params)
    score = cross_val(xtrain.copy(), ytrain, 10, dt, True)
    if val_acc < score:
        opt_params_dt = params
        val_acc = score
print(opt_params_dt)

dt = DecisionTreeClassifier(**opt_params_dt)
dt.fit(xtrain,ytrain)
print('accuracy on train data: ' +str(metrics.accuracy_score(ytrain, dt.predict(xtrain))))
print('accuracy on test data: ' +str(metrics.accuracy_score(ytest, dt.predict(xtest))))
print('accuracy on val data: '+str(metrics.accuracy_score(yval, dt.predict(xval))))

acc_dt = metrics.accuracy_score(ytest, dt.predict(xtest))

{'max_depth': 20}
accuracy on train data: 0.9412811387900356
accuracy on test data: 0.74
accuracy on val data: 0.7553191489361702


In [24]:
# Bagging: nahodne lesy

param_grid = {
    "n_estimators": range(1,100,5),
    "max_depth": range(1,10)
}

# cross validation
param_comb = ParameterGrid(param_grid)
val_acc = 0
for params in param_comb:
    rfc = RandomForestClassifier(**params,random_state = 0)
    score = cross_val(xtrain.copy(), ytrain, 10, rfc, True)
    if val_acc < score:
        opt_params_rfc = params
        val_acc = score
print(opt_params_rfc)

# create forest
rfc = RandomForestClassifier(**opt_params_rfc,random_state = 0)
rfc.fit(xtrain, ytrain)
print('accuracy on train data: ' +str(metrics.accuracy_score(ytrain, rfc.predict(xtrain))))
print('accuracy on test data: ' +str(metrics.accuracy_score(ytest, rfc.predict(xtest))))
print('accuracy on val data: '+str(metrics.accuracy_score(yval, dt.predict(xval))))

acc_rfc = metrics.accuracy_score(yval, rfc.predict(xval))

{'max_depth': 9, 'n_estimators': 76}
accuracy on train data: 0.9039145907473309
accuracy on test data: 0.788
accuracy on val data: 0.7553191489361702


In [26]:
# Boosting: AdaBoost

param_grid = {
    'n_estimators': range(1,100,5),
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5, 1]
}

# cross validation
param_comb = ParameterGrid(param_grid)
val_acc = 0
for params in param_comb:
    ada = AdaBoostClassifier(**params)
    score = cross_val(xtrain.copy(), ytrain, 10, ada, True)
    if val_acc < score:
        opt_params_ada = params
        val_acc = score
print(opt_params_ada)

# create ada
ada = AdaBoostClassifier(**opt_params_ada)
ada.fit(xtrain, ytrain)
print('accuracy on train data: ' +str(metrics.accuracy_score(ytrain, ada.predict(xtrain))))
print('accuracy on test data: ' +str(metrics.accuracy_score(ytest, ada.predict(xtest))))
print('accuracy on val data: '+str(metrics.accuracy_score(yval, dt.predict(xval))))

acc_ada = metrics.accuracy_score(yval, ada.predict(xval))

{'learning_rate': 0.5, 'n_estimators': 66}
accuracy on train data: 0.8309608540925267
accuracy on test data: 0.804
accuracy on val data: 0.7553191489361702


In [27]:
# KNN

param_grid = {
    'n_neighbors' : range(1,20),
    'p': range(1,5),
    'weights': ['uniform', 'distance']
}

#cross validation
param_comb = ParameterGrid(param_grid)
val_acc = 0
for params in param_comb:
    knn = KNeighborsClassifier(**params)
    score = cross_val(xtrain.copy(), ytrain, 10, knn, True)
    if val_acc < score:
        opt_params_knn = params
        val_acc = score
print(opt_params_knn)


# create KNN
knn = KNeighborsClassifier(**opt_params_knn)
knn.fit(xtrain,ytrain)
print('accuracy on train data: ' +str(metrics.accuracy_score(ytrain, knn.predict(xtrain))))
print('accuracy on test data: ' +str(metrics.accuracy_score(ytest, knn.predict(xtest))))
print('accuracy on val data: '+str(metrics.accuracy_score(yval, dt.predict(xval))))

acc_knn = metrics.accuracy_score(yval, knn.predict(xval))

{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
accuracy on train data: 0.9412811387900356
accuracy on test data: 0.8
accuracy on val data: 0.7553191489361702


Pro logistickou regresi jsem se inspirovala v odkazu níže a vyzkoušela proiterovat různé nastavení regularizace a různé algoritmy řešení.

https://www.kaggle.com/joparga3/2-tuning-parameters-for-logistic-regression

In [28]:
# logisticka regrese

param_grid = {
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C': [0.001,0.01,0.1,1,10,100]
}

#cross validation
param_comb = ParameterGrid(param_grid)
val_acc = 0
for params in param_comb:
    lgr = LogisticRegression(**params,random_state= 0)
    score = cross_val(xtrain.copy(), ytrain, 10, lgr, True)
    if val_acc < score:
        opt_params_lgr = params
        val_acc = score
print(opt_params_lgr)

# create Logistic regression model
lgr = LogisticRegression(**opt_params_lgr,random_state= 0)
lgr.fit(xtrain,ytrain)
print('accuracy on train data: ' +str(metrics.accuracy_score(ytrain, lgr.predict(xtrain))))
print('accuracy on test data: ' +str(metrics.accuracy_score(ytest, lgr.predict(xtest))))
print('accuracy on val data: '+str(metrics.accuracy_score(yval, dt.predict(xval))))
      
acc_lgr = metrics.accuracy_score(yval, lgr.predict(xval))

{'C': 1, 'solver': 'liblinear'}
accuracy on train data: 0.8238434163701067
accuracy on test data: 0.82
accuracy on val data: 0.7553191489361702


Pro predikci promenne survived jsem nejprve upravila data stejnym zpusobem jako testovaci.
Následně si porovnala výsledné přesnosti jednotlivých modelů a z nich vyvodila, že nejpřesnějším pro tento případ je 

In [29]:
# evaluation data to test model

test_data = pd.read_csv('evaluation.csv')

# uprav data stejnym zpusobem jako predchozi
test_data['oddil'] = test_data['cabin'].apply(edit_oddil)

n = test_data['name'].str.rsplit(',',1,expand=True)[1].str.split('.',1,expand=True)
test_data['osloveni'] = n[0]

to_drop = ['cabin','name','ticket','ID','fare','home.dest']
test_data = test_data.drop(columns=to_drop)

test_data = encode_categories(test_data,mappers,True)

cols_nan = test_data.loc[:,test_data.isnull().sum() > 0].columns
test_data = replace_nans(test_data,cols_nan)
test_data[['age','oddil']] = test_data[['age','oddil']].astype(int)

to_add = [item for item in data.columns.drop('survived') if item not in test_data.columns]
test_data[to_add] = pd.DataFrame(0, index=range(data.shape[0]), columns=to_add)

# vyber nejlepsi model
print(acc_dt)
print(acc_rfc)
print(acc_ada)
print(acc_knn)
print(acc_lgr)

# lgr (logisticka regrese) ma nejlepsi presnost
ypred = lgr.predict(test_data)
print('accuracy on val data: '+str(metrics.accuracy_score(yval, dt.predict(xval))))
result = pd.DataFrame(columns=['ID','Survived'])
result.ID = pd.read_csv('evaluation.csv').ID
result.Survived = ypred
result.to_csv('results.csv',index= False)

0.74
0.7872340425531915
0.7659574468085106
0.7606382978723404
0.7978723404255319
accuracy on val data: 0.7553191489361702
