# Home task: Supervised ML cover

## Titanic - Machine Learning from Disaster

### Loading train dataset

In [302]:
import os
import pandas as pd

cwd= os.getcwd()
path = os.path.join(cwd,'data')

def get_train_set():
    fp = os.path.join(path, 'train.csv')
    X = pd.read_csv(fp) 
    
    return X

X = get_train_set()
X.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Loading predict dataset

In [303]:
def get_predict_set():    
    fp = os.path.join(path,'test.csv')
    df_predict = pd.read_csv(fp)

    return df_predict

df_predict= get_predict_set()
list(df_predict)

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

### Converting NaN values to zeros

In [304]:
def convert_nan_to_zero(df):
    df = df.fillna(0)
    return df

X = convert_nan_to_zero(X)

### Get rid of irrelevant features

In [305]:
def drop_irrelevant_features(df): 
    cols_to_drop = [
        "PassengerId",
        "Name",
        "Ticket",
        "Fare",
        "Cabin"
    ]
    df = df.drop(cols_to_drop, axis=1)
    return df

X = drop_irrelevant_features(X)
y = X["Survived"]

# Remove the target variable from the training set
X = X.drop("Survived", axis=1)

### Encoding non-numeric features

In [306]:
from sklearn.preprocessing import LabelEncoder

def label_encoder(df): 
    cols_to_encode = ["Sex", "Embarked"]
    print ('label encoding.')
    df = df.copy() # to avoid warning related to setting the copy...

    for col in cols_to_encode:
        le = LabelEncoder().fit(df[col].astype(str)) # convert to str first since le may fail due to difference type of data
        df[col] = le.transform(df[col].astype(str))
    return df

X = label_encoder(X)

label encoding.


### Split train set to evaluate the model

In [307]:
from sklearn.model_selection import train_test_split

print("X.shape: ", X.shape)
print("y.shape: ", y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_train.shape: ", y_train.shape)
print("y_test.shape: ", y_test.shape)

X.shape:  (891, 6)
y.shape:  (891,)
X_train.shape:  (712, 6)
X_test.shape:  (179, 6)
y_train.shape:  (712,)
y_test.shape:  (179,)


### Normalizing the data

In [308]:
from sklearn.preprocessing import MinMaxScaler

def normalize(X_train, X_test):
    print ('normalizing.')
    scaler= MinMaxScaler()
    X_train_scaled= scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled 

X_train_scaled, X_test_scaled  = normalize(X_train, X_test)
type(X_train_scaled)

normalizing.


numpy.ndarray

### Finding best classifier

In [309]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score


def run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train, y_test= None):
    print ('Running GridSearchCV.')
    grid_clf = GridSearchCV(clf, param_grid=grid_values,scoring='f1')
    grid_clf.fit(X_train_scaled, y_train)
    print('Grid best parameter (max f1 ): ', grid_clf.best_params_) 
    print('Grid best score (f1): ', grid_clf.best_score_) 

    if not y_test is None:
        test_score= grid_clf.score(X_test_scaled, y_test)
        print("test f1= {}".format(test_score))


def run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test=None, list_classifiers= None):
    if list_classifiers is None or 'LogisticRegression' in list_classifiers:
        print ('\nLogisticRegression.')
        clf = LogisticRegression(max_iter=10000)
        grid_values = {'C': [0.005, 0.01,0.1, 1, 100, 10000, 100000]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled,  y_train,  y_test= y_test)

    if list_classifiers is None or 'DecisionTreeClassifier' in list_classifiers:
        print ('\nDecisionTreeClassifier')
        clf = DecisionTreeClassifier()       
        grid_values = {'max_depth': [2,5,7, 20, 50]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'RandomForestClassifier' in list_classifiers:
        print ('\nRandomForestClassifier.')
        clf = RandomForestClassifier()       
        grid_values = {'n_estimators': [20,50]} #,200,300]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_poly' in list_classifiers:
        print ('\nSVC_poly')
        clf = SVC(kernel='poly')           
        grid_values = {'C': [0.01]}# , 0.1, 1, 100, ]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_rbf' in list_classifiers:
        print ('\nSVC_rbf')
        clf = SVC(kernel='rbf')
        grid_values = {'C': [0.005, 0.01]}# , 0.02, 0.03, 0.1, 1, 100, 10000], 'gamma':[0.001, 0.01, 0.1]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'NB' in list_classifiers:
        print ('\nNB')
        clf =  GaussianNB().fit(X_train_scaled, y_train)
        train_f1 = f1_score(y_train, clf.predict(X_train_scaled))
        print("train set f1= {}".format(train_f1))
        if not y_test is None:
            test_f1 = f1_score(y_test, clf.predict(X_test_scaled))
            print("train set f1= {}".format(test_f1))

            
    if list_classifiers is None or 'GradientBoostingClassifier' in list_classifiers:
        print ('\nGradientBoostingClassifier.')
        clf = GradientBoostingClassifier() # learning_rate = 0.03)       
        grid_values = {'max_depth': [3,5,7]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'MLP' in list_classifiers:
        print ('\nMLP.')
        clf = MLPClassifier(hidden_layer_sizes = [50]) #, 100])
        grid_values = {'alpha' : [0.001, 0.01, 0.1, 1, 10]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'xgboost' in list_classifiers:
        print ('\nxgboost.')
        clf = XGBClassifier().fit(X_train_scaled, y_train)
        y_predicted = clf.predict(X_test_scaled)
        print ('f1_score  = {:.2}'.format(f1_score(y_test, y_predicted)))

In [310]:
list_classifiers= [
    'LogisticRegression',
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'NB',
    'GradientBoostingClassifier', 
#   'MLP', 
    'xgboost', 
    
]
run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test, list_classifiers= list_classifiers)


LogisticRegression.
Running GridSearchCV.


Grid best parameter (max f1 ):  {'C': 100}
Grid best score (f1):  0.7083895170064711
test f1= 0.7534246575342465

DecisionTreeClassifier
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 50}
Grid best score (f1):  0.712069712475673
test f1= 0.7172413793103449

RandomForestClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'n_estimators': 50}
Grid best score (f1):  0.7242111756905032
test f1= 0.762589928057554

NB
train set f1= 0.7145557655954632
train set f1= 0.7320261437908496

GradientBoostingClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 3}
Grid best score (f1):  0.7325192524933138
test f1= 0.7591240875912408

xgboost.
f1_score  = 0.78


### Getting results on predict dataset

In [311]:
from xgboost import XGBClassifier

clf = XGBClassifier().fit(X_train_scaled, y_train)

# Preprocessing predict dataset
df_predict = convert_nan_to_zero(df_predict)

PassengerId = df_predict["PassengerId"]
df_predict = drop_irrelevant_features(df_predict)
df_predict = label_encoder(df_predict)

y_predicted = clf.predict(df_predict)

result_df = pd.DataFrame({'PassengerId': PassengerId, 'Survived': y_predicted})
result_df.to_csv('submission.csv', index=False)

result_df.head(10)

label encoding.


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
