<a href="https://www.kaggle.com/code/erayyuztyurk/titanic-machine-learning-from-disaster?scriptVersionId=156316602" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# TITANIC COMPETITION
---

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

pd.set_option("display.expand_frame_repr",False)

# Load Dataset

In [None]:
# Read the Titanic training set into `df`, the frame every later cell works on.
# NOTE(review): the absolute /kaggle/input path presumably only resolves inside a
# Kaggle kernel with the competition data attached - confirm before running elsewhere.
df = pd.read_csv("/kaggle/input/titanic/train.csv")
df.head()

## >> Overview

In [None]:
def dataframe_summary(dataframe, show_value_counts = False):
    """Print a plain-text overview of ``dataframe``.

    The report contains, in order: shape, dtypes, per-column memory usage in MB
    (with a rounded total), missing-value counts and ``describe()`` statistics
    at the 1/5/25/50/75/95/99 percentiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    show_value_counts : bool, default False
        When True, additionally print ``value_counts()`` for every column.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    Sets the pandas display option ``display.colheader_justify`` to ``"left"``
    and leaves it set after returning.
    """
    import pandas as pd
    pd.set_option("display.colheader_justify","left")

    print("-------------------------------------------------------------------------------------------------------------------------------------------------",
          "-------- SHAPE of Dataset -----------------------------------------------------------------------------------------------------------------------",
          dataframe.shape,
          sep="\n")
    print("-------------------------------------------------------------------------------------------------------------------------------------------------",
          "-------- DATA TYPES of Dataset ------------------------------------------------------------------------------------------------------------------",
          dataframe.dtypes,
          sep="\n")

    # Bytes -> megabytes, computed once and reused for the per-column listing and the total.
    memory_mb = dataframe.memory_usage(deep=True) / (1024 * 1024)
    print("-------------------------------------------------------------------------------------------------------------------------------------------------",
          "-------- MEMORY USAGE of Dataset --------------------------------------------------------------------------------------------------------------  ",
          memory_mb,
          sep="\n")
    print("TOTAL >>> ", round(memory_mb.sum(),2), "\n (All are in MB)")

    print("-------------------------------------------------------------------------------------------------------------------------------------------------",
          "-------- MISSING VALUES in Dataset --------------------------------------------------------------------------------------------------------------",
          dataframe.isnull().sum(),
          sep="\n")

    percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
    print("-------------------------------------------------------------------------------------------------------------------------------------------------",
          "-------- DESCRIPTIVE Info about Dataset ---------------------------------------------------------------------------------------------------------",
          dataframe.describe(percentiles).T,
          "-------------------------------------------------------------------------------------------------------------------------------------------------",
          sep="\n")

    if not show_value_counts:
        return

    print("-------- VALUE COUNTS in Dataset ----------------------------------------------------------------------------------------------------------------")
    for col in dataframe.columns:
        print(dataframe[col].value_counts(),
              "-------------------------------------------------------------------------------------------------------------------------------------------------",
              sep="\n")

In [None]:
# Print the overview report for the raw training frame (value counts are off by default).
dataframe_summary(df)

## >> Lowercase All Variable Names and Take an Overview of the Dataset

In [None]:
# Lowercase every column name so later cells can refer to columns in one consistent style.
df.columns = [col.lower() for col in df.columns]
print("-"*33)
print(df.shape)
print("-"*33)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("-"*33)
# Descriptive statistics with extra tail percentiles, transposed to one row per variable.
print(df.describe([0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99]).T)
print("-"*33)
df.head()

# Creating New Features

## >> Define New Features

In [None]:
# 1 when a cabin is recorded for the passenger, 0 when 'cabin' is missing
df["n_has_cabin"] = df["cabin"].notnull().astype(int)

# number of whitespace-separated tokens in the name that start with "Dr"
# (the lambda argument no longer shadows the comprehension variable)
df["n_name_dr"] = df["name"].apply(lambda name: sum(token.startswith("Dr") for token in name.split()))

# title: the run of letters that follows a space and is terminated by a period.
# The pattern is a raw string so `\.` reaches the regex engine unchanged; in a
# normal string literal `\.` is an invalid escape sequence that Python warns about.
df['n_title'] = df["name"].str.extract(r' ([A-Za-z]+)\.', expand=False)

# family size: siblings/spouses + parents/children + the passenger
df["n_family_size"] = df["sibsp"] + df["parch"] + 1

# is alone: "Yes" when the passenger has no relatives aboard
df.loc[((df['sibsp'] + df['parch']) > 0), "n_is_alone"] = "No"
df.loc[((df['sibsp'] + df['parch']) == 0), "n_is_alone"] = "Yes"

# age level; rows with a missing age match none of the masks and stay NaN here
df.loc[(df['age'] < 18), 'n_age_cat'] = 'Young'
df.loc[(df['age'] >= 18) & (df['age'] < 56), 'n_age_cat'] = 'Mature'
df.loc[(df['age'] >= 56), 'n_age_cat'] = 'Senior'

# sex combined with an age band (<= 21, 22-49, >= 50); NaN where age is missing
df.loc[(df['sex'] == 'male') & (df['age'] <= 21), 'n_sex_cat'] = 'Young male'
df.loc[(df['sex'] == 'male') & (df['age'] > 21) & (df['age'] < 50), 'n_sex_cat'] = 'Mature male'
df.loc[(df['sex'] == 'male') & (df['age'] >= 50), 'n_sex_cat'] = 'Senior male'
df.loc[(df['sex'] == 'female') & (df['age'] <= 21), 'n_sex_cat'] = 'Young female'
df.loc[(df['sex'] == 'female') & (df['age'] > 21) & (df['age'] < 50), 'n_sex_cat'] = 'Mature female'
df.loc[(df['sex'] == 'female') & (df['age'] >= 50), 'n_sex_cat'] = 'Senior female'

print("-"*33)
print("New Shape of Dataset:")
print("-"*33)
print(df.shape)
print("-"*33)
df.head()

## >> Define Data Types

In [None]:
# Inspect the dtype of every column, including the newly created n_* features.
df.dtypes

In [None]:
# Column groups used by the exploration, encoding and scaling cells below.
# cat_cols  - treated as categorical (the target 'survived' is listed here too)
# num_cols  - treated as numeric
# car_cols  - identifier-like / free-text columns ("cardinal" in this notebook's wording)
cat_cols = [
    "survived", "pclass", "sex", "sibsp", "parch", "cabin", "embarked",
    "n_name_dr", "n_title", "n_is_alone", "n_age_cat", "n_sex_cat",
]
num_cols = ["age", "fare", "n_family_size"]
car_cols = ["passengerid", "name", "ticket"]
target_label = 'survived'

# One print call; sep="\n" puts every item on its own line.
print(
    "-"*33,
    f"Categorical Variables: {cat_cols}",
    "-"*33,
    f"Numerical Variables: {num_cols}",
    "-"*33,
    f"Cardinal Variables: {car_cols}",
    "-"*33,
    sep="\n",
)

In [None]:
# Print the class frequencies of every categorical column, framed by separator lines.
for col in cat_cols:
    print("-"*33)
    print(df[col].value_counts())
    print("-"*33)

## >> Explore Data

In [None]:
def categoricals_summary(dataframe, column_name, plot=False):
    """Print class counts and percentage shares of one categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Column to summarise.
    plot : bool, default False
        When True, also draw a seaborn count plot of the column and show it.

    Returns
    -------
    None
        The table (and optional plot) are emitted as side effects.
    """
    counts = dataframe[column_name].value_counts()
    # "Ratio" is the share of all rows (missing values included in the denominator), in percent.
    summary = pd.DataFrame({column_name: counts,
                            "Ratio": 100 * (counts / len(dataframe))})
    print(summary)
    print("--------------------------------------------------------------------------------------------------------")

    if not plot:
        return

    sns.countplot(x=dataframe[column_name], data=dataframe)
    plt.show()
    print("--------------------------------------------------------------------------------------------------------")
#--------------------------------------------------------------------------------------------------------        
def numericals_summary(dataframe, column_name, plot = False, plot_bins = 20):
    """Print descriptive statistics of one numeric column at a fixed set of quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Column to summarise.
    plot : bool, default False
        When True, also draw and show a histogram of the column.
    plot_bins : int, default 20
        Number of histogram bins used when ``plot`` is True.

    Returns
    -------
    None
        The statistics (and optional plot) are emitted as side effects.
    """
    column = dataframe[column_name]
    stats = column.describe([0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99])
    print(stats.T)
    print("--------------------------------------------------------------------------------------------------------")

    if not plot:
        return

    column.hist(bins = plot_bins)
    plt.xlabel(column_name)
    plt.title(column_name)
    plt.show()
    print("--------------------------------------------------------------------------------------------------------")

In [None]:
# Counts, percentage shares and a count plot for every categorical column.
for col in cat_cols:
    categoricals_summary(df, column_name = col, plot=True)

In [None]:
# Quantile statistics and a histogram for every numeric column.
for col in num_cols:    
    numericals_summary(df, col, plot=True)

# Check Outliers

In [None]:
def outlier_thresholds(dataframe, column_name, q1 = 0.25, q3 = 0.75, print_info=True):
    """Compute IQR-style outlier limits for one column.

    The limits are ``Q(q1) - 1.5 * IQR`` and ``Q(q3) + 1.5 * IQR`` where
    ``IQR = Q(q3) - Q(q1)`` and ``Q`` is the column quantile.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Column whose quantiles are evaluated.
    q1, q3 : float, default 0.25 / 0.75
        Lower and upper quantile levels.
    print_info : bool, default True
        When True, print the quantile levels, quantile values, IQR and limits.

    Returns
    -------
    tuple
        ``(lower_threshold, upper_threshold)``.
    """
    values = dataframe[column_name]
    q1_num, q3_num = values.quantile(q1), values.quantile(q3)
    iqr = q3_num - q1_num
    whisker = iqr * 1.5
    lower_threshold, upper_threshold = q1_num - whisker, q3_num + whisker

    if print_info:
        print("-----------------------------------------------------------------------------------------------------------",
              f"for {column_name}:",
              "-----------------------------------------------------------------------------------------------------------",
              f"Q1 is {q1}",
              f"Q3 is {q3}",
              f"Q1 threshold is {q1_num}",
              f"Q3 threshold is {q3_num}",
              f"IQR is {iqr}",
              f"Lower threshold is {lower_threshold}",
              f"Upper Threshold is {upper_threshold}",
              "-----------------------------------------------------------------------------------------------------------",
              sep="\n")

    return lower_threshold, upper_threshold
#----------------------------------------------------------------------------------------------------------------------
def check_if_outlier_exists(dataframe, column_name, q1 = 0.25, q3 = 0.75, print_details=True):
    """Report whether a column has values outside its IQR-based limits.

    The limits come from ``outlier_thresholds(dataframe, column_name, q1, q3)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Column to check.
    q1, q3 : float, default 0.25 / 0.75
        Quantile levels forwarded to ``outlier_thresholds``.
    print_details : bool, default True
        Forwarded as ``print_info``; it only controls the threshold report.
        The outlier / no-outlier message below is always printed.

    Returns
    -------
    bool
        True when at least one value lies below the lower or above the upper limit.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, column_name, q1, q3, print_details)

    values = dataframe[column_name]
    n_below = len(dataframe[values < low_limit])
    n_above = len(dataframe[values > up_limit])
    n_rows = len(dataframe)

    # Both outcomes open with the same separator line.
    print("-----------------------------------------------------------------------------------------------------------")

    if n_below + n_above == 0:
        print(f"{column_name} : No Outlier based on Q1={q1} and Q3={q3}")
        print("-------------------------------------------------------------------------------------------------------")
        return False

    print(f"{column_name} : Outliers exist based on Q1={q1} and Q3={q3}")
    print("-----------------------------------------------------------------------------------------------------------")
    print(f"Under lower bound: {n_below} observation of {n_rows}")
    print(f"Above upper bound: {n_above} observation of {n_rows}")
    print("-------------------------------------------------------------------------------------------------------")
    return True
#----------------------------------------------------------------------------------------------------------------------

In [None]:
# Check every numeric column against limits built from the 5% / 95% quantiles
# and print the column name next to the boolean result.
for col in num_cols:
    print(col, check_if_outlier_exists(df, col, 0.05, 0.95))

## >> Winsorise thresholds

In [None]:
def winsorize_with_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Cap one column, in place, at its IQR-based outlier limits.

    Values below the lower limit are replaced by the lower limit and values above
    the upper limit by the upper limit. The limits come from
    ``outlier_thresholds(dataframe, col_name, q1, q3)``, which also prints its
    threshold report because ``print_info`` is left at its default.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    col_name : str
        Column to cap.
    q1, q3 : float, default 0.25 / 0.75
        Quantile levels forwarded to ``outlier_thresholds``.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, q1, q3)

    too_low = dataframe[col_name] < low_limit
    dataframe.loc[too_low, col_name] = low_limit

    # Evaluated after the lower cap has been written, as in the sequential original.
    too_high = dataframe[col_name] > up_limit
    dataframe.loc[too_high, col_name] = up_limit
#----------------------------------------------------------------------------------------------------------------------

In [None]:
# Cap every numeric column at limits built from its 5% / 95% quantiles (mutates df).
for col in num_cols:
    winsorize_with_thresholds(df, col, 0.05, 0.95)

# Re-run the outlier check on the capped columns and print each result.
for col in num_cols:
    print(col, check_if_outlier_exists(df, col, 0.05, 0.95))

# Check for Missing Values

In [None]:
# Number of missing values per column.
df.isnull().sum()

## >> Dropping the 'cabin' variable as it is mostly empty (77% empty), together with the high-cardinality variables 'ticket' and 'name' ('passengerid' is kept)

In [None]:
# drop columns
list_to_drop = ['ticket','name','cabin']
df.drop(labels=list_to_drop, axis=1, inplace=True)

# reassign categorical variables
cat_cols = ["pclass", "sex","sibsp","parch","embarked","n_name_dr","n_title","n_is_alone","n_age_cat","n_sex_cat"] 
car_cols = ["passengerid"]
df.head()

## >> Fill 'age' with the median age of each 'n_title' group, and fill any remaining gaps with the mean age of each 'sex'

In [None]:
# fill 'age' according to the classes of 't_title'
df["age"].fillna(df.groupby('n_title')["age"].transform("median"), inplace=True)
#df["age"].fillna(df.groupby('sex')["age"].transform("mean"), inplace=True)
df.isnull().sum()

## >> Convert 'age' to int from float

In [None]:
# Cast 'age' to integer. astype(int) drops the fractional part of every value and
# raises if a missing value is still present, so the column must be fully imputed first.
df["age"] = df["age"].astype(int)
df["age"].dtype

## >> Fill 'age' related variables accordingly

In [None]:
# Recompute the derived features now that 'age' has been imputed and cast to int,
# so the rows whose age was missing receive a category as well.

# is alone: "Yes" when the passenger has no siblings/spouses/parents/children aboard
df.loc[df['sibsp'].add(df['parch']).gt(0), "n_is_alone"] = "No"
df.loc[df['sibsp'].add(df['parch']).eq(0), "n_is_alone"] = "Yes"

# age level: < 18, 18-55, >= 56
df.loc[df['age'].lt(18), 'n_age_cat'] = 'Young'
df.loc[df['age'].ge(18) & df['age'].lt(56), 'n_age_cat'] = 'Mature'
df.loc[df['age'].ge(56), 'n_age_cat'] = 'Senior'

# sex according to age: <= 21, 22-49, >= 50
df.loc[df['sex'].eq('male') & df['age'].le(21), 'n_sex_cat'] = 'Young male'
df.loc[df['sex'].eq('male') & df['age'].gt(21) & df['age'].lt(50), 'n_sex_cat'] = 'Mature male'
df.loc[df['sex'].eq('male') & df['age'].ge(50), 'n_sex_cat'] = 'Senior male'
df.loc[df['sex'].eq('female') & df['age'].le(21), 'n_sex_cat'] = 'Young female'
df.loc[df['sex'].eq('female') & df['age'].gt(21) & df['age'].lt(50), 'n_sex_cat'] = 'Mature female'
df.loc[df['sex'].eq('female') & df['age'].ge(50), 'n_sex_cat'] = 'Senior female'

print(
    "-"*33,
    "New Shape of Dataset:",
    "-"*33,
    df.shape,
    "-"*33,
    df.isnull().sum(),
    "-"*33,
    sep="\n",
)
df.head()

## >> Fill missing 'embarked' values with the most frequent port (the mode of the column)

In [None]:
# Fill missing 'embarked' with the most frequent value of the column (mode()[0]).
# Assigned back instead of fillna(inplace=True) on the df["embarked"] selection,
# which is a chained in-place call that does not update df under Copy-on-Write.
df["embarked"] = df["embarked"].fillna(df['embarked'].mode()[0])
df.isnull().sum()

# Label Encoding

## >> Find Binary Variables

In [None]:
# find binary variables
binary_cols = [col for col in df.columns if col in cat_cols and df[col].nunique() == 2]

print("-"*33)
print(f"Binary Variables: {binary_cols}")
print("-"*33)
df.head()

## >> Binary Label Encoding

In [None]:
# Replace the values of every two-class column by integer codes.
# The same LabelEncoder instance is re-fitted for each column, so after the loop
# `le` only holds the mapping of the last column in binary_cols.
le = LabelEncoder()

for col in binary_cols:
    df[col] =le.fit_transform(df[col])
    
df.head()

## >> Find categorical variables with multiple classes

In [None]:
# Collect, in df's column order, the categorical columns with more than two distinct values.
one_hot_cols = []
for col in df.columns:
    if col in cat_cols and df[col].nunique() > 2:
        one_hot_cols.append(col)

print(f"Categorical variables with multiple classes: {one_hot_cols}")

## >> One-Hot Label Encoding

In [None]:
# One-hot encode the multi-class columns; drop_first=True omits the first level of
# each column so the remaining indicator columns are not redundant.
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)
df.head()

# Scaling

## >> Apply StandardScaler to numerical values

In [None]:
# Standardise the numeric columns in place; `scaler` keeps the statistics fitted
# on the training frame.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

df.head()

# Modelling

## >> Prepare Test Data

In [None]:
def _add_family_and_age_features(df):
    """Add 'n_is_alone', 'n_age_cat' and 'n_sex_cat' to ``df`` in place and return it."""
    relatives = df['sibsp'] + df['parch']
    df.loc[relatives > 0, "n_is_alone"] = "No"
    df.loc[relatives == 0, "n_is_alone"] = "Yes"

    # age level: < 18, 18-55, >= 56
    df.loc[(df['age'] < 18), 'n_age_cat'] = 'Young'
    df.loc[(df['age'] >= 18) & (df['age'] < 56), 'n_age_cat'] = 'Mature'
    df.loc[(df['age'] >= 56), 'n_age_cat'] = 'Senior'

    # sex according to age: <= 21, 22-49, >= 50
    for sex in ('male', 'female'):
        is_sex = df['sex'] == sex
        df.loc[is_sex & (df['age'] <= 21), 'n_sex_cat'] = f'Young {sex}'
        df.loc[is_sex & (df['age'] > 21) & (df['age'] < 50), 'n_sex_cat'] = f'Mature {sex}'
        df.loc[is_sex & (df['age'] >= 50), 'n_sex_cat'] = f'Senior {sex}'
    return df


def prepare_test_data(path="/kaggle/input/titanic/test.csv"):
    """Load the test set and run it through the same steps as the training frame.

    Steps: lowercase column names, create the n_* features, cap the numeric columns
    with ``winsorize_with_thresholds`` (5% / 95% quantiles), drop 'ticket', 'name'
    and 'cabin', impute 'age' / 'fare' / 'embarked', label-encode two-class columns,
    one-hot encode multi-class columns and standardise the numeric columns.

    Parameters
    ----------
    path : str, default "/kaggle/input/titanic/test.csv"
        Location of the test CSV. The default keeps the original no-argument call working.

    Returns
    -------
    pandas.DataFrame
        The transformed test frame ('passengerid' is kept as a column).

    Notes
    -----
    NOTE(review): the outlier limits, the LabelEncoder and the StandardScaler are all
    fitted on the test frame itself rather than reused from the training cells, so
    the two frames are not guaranteed to share the same scale or codes - confirm
    whether that is intended.
    """
    df = pd.read_csv(path)
    df.columns = [col.lower() for col in df.columns]

    # 1 when a cabin is recorded, 0 otherwise
    df["n_has_cabin"] = df["cabin"].notnull().astype(int)
    # number of name tokens that start with "Dr"
    df["n_name_dr"] = df["name"].apply(lambda name: sum(token.startswith("Dr") for token in name.split()))
    # title: letters after a space, terminated by a period (raw string: `\.` is not a valid str escape)
    df['n_title'] = df["name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # family size
    df["n_family_size"] = df["sibsp"] + df["parch"] + 1

    num_cols = ["age","fare","n_family_size"]
    cat_cols = ["pclass", "sex","sibsp","parch","embarked","n_name_dr","n_title","n_is_alone","n_age_cat","n_sex_cat"]

    # Cap numeric columns before imputation, as the training cells do.
    for col in num_cols:
        winsorize_with_thresholds(df, col, 0.05, 0.95)

    df = df.drop(columns=['ticket','name','cabin'])

    # Impute 'age': median per title first, then mean per sex for title groups without
    # any known age. Results are assigned back; fillna(inplace=True) on a column
    # selection does not update the frame under pandas Copy-on-Write.
    df["age"] = df["age"].fillna(df.groupby('n_title')["age"].transform("median"))
    df["age"] = df["age"].fillna(df.groupby('sex')["age"].transform("mean"))
    df["age"] = df["age"].astype(int)

    # The age-derived categories are built once, after imputation: every row then has
    # an age, so this single pass yields the values the former second pass produced.
    df = _add_family_and_age_features(df)

    df["fare"] = df["fare"].fillna(df.groupby('pclass')["fare"].transform("mean"))
    df["embarked"] = df["embarked"].fillna(df['embarked'].mode()[0])

    # Two-class categorical columns -> integer codes.
    binary_cols = [col for col in df.columns if col in cat_cols and df[col].nunique() == 2]
    le = LabelEncoder()
    for col in binary_cols:
        df[col] = le.fit_transform(df[col])

    # Multi-class categorical columns -> indicator columns (first level dropped).
    one_hot_cols = [col for col in df.columns if col in cat_cols and df[col].nunique() > 2]
    df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df

In [None]:
# Build the transformed test frame and preview it.
df_test = prepare_test_data()
df_test.head()

## >> Keep only the columns that exist in both the train and test datasets, as feature extraction does not produce the same variables in both

In [None]:
# find columns in both train and test datasets
df_train_cols = {col for col in df.columns if col != 'survived'}
df_test_cols = {col for col in df_test.columns}

diff_cols = df_train_cols.difference(df_test_cols)

diff_cols = list(diff_cols)
diff_cols

In [None]:
# add missing columns in test dataset and assign 0 as values to all
for col in diff_cols:
    df_test[col] = 0

# reorder columns
train_cols = [col for col in df.columns if col != target_label]
df_test = df_test[train_cols]

## >> Prepare Train Data for models

In [None]:
# split data into depentdent and independent variables
X = df.drop(target_label,axis=1)
y = df[target_label]

# K-Nearest Neighbors Model

## >> Train Model

In [None]:
# train model
knn = KNeighborsClassifier()

knn_params = {"n_neighbors": range(2,50)}

knn_grid = GridSearchCV(knn,
                        knn_params,
                        cv=5,
                        n_jobs=-1,
                        verbose=True).fit(X, y)

knn_grid.best_params_

## >> Hyperparameters

In [None]:
# model with hyperparameters
knn_hp = knn.set_params(**knn_grid.best_params_).fit(X,y)

cv_output = cross_validate(knn_hp,
                           X,
                           y,
                           cv=10,
                           scoring=["accuracy","f1","roc_auc"])
print("-"*33)
print(f"Test Accuracy: {cv_output['test_accuracy'].mean():.4f}")
print("-"*33)
print(f"Test F1: {cv_output['test_f1'].mean():.4f}")
print("-"*33)
print(f"Test ROC AUC: {cv_output['test_roc_auc'].mean():.4f}")
print("-"*33)

## >> Get Prediction for Test Data

In [None]:
# Predict the test set with the tuned KNN model.
y_pred = knn_hp.predict(df_test)

# Storing the predictions adds a 'survived' column to df_test; it has to be removed
# again before df_test is passed to another model's predict().
df_test["survived"] = y_pred

df_test.head()

# Random Forest Classifier

## >> Train Model and Find Accuracy

In [None]:
# Hold out 30% of the training data for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

# Seed the forest as well: without random_state the bootstrap samples and feature
# subsets differ on every run, so the printed accuracy could not be reproduced.
rf_model = RandomForestClassifier(random_state=17).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

print("-"*33)
# accuracy_score expects (y_true, y_pred); the value is unchanged, the order now matches the API.
print(f"Model Accuracy is: {accuracy_score(y_test, y_pred):.4f}")
print("-"*33)

## >> Find Hyperparameters

In [None]:
# Same seeded 70/30 split as the baseline cell. The unseeded split produced a
# different hold-out set on every run, which made the tuned accuracy irreproducible
# and not comparable with the baseline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

# Seeded forest so the grid search ranks parameter sets on identical randomness.
rf = RandomForestClassifier(random_state=17)
rf_params = {"n_estimators":[10,50,100,150,200],
            "max_depth":[None,10,20,30,50]}

# 10-fold cross-validated accuracy over the 5 x 5 parameter grid, on the training part only.
rf_grid = GridSearchCV(rf,
                      rf_params,
                      cv=10,
                      n_jobs=-1,
                      verbose=True,
                      scoring="accuracy").fit(X_train,y_train)

rf_grid.best_params_

## >> Get Accuracy Score

In [None]:
# Apply the best grid-search parameters to the existing forest and refit it on the
# training part; set_params() and fit() return the estimator, so rf_hp is rf.
rf.set_params(**rf_grid.best_params_)
rf_hp = rf.fit(X_train,y_train)

y_pred = rf_hp.predict(X_test)

# Hold-out accuracy of the tuned forest (accuracy is symmetric in its two arguments).
print("-"*33, f"Model Accuracy is: {accuracy_score(y_test, y_pred):.4f}", "-"*33, sep="\n")

## >> Get Prediction for Test Data

In [None]:
#drop target label first as assign in the previous model
df_test.drop(target_label,axis=1,inplace=True)

y_pred = rf_hp.predict(df_test)

df_test["survived"] = y_pred

df_test.head()

### FINAL: Random Forest Classifier Model >> Accuracy is: 83.21%

In [None]:
# Left-join the test passenger ids with the ('passengerid', 'survived') pairs of the
# training frame: every test id is kept, and 'survived' is filled only where the same
# id also occurs in the training data.
# NOTE(review): train and test ids are presumably disjoint, in which case 'survived'
# is NaN for every row and this frame carries no predictions - verify.
df_train_csv = df[["passengerid","survived"]]
df_test_csv = pd.DataFrame(df_test["passengerid"])

new_df = df_test_csv.merge(df_train_csv, on="passengerid", how="left")
#new_df = df_test_csv.merge(df_train_csv, on="passengerid", how='outer')

new_df.head()

In [None]:
# Raw test file, kept under its original name (not used by the model below).
dff_test = pd.read_csv("/kaggle/input/titanic/test.csv")

import lightgbm as lgb

# Train / validation sets built from the split made in the random-forest section.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

parameters = {
   'boosting_type': 'gbdt',
   'objective': 'binary',
   'metric': 'auc',
   'num_leaves': 31,
   'learning_rate': 0.05,
   'feature_fraction': 0.9,
   'bagging_fraction': 0.8,
   'bagging_freq': 5,
   'verbose': 0}

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword of
# lgb.train() was removed in LightGBM 4, whereas the callback form works in both
# 3.x and 4.x.
model = lgb.train(parameters,
                  lgb_train,
                  num_boost_round=20,
                  valid_sets=[lgb_eval],
                  callbacks=[lgb.early_stopping(stopping_rounds=5)])

# Remove the 'survived' column stored by the previous model, if it is there;
# errors="ignore" keeps the cell re-runnable.
df_test = df_test.drop(columns=[target_label], errors="ignore")

# With the 'binary' objective predict() returns the probability of class 1, not a
# label, so the probabilities are thresholded at 0.5 to obtain 0/1 predictions.
y_proba = model.predict(df_test, num_iteration=model.best_iteration)
y_pred = (y_proba >= 0.5).astype(int)

print(len(y_pred))

In [None]:
# Build the two-column submission frame explicitly. The former code selected a single
# column (a Series) and then assigned to dff["survived"] and dff.columns, neither of
# which creates a second column on a Series.
dff = pd.DataFrame({
    "PassengerId": df_test["passengerid"].to_numpy(),
    # y_pred may hold 0/1 labels or class-1 probabilities; thresholding at 0.5 leaves
    # labels unchanged and turns probabilities into labels.
    "Survived": (np.asarray(y_pred) >= 0.5).astype(int),
})

# index=False: the file should contain only the PassengerId and Survived columns.
dff.to_csv("titanic.csv", index=False)
dff