In [19]:
from pathlib import Path

import numpy as np
import pandas as pd

In [20]:
# Dataset locations: the first group points at the Kaggle input directory,
# the second at a copy stored relative to the working directory.
train_kaggle, test_kaggle, gender_submission_kaggle = (
    "/kaggle/input/titanic/train.csv",
    "/kaggle/input/titanic/test.csv",
    "/kaggle/input/titanic/gender_submission.csv",
)
train_local, test_local, gender_submission_local = (
    "titanic/train.csv",
    "titanic/test.csv",
    "titanic/gender_submission.csv",
)

In [21]:
# Load the labelled training set. The Kaggle input path is preferred; when that
# file is absent (notebook run outside Kaggle) the local copy is read instead,
# so the cell no longer fails with FileNotFoundError off-platform.
train_path = train_kaggle if Path(train_kaggle).exists() else train_local
train_data = pd.read_csv(train_path)
train_data.head()


In [22]:
# Load the unlabelled test set. The Kaggle input path is preferred; when that
# file is absent (notebook run outside Kaggle) the local copy is read instead,
# so the cell no longer fails with FileNotFoundError off-platform.
test_path = test_kaggle if Path(test_kaggle).exists() else test_local
test_data = pd.read_csv(test_path)
test_data.head()

In [23]:
# test_kaggle_new = "/kaggle/input/testtt/test.csv"
# new_test_data =  pd.read_csv(test_kaggle_new)

Data exploration

In [24]:
import matplotlib.pyplot as plt
import seaborn as sns 

In [25]:
# Count plots for each categorical feature, with bars split by survival outcome.
categorical_variables_to_plot = ["Sex","Embarked","Survived","Pclass","SibSp","Parch"]
# Apply the style before the axes are created: most style parameters are read
# at axes-creation time, so setting them after plotting would miss this figure.
sns.set(style="darkgrid")
fig, axes = plt.subplots(3, 2, figsize=(9, 12))
# The loop variable gets its own name so the Figure handle is not overwritten
# by a column name.
for column, ax in zip(categorical_variables_to_plot, axes.ravel()):
    # Pass the column by name; seaborn looks it up in `data`.
    sns.countplot(x=column, data=train_data, hue='Survived', ax=ax)

In [26]:
# Kernel-density plots of the two continuous features, one curve per
# survival outcome.
kde_options = dict(hue='Survived', kind='kde')
sns.displot(train_data, x="Fare", **kde_options)
sns.displot(train_data, x="Age", **kde_options)


In [27]:
# Keep the test-set passenger ids (needed for the submission file) before the
# column is removed from the feature frames.
passenger_id = test_data["PassengerId"]
# Columns that are not used as model features.
unused_columns = ["Cabin", "PassengerId", "Ticket", "Name"]
train_data = train_data.drop(unused_columns, axis=1)
test_data = test_data.drop(unused_columns, axis=1)



In [28]:
# One-hot encode the columns pandas treats as categorical (object/category
# dtype) in both frames; drop_first=True omits the first level of each.
train_data, test_data = (
    pd.get_dummies(frame, drop_first=True) for frame in (train_data, test_data)
)



In [29]:
# Expand Pclass into indicator columns named with the "Class_is" prefix.
# drop_first is not set here, so every class value keeps its own column.
pclass_encoding = {"columns": ["Pclass"], "prefix": ["Class_is"]}
train_data = pd.get_dummies(train_data, **pclass_encoding)
test_data = pd.get_dummies(test_data, **pclass_encoding)


In [30]:
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute


In [31]:
# Fill missing Age/Fare values with medians learned from the training frame;
# the test frame is transformed with those same training medians.
age_fare = ["Age", "Fare"]
imputer = impute.SimpleImputer(strategy="median")
train_data[age_fare] = imputer.fit_transform(train_data[age_fare])
test_data[age_fare] = imputer.transform(test_data[age_fare])


In [32]:
from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)



In [33]:
# Standardise the numeric features. The scaler statistics come from the
# training frame only and are then applied to both frames.
normalized_columns = ["Age","SibSp","Parch","Fare"]
scaler = preprocessing.StandardScaler().fit(train_data[normalized_columns])
for frame in (train_data, test_data):
    frame[normalized_columns] = scaler.transform(frame[normalized_columns])


In [34]:
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)

# Separate the label column from the feature matrix.
target_column = "Survived"
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]


In [35]:
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import (
    LogisticRegression,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import (
    KNeighborsClassifier,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor
)
import xgboost


In [36]:
# Compare untuned models using 10-fold cross-validated ROC AUC.
#
# np.asarray accepts pandas objects and ndarrays alike, so this cell can be
# re-run; calling .to_numpy() on an already-converted ndarray raises
# AttributeError.
y_train = np.asarray(y_train, dtype=int)
X_train = np.asarray(X_train)

# The splitter does not depend on the model, so build it once instead of on
# every loop iteration.
kfold = model_selection.KFold(n_splits=10)

# NOTE(review): RandomForestRegressor is a regressor in an otherwise
# all-classifier list - confirm it is meant to be scored with "roc_auc" here.
for model in [
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier,
    RandomForestRegressor,
    xgboost.XGBClassifier
]:
    # XGBoost is the only candidate constructed with non-default arguments.
    if model is xgboost.XGBClassifier:
        cls = model(use_label_encoder=False, verbosity=0)
    else:
        cls = model()
    s = model_selection.cross_val_score(cls, X_train, y_train, scoring="roc_auc", cv=kfold)
    print(
        f"{model.__name__:22} AUC: "
        f"{s.mean():.3f} STD: {s.std():.2f}"
    )


In [None]:

from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the random forest. GridSearchCV evaluates every
# combination: 1 * 4 * 2 * 3 * 3 * 4 = 288 candidates, each fitted on 3 folds.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Base model. The seed is fixed because an unseeded forest makes best_params_
# (and every score derived from the tuned model) vary between runs.
rf = RandomForestClassifier(random_state=42)
# No `scoring` is passed, so candidates are ranked by the estimator's default
# score (mean accuracy for classifiers), not the ROC AUC reported elsewhere.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Score the tuned forest with 10-fold cross-validated ROC AUC and report the
# mean and standard deviation across folds.
kfold = model_selection.KFold(n_splits=10)
best_grid = grid_search.best_estimator_
s = model_selection.cross_val_score(
    estimator=best_grid,
    X=X_train,
    y=y_train,
    cv=kfold,
    scoring="roc_auc",
)
mean_auc, std_auc = s.mean(), s.std()
print(f" AUC: {mean_auc:.3f} STD: {std_auc:.2f}")

In [None]:

# y_new_test = new_test_data["Survived"]
# X_new_test = new_test_data.drop(columns = ["Survived"])
# y_new_test = y_new_test.astype(int)
# s = model_selection.cross_val_score(best_grid, X_new_test, y_new_test, scoring="roc_auc", cv=kfold)
# print(
#     f" AUC: "
#     f"{s.mean():.3f} STD: {s.std():.2f}")

In [None]:
 prediction = best_grid.predict(test_data)

In [None]:
# Pair each test passenger id with its predicted label and write the result
# to disk; the output filename is arbitrary.
submission_columns = {'PassengerId': passenger_id, 'Survived': prediction}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)