In [359]:
import pandas as pd
import numpy as np

In [360]:
# Locations of the Titanic CSV files: once for the Kaggle runtime and once for
# a checkout that keeps the data in a local "titanic" folder.
kaggle_dir = "/kaggle/input/titanic"
local_dir = "titanic"

train_kaggle = f"{kaggle_dir}/train.csv"
test_kaggle = f"{kaggle_dir}/test.csv"
gender_submission_kaggle = f"{kaggle_dir}/gender_submission.csv"

train_local = f"{local_dir}/train.csv"
test_local = f"{local_dir}/test.csv"
gender_submission_local = f"{local_dir}/gender_submission.csv"

In [361]:
# Load the training set. Prefer the Kaggle location and fall back to the local
# copy so the notebook also runs outside the Kaggle runtime; if neither file
# exists, the error raised refers to the local path.
try:
    train_data = pd.read_csv(train_kaggle)
except FileNotFoundError:
    train_data = pd.read_csv(train_local)
train_data.head()


In [362]:
# Load the test set. Prefer the Kaggle location and fall back to the local
# copy so the notebook also runs outside the Kaggle runtime; if neither file
# exists, the error raised refers to the local path.
try:
    test_data = pd.read_csv(test_kaggle)
except FileNotFoundError:
    test_data = pd.read_csv(test_local)
test_data.head()

Data exploration

In [363]:
import matplotlib.pyplot as plt
import seaborn as sns 

In [364]:
# Apply the seaborn theme before the figure is created: styling is picked up
# when axes are built, so setting it afterwards would only affect later plots.
sns.set(style="darkgrid")

# One count plot per categorical feature, split by survival outcome.
categorical_variables_to_plot = ["Sex","Embarked","Survived","Pclass","SibSp","Parch"]
f, axes = plt.subplots(3,2,figsize=(9,12))
# Use a dedicated loop name so the Figure handle `f` is not overwritten by a
# column name.
for column, ax in zip(categorical_variables_to_plot, axes.ravel()):
    sns.countplot(x=column, data=train_data, hue='Survived', ax=ax)

In [365]:
# Kernel-density plots of the two continuous features, one curve per value of
# "Survived". The options shared by both plots are kept in one place.
kde_by_survival = {"hue": "Survived", "kind": "kde"}
sns.displot(data=train_data, x="Fare", **kde_by_survival)
sns.displot(data=train_data, x="Age", **kde_by_survival)


In [366]:
# Columns removed from both frames before modelling.
dropped_columns = ["Cabin", "PassengerId", "Ticket", "Name"]

# Keep the test-set passenger ids before the column is dropped; they are needed
# again when the submission file is written.
passenger_id = test_data["PassengerId"]

train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)




In [367]:
# One-hot encode with pandas' default column selection, dropping the first
# level of each encoded column. Both frames get the same treatment.
train_data, test_data = (
    pd.get_dummies(frame, drop_first=True)
    for frame in (train_data, test_data)
)



In [368]:
# Expand the "Pclass" column into indicator columns prefixed with "Class_is";
# all levels are kept (no drop_first here).
pclass_encoding = {"columns": ["Pclass"], "prefix": ["Class_is"]}
train_data = pd.get_dummies(train_data, **pclass_encoding)
test_data = pd.get_dummies(test_data, **pclass_encoding)

In [369]:
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute


In [370]:
# Columns passed through the median imputer.
imputed_columns = ["Age", "Fare"]

# The medians are learned from the training frame only and then applied to both
# the training and the test frame.
imputer = impute.SimpleImputer(strategy="median")
imputer.fit(train_data[imputed_columns])
for frame in (train_data, test_data):
    imputed = imputer.transform(frame[imputed_columns])
    frame[imputed_columns] = imputed

In [371]:
from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)



In [372]:
# Columns that are standardised; the indicator columns are left untouched.
normalized_columns = ["Age", "SibSp", "Parch", "Fare"]

# Fit the scaler on the training frame only, then apply the same scaling to
# both frames.
scaler = preprocessing.StandardScaler()
scaler.fit(train_data[normalized_columns])
for frame in (train_data, test_data):
    frame[normalized_columns] = scaler.transform(frame[normalized_columns])


In [373]:
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)

# Separate the label column from the feature columns.
target_column = "Survived"
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]


In [374]:
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import (
    LogisticRegression,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import (
    KNeighborsClassifier,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
)
import xgboost


In [375]:
# Convert to plain NumPy arrays. np.asarray (instead of .to_numpy()) keeps this
# cell re-runnable once y_train / X_train are already arrays, and the float
# cast guarantees a numeric matrix whatever dtype the indicator columns have.
y_train = np.asarray(y_train, dtype=int)
X_train = np.asarray(X_train, dtype=float)

# Fixed seed so the scores printed below are reproducible between runs.
SEED = 42

# Constructor arguments per estimator; estimators not listed use their defaults.
model_kwargs = {
    DecisionTreeClassifier: {"random_state": SEED},
    RandomForestClassifier: {"random_state": SEED},
    xgboost.XGBClassifier: {
        # Kept from the original call — presumably only needed on older
        # xgboost releases; confirm against the installed version.
        "use_label_encoder": False,
        "verbosity": 0,
        "random_state": SEED,
    },
}

# Build the splitter once so every model is scored on exactly the same folds.
# Stratified, shuffled folds keep the class balance similar in each fold.
kfold = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Baseline comparison: 10-fold cross-validated ROC AUC for each candidate.
for model in [
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier,
    xgboost.XGBClassifier
]:
    cls = model(**model_kwargs.get(model, {}))
    s = model_selection.cross_val_score(cls, X_train, y_train, scoring="roc_auc", cv=kfold)
    print(
        f"{model.__name__:22} AUC: "
        f"{s.mean():.3f} STD: {s.std():.2f}"
    )


In [376]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the random-forest hyper-parameters.
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)]
# 'auto' is no longer accepted by RandomForestClassifier in current
# scikit-learn (for classifiers it was an alias of 'sqrt'), so candidates that
# drew it would fail to fit. Use two valid options instead.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(1, 110, num = 10)]
max_depth.append(None)  # None lets trees grow without a depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seed the forest as well as the search: random_state on RandomizedSearchCV
# only fixes which candidates are sampled, not the randomness inside each fit.
rnd_forest = ensemble.RandomForestClassifier(random_state=42)
# 100 sampled candidates, each scored with 3-fold cross-validation on all cores.
rf_random = RandomizedSearchCV(estimator = rnd_forest, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train,y_train)
print(rf_random.best_params_)

In [377]:
 # The search was fitted on a bare array, so features are matched by position.
 # Select the test columns in the training feature order (a missing column
 # raises a KeyError instead of silently shifting features) and pass an array
 # so the fitted and predicted inputs are of the same kind.
 feature_columns = train_data.drop(columns=["Survived"]).columns
 prediction = rf_random.predict(test_data[feature_columns].to_numpy(dtype=float))

In [378]:
# Pair each test-set passenger id with its predicted label and write the result
# to "submission.csv" without the index column. The file name is arbitrary.
submission_columns = {'PassengerId': passenger_id, 'Survived': prediction}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)