In [164]:
# Data analysis and wrangling
import os
import numpy as np
import pandas as pd
import scipy.stats as st
from datetime import datetime

# Visualization
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
import seaborn as sns

# Machine learning
from sklearn import tree, metrics, svm
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [165]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [166]:
# Load the dataset

INPUT_DIR = "/kaggle/input/titanic-machine-learning-from-disaster"

# Raise the display limits so frames with up to 100 rows/columns are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

train = pd.read_csv(INPUT_DIR + "/train.csv")
test = pd.read_csv(INPUT_DIR + "/test.csv")
# Separate read of the same test file; its PassengerId column is used later
# when the submission frame is built.
test_f = pd.read_csv(INPUT_DIR + "/test.csv")

train

In [167]:
## Exploratory analysis ##

# Column overview of both frames, separated by a 40-character rule.
separator = '_' * 40

train.info()
print(separator)
test.info()

In [168]:
# Summary statistics of the numerical features, rounded to two decimals.

numeric_summary = train.describe()
numeric_summary.round(2)

In [169]:
# Summary of the categorical features, i.e. the object-dtype columns.

train.describe(include=[object])

In [170]:
# Total number of elements in the training frame.
train.size

In [171]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [172]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

In [173]:
# Correlation graph

plt.style.use('fivethirtyeight')

sns.set(font_scale=1.1)

# Correlate numeric columns only: the frame still holds string columns at this
# point, and pandas >= 2.0 raises on them instead of dropping them silently.
correlation_train = train.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle and the diagonal (the matrix is
# symmetric). It is built from the shape alone, so a correlation of exactly 0
# cannot leave a cell unmasked.
mask = np.triu(np.ones(correlation_train.shape, dtype=bool))

plt.figure(figsize=(10, 10))
sns.heatmap(correlation_train,
            annot=True,
            fmt='.1f',
            cmap='coolwarm',
            square=True,
            mask=mask,
            linewidths=1,
            cbar=False)

plt.show()

In [174]:


# Point plot of Survived against Pclass, split by Sex, one facet row per
# port of embarkation.
grid = sns.FacetGrid(train, row='Embarked', height=2.2, aspect=1.6)
# `order` and `hue_order` are fixed explicitly: FacetGrid.map calls the plotting
# function once per facet on that facet's subset, so without them each facet can
# order the categories (and assign the hue colours) differently.
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep',
         order=[1, 2, 3], hue_order=['male', 'female'])
grid.add_legend()

In [175]:
# Distribution of the FARE price and its relationship with the survival rate

figure = plt.figure(figsize=(18, 7))

# Fares of the passengers who survived and of those who did not.
fare_survived = train.loc[train['Survived'] == 1, 'Fare']
fare_dead = train.loc[train['Survived'] == 0, 'Fare']

plt.hist([fare_survived, fare_dead],
         bins=50,
         stacked=True,
         color=['g', 'r'],
         label=['Survived', 'Dead'])
plt.xlabel('Fare')
plt.ylabel('Number of passengers')
plt.legend();

In [176]:
# Keep the target variable of the training set aside
y = train['Survived'].reset_index(drop=True)

# Stack train and test into one frame (fresh 0..n-1 index) so that every
# transformation only has to be written once
dataset = pd.concat([train, test], ignore_index=True)

In [177]:
# Remove the feature that is not useful ("Ticket")

dataset = dataset.drop(columns=["Ticket"])

In [178]:
## Treatment of NaN ##

# Analysis of the EMBARKED feature: passenger-class counts for each port

for port in ('Q', 'C', 'S'):
    print(dataset.loc[dataset.Embarked == port, 'Pclass'].value_counts())

In [179]:
# Replace three isolated null values, addressed by row label:
# Embarked for rows 61 and 829, Fare for row 1043

dataset.loc[[61, 829], 'Embarked'] = 'C'
dataset.loc[1043, 'Fare'] = 13.5

In [180]:
# Fill in the AGE in the NaN

print(dataset.groupby(['Sex', 'Pclass'])['Age'].agg(['mean', 'median']).round(1))

# Replacement age for each (Sex, Pclass) group. The class is looked up as an
# integer: a chain of `(sex == ...) & pclass == "1"` tests never matches, both
# because `&` binds tighter than `==` and because the class is compared with a
# string, which would send every missing age to the fallback value.
AGE_BY_SEX_PCLASS = {
    ("female", 1): 36.5,
    ("female", 2): 27.6,
    ("female", 3): 24.5,
    ("male", 1): 39.3,
    ("male", 2): 30.7,
}
# Used for every combination not listed above (i.e. male, 3rd class).
DEFAULT_AGE = 27.1

missing_age = dataset["Age"].isna()
if missing_age.any():
    dataset.loc[missing_age, "Age"] = [
        AGE_BY_SEX_PCLASS.get((sex_value, int(pclass_value)), DEFAULT_AGE)
        for sex_value, pclass_value in zip(dataset.loc[missing_age, "Sex"],
                                           dataset.loc[missing_age, "Pclass"])
    ]

In [181]:
## create new features ##

# FAMILY

# Size of the travelling party: siblings/spouses + parents/children + the passenger.
family_size = dataset['SibSp'] + dataset['Parch'] + 1

# Three 0/1 indicator columns derived from the party size.
dataset['Alone'] = (family_size == 1).astype('int64')
dataset['SmallFamily'] = family_size.between(2, 3).astype('int64')
dataset['LargeFamily'] = (family_size >= 4).astype('int64')

# DECK
# The deck is the first character of the cabin code; rows without a cabin get "N"
dataset["Deck"] = dataset["Cabin"].str[:1].fillna("N")
# CABIN is not needed once the deck has been extracted
dataset = dataset.drop(columns=["Cabin"])
print(dataset)

# TITLE

def extraer_tratamiento(name):
    """Return the title found in a passenger name.

    The title is the last whitespace-separated word before the first "."
    in ``name``.

    Parameters
    ----------
    name : str
        Passenger name as stored in the ``Name`` column.

    Returns
    -------
    str
        The title, or ``""`` when ``name`` is not a string, contains no ".",
        or has no word before the first ".".
    """
    if not isinstance(name, str) or "." not in name:
        return ""
    izda, _dcha = name.split(".", 1)
    palabras = izda.split()
    return palabras[-1] if palabras else ""

# Assign the result of `apply` directly: it shares the frame's index, so every
# title stays on its own row even when some name yields no title. (Collecting
# titles in a side list and joining by position would shift all later rows as
# soon as one name had no ".".)
dataset["Titulo"] = dataset["Name"].apply(extraer_tratamiento)

# We delete the names because they are no longer needed
dataset = dataset.drop(columns=["Name"])
print(dataset.groupby(['Titulo'])['Titulo'].agg(["count"]))

In [182]:
## Transformation of "categorical" variables to numeric (SEX, EMBARKED, DECK, TITLE) ##

# Integer code assigned to each category value.
sex = {"male": 1, "female": 0}
embarked = {"S": 0, "C": 1, "Q": 2}
deck = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6, "N": 7, "T": 8}
titulo = {
    "Capt": 0, "Col": 1, "Countess": 2, "Don": 3, "Dr": 4, "Jonkheer": 5,
    "Lady": 6, "Major": 7, "Master": 8, "Miss": 9, "Mlle": 10, "Mme": 11,
    "Mr": 12, "Mrs": 13, "Ms": 14, "Rev": 15, "Sir": 16,
}

# Replace each column by its codes; values missing from a mapping become <NA>,
# which the nullable 'Int64' dtype can hold.
for column, codes in (('Sex', sex), ('Embarked', embarked),
                      ('Deck', deck), ('Titulo', titulo)):
    dataset[column] = dataset[column].map(codes).astype('Int64')

# Titles without a code are treated as "Mr" (code 12), the most common title
dataset["Titulo"] = dataset["Titulo"].fillna(12)

dataset = pd.get_dummies(data=dataset)
print(dataset)

In [183]:
# We separate the dataset again: the first len(y) rows are the training set,
# the remaining rows are the test set.

n_train = len(y)

# "PassengerId" adds nothing as a feature and "Survived" is the target, so both
# are removed from the feature matrices. `drop` returns new frames here; dropping
# in place on `iloc` slices of `dataset` triggers SettingWithCopyWarning.
non_feature_columns = ["PassengerId", "Survived"]

train = dataset.iloc[:n_train].drop(columns=non_feature_columns)
y_train = y

test = dataset.iloc[n_train:].drop(columns=non_feature_columns)

print(train)

In [184]:
## Modeling ##

#------------------------------------------
# Algorithm: Decision Tree
# Build the tree (depth capped at 10, fixed seed) and fit it on the training set
clf = tree.DecisionTreeClassifier(random_state=42, max_depth=10).fit(train, y_train)
# Predicted labels for the test set
y_pred1 = clf.predict(test)

#------------------------------------------
# Algorithm: Support Vector Machine
# The estimator is bound to `svc`, not `svm`: rebinding `svm` would shadow the
# imported `sklearn.svm` module and make this cell fail when run a second time.
svc = svm.SVC(kernel="poly")
# Train the classifier
svc.fit(train, y_train)
# Predicted labels for the test set
y_pred2 = svc.predict(test)

#------------------------------------------
# Algorithm: Linear Discriminant Analysis
# `store_covariance` must be a real boolean: the string "False" is truthy and is
# rejected by scikit-learn's parameter validation in recent versions.
lda = LDA(n_components=1, solver="eigen", shrinkage="auto", store_covariance=False)
# Project both sets onto the single discriminant component.
# NOTE(review): X_train / X_test are not used by the classifier below, which is
# fitted on the untransformed features — confirm whether that is intended.
X_train = lda.fit_transform(train, y_train)
X_test = lda.transform(test)
classifier = LDA()
# Train the classifier
classifier.fit(train, y_train)
# Predicted labels for the test set
y_pred3 = classifier.predict(test)

#------------------------------------------
# Algorithm: Random forest
# Fixed seeds on the forest and on the search make the sampled candidates, the
# fitted trees and therefore the submission reproducible between runs.
rfc = RandomForestClassifier(random_state=42)

# Search space for the randomized search. Lists are sampled uniformly;
# `max_leaf_nodes` is drawn from the integers 6..9.
# 'auto' is not listed for `max_features`: for classifiers it was an alias of
# 'sqrt' and was removed in scikit-learn 1.3, where it makes the fits fail.
param_grid = {
    'max_depth': [4, 6, 11],
    'n_estimators': [300, 500],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'max_leaf_nodes': st.randint(6, 10),
    'bootstrap': [True, False],
}

# 10 random candidates, each scored by 10-fold cross-validated accuracy.
grid = RandomizedSearchCV(rfc,
                          param_grid,
                          cv=10,
                          scoring='accuracy',
                          verbose=1,
                          n_iter=10,
                          random_state=42)

grid.fit(train, y_train)

# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print("Best estimator:", grid.best_estimator_)
print("Best cross-validated accuracy:", grid.best_score_)

# Predicted labels for the test set, from the best candidate refitted on all
# training data
y_pred5 = grid.best_estimator_.predict(test)


# The most important features of the algorithm
# NOTE(review): the importances are read from `clf`; switch to
# `grid.best_estimator_` if the random forest was the intended model.
features = (
    pd.DataFrame({'feature': train.columns,
                  'importance': clf.feature_importances_})
    .sort_values(by=['importance'], ascending=True)
    .set_index('feature')
)

features.plot(kind='barh', figsize=(10, 7))

In [185]:

# We load the Y of the validation set
Y_test = pd.read_csv("/kaggle/input/test-file/tested.csv")
Y_test = Y_test.Survived


def report_classifier(model_name, y_true, y_pred):
    """Show the confusion matrix and print accuracy, precision and recall.

    Parameters
    ----------
    model_name : str
        Name shown in the printed header.
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Labels predicted by the model, in the same order as ``y_true``.

    The reference labels are always passed first to the scikit-learn metrics.
    Swapping the two arguments transposes the confusion matrix and changes the
    weighted precision and recall values.
    """
    print('\n\nConfusion Matrix for "{}":'.format(model_name))
    matrix = confusion_matrix(y_true, y_pred)
    # fmt="d" prints the counts as plain integers instead of e.g. 2.6e+02.
    sns.heatmap(matrix, annot=True, fmt="d")
    plt.show()
    print("accuracy:", accuracy_score(y_true, y_pred))
    print("precision:", precision_score(y_true, y_pred, average='weighted'))
    print("recall", metrics.recall_score(y_true, y_pred, average='weighted'))


# One report per model, all evaluated against the same reference labels.
for model_name, model_predictions in [("Decision Tree", y_pred1),
                                      ("SVM", y_pred2),
                                      ("LDA", y_pred3),
                                      ("Random Forest", y_pred5)]:
    report_classifier(model_name, Y_test, model_predictions)

In [186]:
# Stratified 10-fold splitter, passed as `cv` to the learning-curve plot below
kfold = StratifiedKFold(n_splits=10)

#Learning curves are a good way to see the effect of overfitting, and the effect of training size on accuracy.

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against the training-set size.

    A new figure is created. For each of the two score sets returned by
    ``learning_curve`` the mean over the folds is drawn as a line, with a
    shaded band of one standard deviation around it.

    Parameters
    ----------
    estimator, X, y, cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.
    title : str
        Title of the figure.
    ylim : tuple, optional
        ``(ymin, ymax)`` limits of the score axis; left automatic when None.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each curve, averaged over the folds.
    curves = [
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _label in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _std, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Learning curve of the best random forest found by the search, using the
# stratified 10-fold splitter. (Title spelling corrected: "learning".)
g = plot_learning_curve(grid.best_estimator_, "RF learning curves", train, y_train, cv=kfold)

In [187]:
# Submission frame: passenger ids from the untouched test file, paired with the
# predictions of the best random forest.
output = test_f[['PassengerId']].assign(Survived=y_pred5)

output.to_csv('titanic_kaggle.csv', index=False)

print("Your submission was successfully saved!")