# ***Importing libraries and loading datasets***

In [None]:
from gc                       import collect                                    # garbage collector to free memory
from warnings                 import filterwarnings                             # handling warnings
import numpy                  as np                                             # Linear Algebra
import pandas                 as pd                                             # data processing, CSV file I/O
import os                                                                       # use OS dependent features
import random                                                                   # generate pseudo-random numbers
import matplotlib.pyplot      as plt                                            # data visualization
import seaborn                as sns                                            # statistical data visualization
from sklearn.preprocessing    import LabelEncoder                               # encode categorical labels into numeric labels
from sklearn.preprocessing    import OneHotEncoder, StandardScaler              # categorical variables in numerical format and standardize the characteristic variables
from sklearn.model_selection  import train_test_split                           # split data into training and test sets
from sklearn.model_selection  import GridSearchCV                               # hyperparameter optimization
from sklearn.metrics          import accuracy_score                             # evaluate the accuracy of the classifier
from sklearn.tree             import DecisionTreeClassifier, plot_tree          # DecisionTree and plot tree for decision trees
from sklearn.ensemble         import RandomForestClassifier                     # random forest classifier
from google.colab             import drive                                      # mount drive
from sklearn.model_selection  import RepeatedStratifiedKFold                    # get cv in GridSearchCV


In [None]:
# Mount Google Drive at /content/drive so the dataset paths used by the
# loading cells below resolve. Colab-only: `google.colab` is not available
# outside a Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
stat = ''
path = '/content/drive/MyDrive/Colab Notebooks/datos/train.csv'

try:
    # Load the training dataset from the mounted Drive folder
    df_train = pd.read_csv(path, sep=',')
    stat = 'Loaded training dataset'

except FileNotFoundError:
    # Nothing exists at `path`
    stat = 'Error: Training file not found. Check path, please'

except Exception as e:
    # Any other failure while reading the file. The exception is formatted
    # into the message: concatenating `str + Exception` raises TypeError,
    # which would replace the real error with an unrelated one.
    stat = f'Sorry, there was an error loading the training dataset: {e}'

# Status message
print(stat)

Loaded dataset


In [None]:
stat = ''
path = '/content/drive/MyDrive/Colab Notebooks/datos/test.csv'

try:
    # Load the test dataset from the mounted Drive folder
    df_test = pd.read_csv(path, sep=',')
    stat = 'Loaded test dataset'

except FileNotFoundError:
    # Nothing exists at `path`
    stat = 'Error: test file not found. Check path, please'

except Exception as e:
    # Any other failure while reading the file. The exception is formatted
    # into the message: concatenating `str + Exception` raises TypeError,
    # which would replace the real error with an unrelated one.
    stat = f'Sorry, there was an error loading the test dataset: {e}'

# Status message
print(stat)

Loaded dataset


In [None]:
def set_seed(seed_value):
    """Seed the pseudo-random number sources used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with the
    same value, then stores that value (as text) in the ``PYTHONHASHSEED``
    environment variable.

    Parameters
    ----------
    seed_value : int
        Value passed to every seeding call.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed_value)
    # NOTE(review): PYTHONHASHSEED is presumably only honoured by interpreters
    # started after this point, not by the running kernel. Confirm.
    os.environ.update(PYTHONHASHSEED=str(seed_value))


# Single seed shared by the whole notebook
SEED = 42
set_seed(SEED)

# ***Explore data***

In [None]:
# Display the training frame (the notebook renders a truncated preview)
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Summary statistics (count, mean, std, quartiles) of the training frame
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Column names of the training frame as a plain Python list
print(f"Columns: \n{df_train.columns.tolist()} ")

Columns: 
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'] 


# ***Basic data check***

## ***Missing values***

In [None]:
# One boolean per column: True when the column contains at least one NaN
missing_values = df_train.isna().any()
print(f'Columns which have missing values: \n{missing_values[missing_values].index.tolist()}')

Columns which have missing values: 
['Age', 'Cabin', 'Embarked']


In [None]:
# Share of NaN entries, in percent of all rows, for each column reported above
n_rows = len(df_train)
for column_name in ('Age', 'Cabin', 'Embarked'):
    nan_share = 100. * (df_train[column_name].isna().sum() / n_rows)
    print("Percentage of missing values in `{0}` column: {1:.2f}".format(column_name, nan_share))

Percentage of missing values in `Age` column: 19.87
Percentage of missing values in `Cabin` column: 77.10
Percentage of missing values in `Embarked` column: 0.22


## ***Check for duplicates***

In [None]:
# Count rows that exactly repeat an earlier row
duplicates = df_train.duplicated().sum()
print(f'Duplicates in train data: {duplicates}')

Duplicates in train data: 0


## ***Categorical variables***

In [None]:
# Number of distinct values per column, lowest cardinality first
categorical = df_train.nunique().sort_values()
print(f'Categorical variables in train data: \n{categorical}')

Categorical variables in train data: 
Survived         2
Sex              2
Pclass           3
Embarked         3
SibSp            7
Parch            7
Age             88
Cabin          147
Fare           248
Ticket         681
PassengerId    891
Name           891
dtype: int64


# ***Data cleaning***

In [None]:
def clean_data(data):
    """Drop the columns that are not used as model features.

    The frame is modified in place and also returned, so both
    ``clean_data(df)`` and ``df = clean_data(df)`` work.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to prune.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without the unused columns.
    """
    unused_columns = [
        'Cabin',                               # too many missing values
        'Name', 'Ticket', 'Fare', 'Embarked',  # probably will not provide useful information
    ]

    # errors='ignore' keeps the call idempotent: re-running the cell on a frame
    # that was already cleaned is a no-op instead of raising KeyError.
    data.drop(columns=unused_columns, inplace=True, errors='ignore')

    return data

# Apply the same column pruning to the training and the test frame
df_train = clean_data(df_train)
df_test = clean_data(df_test)

In [None]:
# Last rows of the training frame, to check which columns remain
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
886,887,0,2,male,27.0,0,0
887,888,1,1,female,19.0,0,0
888,889,0,3,female,,1,2
889,890,1,1,male,26.0,0,0
890,891,0,3,male,32.0,0,0


# ***Feature engineering***

Although I have eliminated most of the columns for simplicity, in the future I am planning to recover those columns. They may contain some useful information.
For now encoding the Sex column and filling Age column is enough to run a model.

In [None]:
# Encode Sex as 0/1. The result is assigned back to the column instead of
# calling replace(..., inplace=True) on `df['Sex']`: that chained in-place call
# works on an intermediate Series, triggers a FutureWarning on recent pandas
# and leaves the frame unchanged once Copy-on-Write is enabled.
sex_codes = {'male': 0, 'female': 1}
df_train['Sex'] = df_train['Sex'].replace(sex_codes).astype(int)
df_test['Sex'] = df_test['Sex'].replace(sex_codes).astype(int)

# Merge the two frames to get a typical Age and fill the column with it.
# The value is the median (not the mean) of Age over train and test together.
# NOTE(review): this uses test-set information to impute training rows;
# consider computing it from df_train only.
all_data = pd.concat([df_train, df_test])
average = all_data.Age.median()
print("Average Age: {0}".format(average))
df_train['Age'] = df_train['Age'].fillna(average)
df_test['Age'] = df_test['Age'].fillna(average)

Average Age: 28.0


In [None]:
# Re-inspect the last rows after encoding Sex and filling Age
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
886,887,0,2,0,27.0,0,0
887,888,1,1,1,19.0,0,0
888,889,0,3,1,28.0,1,2
889,890,1,1,0,26.0,0,0
890,891,0,3,0,32.0,0,0


# ***Modelling***

Try different models with different parameters to understand which models give better results.

In [None]:
# Set X and y
X = df_train.drop(['Survived', 'PassengerId'], axis=1)
y = df_train['Survived']
test_X = df_test.drop(['PassengerId'], axis=1)

In [None]:
# To store models created
best_models = {}

# Split data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

def print_best_parameters(hyperparameters, best_parameters):
    """Print the selected value of every tuned hyperparameter on one line.

    Parameters
    ----------
    hyperparameters : dict
        Search grid; only its keys are used, in iteration order.
    best_parameters : dict
        Mapping that holds the chosen value for each key of ``hyperparameters``.

    Nothing is printed when ``hyperparameters`` is empty.
    """
    fragments = [
        str(name) + ": " + str(best_parameters[name]) + ", "
        for name in hyperparameters
    ]
    if not hyperparameters:
        return
    # Slice off the separator left behind by the last fragment
    print(("Best parameters: " + "".join(fragments))[:-2])

def get_best_model(estimator, hyperparameters, fit_params=None):
    """Grid-search ``hyperparameters`` for ``estimator`` on the training split.

    Runs ``GridSearchCV`` with accuracy scoring and a repeated stratified
    10-fold cross-validation (3 repeats) on the module-level ``train_X`` /
    ``train_y``, then prints the selected value of every tuned parameter.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Unfitted model to tune.
    hyperparameters : dict
        Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    fit_params : dict, optional
        Extra keyword arguments forwarded to ``fit``. ``None`` (the default)
        means no extra arguments.

    Returns
    -------
    GridSearchCV
        The fitted search object; the tuned model itself is available as
        its ``best_estimator_`` attribute.
    """
    # A None default replaces the mutable `{}` default, which would be one
    # dict object shared by every call.
    extra_fit_args = fit_params or {}

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=estimator, param_grid=hyperparameters, n_jobs=-1, cv=cv, scoring="accuracy")
    grid_search.fit(train_X, train_y, **extra_fit_args)

    print_best_parameters(hyperparameters, grid_search.best_estimator_.get_params())
    return grid_search

def evaluate_model(model, name):
    """Print the validation accuracy of ``model`` and register it.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``.
    name : str
        Key under which the model is stored in the module-level
        ``best_models`` dict.
    """
    # Score on the validation split (val_X / val_y). The models in this
    # notebook are fitted on train_X / train_y, so accuracy measured on those
    # same rows would overstate how well they generalize.
    print("Accuracy score:", accuracy_score(val_y, model.predict(val_X)))
    best_models[name] = model

In [None]:
# Names of the columns used as model inputs
print(f"Features: \n{X.columns.tolist()} ")

Features: 
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'] 


## ***Decision Tree Classifier***

Tune decision tree classifier model by changing some of its parameters.

* ***criterion: {“gini”, “entropy”}, default=”gini”***  
The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

* ***splitter: {“best”, “random”}, default=”best”***  
The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

* ***max_depth: int, default=None***  
The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

* ***min_samples_split: int or float, default=2***
  * The minimum number of samples required to split an internal node:
    * If int, then consider min_samples_split as the minimum number.
    * If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
* ***min_samples_leaf: int or float, default=1***  
The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
  * If int, then consider min_samples_leaf as the minimum number.
  * If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.

In [None]:
# https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680
# https://www.kaggle.com/gauravduttakiit/hyperparameter-tuning-in-decision-trees
hyperparameters = {
    'criterion'         : ['gini', 'entropy'],
    'splitter'          : ['best', 'random'],
    'max_depth'         : [None, 1, 2, 3, 4, 5],
    'min_samples_split' : list(range(2,5)),
    'min_samples_leaf'  : list(range(1,5))
}
estimator = DecisionTreeClassifier(random_state=1)
best_model_decision_tree = get_best_model(estimator, hyperparameters)

Best parameters: criterion: gini, splitter: best, max_depth: 4, min_samples_split: 2, min_samples_leaf: 3


In [None]:
# Print the accuracy of the tuned decision tree and store it in best_models
# under the key 'decision_tree'
evaluate_model(best_model_decision_tree.best_estimator_, 'decision_tree')

Accuracy score: 0.8502994011976048


## ***Random Forest Classifier***

* ***n_estimators: int, default=100***  
The number of trees in the forest.  
* ***max_features: {“sqrt”, “log2”, None}, int or float, default=”sqrt”***  
The number of features to consider when looking for the best split:
  * If int, then consider max_features features at each split.
  * If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
  * If “auto”, then max_features=sqrt(n_features). This alias of “sqrt” was deprecated in scikit-learn 1.1 and removed in 1.3, where passing it makes the fit fail with an invalid-parameter error; use “sqrt” instead.
  * If “sqrt”, then max_features=sqrt(n_features).
  * If “log2”, then max_features=log2(n_features).
  * If None, then max_features=n_features.

  Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.

* ***criterion: {“gini”, “entropy”}, default=”gini”***  
The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

* ***max_depth: int, default=None***  
The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

* ***min_samples_split: int or float, default=2***  
  * The minimum number of samples required to split an internal node:
    * If int, then consider min_samples_split as the minimum number.
    * If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
* ***min_samples_leaf: int or float, default=1***  
The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
  * If int, then consider min_samples_leaf as the minimum number.
  * If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.

In [None]:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# https://www.analyticsvidhya.com/blog/2020/03/beginners-guide-random-forest-hyperparameter-tuning/
hyperparameters = {
    'n_estimators'      : list(range(10, 50, 10)),
    'max_features'      : ['auto', 'sqrt', 'log2'],
    'criterion'         : ['gini', 'entropy'],
    'max_depth'         : [None, 1, 2, 3, 4, 5],
    'min_samples_split' : list(range(2,5)),
    'min_samples_leaf'  : list(range(1,5))
}
estimator = RandomForestClassifier(random_state=1)
best_model_random_forest = get_best_model(estimator, hyperparameters)

17280 fits failed out of a total of 51840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
17280 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterErr

Best parameters: n_estimators: 20, max_features: sqrt, criterion: gini, max_depth: 4, min_samples_split: 2, min_samples_leaf: 3


In [None]:
# Print the accuracy of the tuned random forest and store it in best_models
# under the key 'random_forest'
evaluate_model(best_model_random_forest.best_estimator_, 'random_forest')

Accuracy score: 0.8502994011976048


# ***WORK IN PROGRESS***

## ***Submission***

In [None]:
# Get predictions for each model and create submission files
for model in best_models:
    predictions = best_models[model].predict(test_X)
    output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
    filepath = '/content/drive/MyDrive/Colab Notebooks/datos/' +'submission_' + model + '.csv'
    output.to_csv(filepath, index=False)