____
__Universidad Tecnológica Nacional, Buenos Aires__<br/>
__Ingeniería Industrial__<br/>
__Cátedra de Ciencia de Datos - Curso I5521 - Turno Jueves noche__<br/>
__Elaborado por: Lucas Mareque__<br/>
__Editado por: Nicolas Aguirre__<br/>
____

# Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve , accuracy_score, auc, confusion_matrix
from sklearn.exceptions import DataConversionWarning
import warnings
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Dataset 

Link: https://www.kaggle.com/c/titanic


|Variable |	Definition |	Key |
| --- | --- | --- |
|survival|	Survival|	0 = No, 1 = Yes|
|pclass|	Ticket class|	1 = 1st, 2 = 2nd, 3 = 3rd|
|sex|	Sex	| |
|Age|	Age in years| |	
|sibsp|	# of siblings / spouses aboard the Titanic| |	
|parch|	# of parents / children aboard the Titanic| |	
|ticket|	Ticket number | |	
|fare|	Passenger fare |	|
|cabin|	Cabin number	| |
|embarked|	Port of Embarkation	|C = Cherbourg, Q = Queenstown, S = Southampton|

**Objetivo: Predecir si un pasajero sobrevive al accidente**

In [None]:
# Load the Titanic training set from the course folder.
root_path = '/path/to/clase_07/'
csv_path = root_path + "titanic_train.csv"
titanic_df = pd.read_csv(csv_path)
# Preview the first rows of the data.
titanic_df.head(n=5)

In [None]:
def _missing_summary(df):
    """Return per-column null counts and null fractions, most-missing first."""
    null_counts = df.isnull().sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / df.isnull().count()).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


# Drop the columns that are not used in this exercise.
titanic_df = titanic_df.drop(columns=['PassengerId', "Name", "Ticket", "Cabin"])

# Missing values before any imputation.
missing_data = _missing_summary(titanic_df)
print(missing_data.head(6))

# Show the rows whose Embarked value is not one of the three known ports.
# A bare expression in the middle of a cell is discarded, so print it.
list_a = ['S', 'Q', 'C']
print(titanic_df.loc[~titanic_df['Embarked'].isin(list_a), :])

# Fill the empty Embarked values with "S".
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated in pandas and does
# not modify the DataFrame when Copy-on-Write is enabled.
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

# Re-check the missing values after filling Embarked.
missing_data = _missing_summary(titanic_df)
missing_data.head(6)

# EDA

**Tarea:**

- Ver matriz de correlación. Determinar si hay variables que se puedan sacar
- Distribución de pasajeros según: clase y supervivencia, lugar de embarque y supervivencia, y precio y supervivencia

# Preprocessing

In [None]:
# Columns used as model inputs and the column to predict.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = ['Survived']

# Build the feature matrix X and the target frame Y.
X = titanic_df[features]
Y = titanic_df[target]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Number of null values per column, largest first.
null_counts = titanic_df.isna().sum()
null_counts.sort_values(ascending=False)

In [None]:
# Replace the null Age values with the mean Age. The mean is learned from the
# training split only and then reused for the test split.
imputer = SimpleImputer(strategy="mean")
X_train["Age"] = imputer.fit_transform(X_train[["Age"]])
X_test["Age"] = imputer.transform(X_test[["Age"]])

print("Media calculada:", imputer.statistics_[0])

In [None]:
# Standardise the numeric features. The scaler is fitted on the training split
# only and the same fitted scaler is applied to the test split.
cols = ["Age", "Fare", "SibSp", "Parch"]
scaler = StandardScaler()
scaler.fit(X_train[cols])
X_train[cols] = scaler.transform(X_train[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [None]:
X_train_encoded

In [None]:
encoder.get_feature_names_out(cols)

In [None]:
# Modificamos las variables categoricas
cols = ["Embarked", "Sex", "Pclass"]
encoder = OneHotEncoder(handle_unknown="error",sparse_output=False)

# Ajustar y transformar X_train
X_train_encoded = encoder.fit_transform(X_train[cols])
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(cols), index=X_train.index)

# Transformar X_test
X_test_encoded = encoder.transform(X_test[cols])
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(cols), index=X_test.index)

# Dropear las columnas originales
X_train = X_train.drop(columns=cols)
X_test = X_test.drop(columns=cols)

# Concatenar las columnas codificadas
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)


In [None]:
X_train

# Train!

Vamos a entrenar un Random Forest, que consiste en un conjunto de árboles de decisión. Cada árbol realiza una predicción de manera independiente, y el resultado final se determina a partir del voto mayoritario entre todos los árboles:

![](https://www.researchgate.net/publication/354354484/figure/fig4/AS:1080214163595269@1634554534720/Illustration-of-random-forest-trees.jpg)

Contamos con diferentes hiperparámetros; los analizados en este ejercicio son los siguientes:

- **n_estimators**: cantidad de árboles de decisión que tendrá el random forest.
- **max_depth**: número de niveles desde la raíz hasta la hoja más profunda.
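- **criterion**: función que mide la impureza de un nodo para elegir cada división. Mientras más cercano a 0 sea su valor, más puro será el nodo (todas las muestras son de la misma clase). Las opciones analizadas son: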
    1. **Indice de Gini**: $Gini = 1 - \sum_{i=1}^{K} p_i^2$.  
   
    2. **Entropía**: $Entropy = - \sum_{i=1}^{K} p_i \log_2(p_i)$ 
    
    
- **min_samples_leaf**: número mínimo de muestras que debe tener una hoja


## Decision tree

In [None]:
# Hyperparameter values explored for the decision tree.
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [5, 10, 15],
    "min_samples_leaf": [1, 4, 6],
}

# Grid search with 5-fold cross-validation over the grid above.
tree = DecisionTreeClassifier(random_state=42)
CV_tree = GridSearchCV(tree, param_grid, cv=5)
CV_tree.fit(X_train, Y_train)

print("Mejores hiperparámetros:", CV_tree.best_params_)
print("Mejor score en validación cruzada:", CV_tree.best_score_)


In [None]:
# Score the best tree found by the grid search on the held-out test split.
best_tree_model = CV_tree.best_estimator_
Y_pred = best_tree_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print("\nAccuracy en test:", test_accuracy)

In [None]:
# Import only the plotting function. `from sklearn import tree` would rebind
# the global name `tree`, which already holds the DecisionTreeClassifier
# created for the grid search.
from sklearn.tree import plot_tree

best_tree = CV_tree.best_estimator_

# Draw the best tree found by the grid search on a large canvas.
plt.figure(figsize=(40, 20))
plot_tree(
    best_tree,
    # Pass a plain list: NOTE(review) scikit-learn 1.2 appears to validate
    # this argument as a list and reject a pandas Index -- confirm.
    feature_names=list(X_train.columns),
    class_names=[str(c) for c in best_tree.classes_],
    filled=True,   # colour the nodes by class
    rounded=True,
    fontsize=12,
)

plt.show()

## Random Forest

In [None]:
# Grid search (5-fold cross-validation) over the random-forest hyperparameters.

rfc = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 4, 6],
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
# Y_train is a one-column DataFrame. Flatten it to 1-D so the forest receives
# the shape it expects, instead of relying on the DataConversionWarning being
# silenced globally.
CV_rfc.fit(X_train, np.ravel(Y_train))
CV_rfc.best_params_

In [None]:
# Train the model with the hyperparameters found by the grid search.
# They are read from CV_rfc.best_params_ rather than hard-coded, so this cell
# always matches the search result.
rfc1 = RandomForestClassifier(random_state=42, **CV_rfc.best_params_)
# Flatten the one-column target to 1-D, the shape the forest expects.
rfc1.fit(X_train, np.ravel(Y_train))
pred = rfc1.predict(X_test)

# Results

In [None]:
# `pred` holds predictions for X_test, so this is test accuracy, not a
# cross-validation score.
print("Accuracy for Random Forest on test data: ", accuracy_score(Y_test, pred))

In [None]:
# Compute the ROC curve and its AUC.
# roc_curve needs a continuous score per sample. Hard 0/1 predictions give a
# single operating point and a misleading AUC, so use the predicted
# probability of the positive class (column 1 of predict_proba).
pred_scores = rfc1.predict_proba(X_test)[:, 1]
fpr1, tpr1, thresholds = roc_curve(
    np.ravel(Y_test).astype('int'),
    pred_scores,
    drop_intermediate=False,
)
auc_value = auc(fpr1, tpr1)
print("El AUC es = " + str(auc_value))

In [None]:
# Model ROC curve (blue) against the dashed chance diagonal (red).
plt.plot(fpr1, tpr1, label='ROC curve', color='b', linewidth=2, alpha=0.8)
plt.plot([0, 1], [0, 1], color='r', linestyle='--', linewidth=1, alpha=.8)

# Title, axis labels and legend.
plt.title('ROC Curve', fontsize=24)
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.legend(loc="lower right")
plt.grid(False)
plt.show()

In [None]:
# Compute the confusion matrix on the test split.
# In scikit-learn's confusion_matrix, rows are the true classes and columns
# are the predicted classes.
cm = confusion_matrix(Y_test, pred)
df_cm = pd.DataFrame(
    cm,
    index=['No sobrevivió', 'Sobrevivió'],
    columns=['No sobrevivió', "Sobreviviente"],
)

plt.figure(figsize=(12, 8))
ax = sns.heatmap(df_cm, annot=True, fmt='g')
# Label the axes so the reader can tell true from predicted.
ax.set_xlabel('Predicción', fontsize=18)
ax.set_ylabel('Real', fontsize=18)
plt.title('Confusion matrix', fontsize=24)
plt.show()