* Archivo: 006_random_forest_basico_1.ipynb
* Resumen: el script realiza un ejercicio de clasificación mediante el algoritmo de random forest. Se busca estudiar dicho método de clasificación.
* Fecha creación: 20201121
* Fecha última actualización: 20201121
* Autor: Gonzalo Plaza

In [46]:
# importación de librerías 
import numpy as np
import pandas as pd

from sklearn import impute
from sklearn import model_selection

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# load the raw data: labelled training set and the set to predict on
df = pd.read_csv('../data/raw/train.csv')
df_predict = pd.read_csv('../data/raw/test.csv')

# tag every row with its source so both sets can be separated again
# after being preprocessed together
df['tipo'] = 'df'
df_predict['tipo'] = 'df_predict'
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index=True gives the stacked frame unique row labels instead of
# repeating the labels of both inputs.
dfw = pd.concat([df, df_predict], ignore_index=True)

# parameters: name of the target column and the columns fed to the model
p_target = "Survived"
p_features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

# Sex column: encode the categories as integers (female -> 0, male -> 1).
# A single column assignment replaces the chained indexing
# `dfw['Sex'][mask] = value`, which triggers SettingWithCopyWarning and does
# not modify dfw at all when pandas Copy-on-Write is active.
# NOTE: any value other than 'female' / 'male' becomes NaN.
dfw['Sex'] = dfw['Sex'].map({'female': 0, 'male': 1})

# Age and Fare columns: fill missing values with the column mean.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
imputador_mean = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
# fit and transform once; the result columns follow the order ['Fare', 'Age']
fare_age_imputados = imputador_mean.fit_transform(dfw[['Fare', 'Age']])
dfw['Fare'] = fare_age_imputados[:, 0]
dfw['Age'] = fare_age_imputados[:, 1]

# Embarked column: encode the port codes as integers (S -> 1, C -> 2, Q -> 3).
# A single column assignment replaces the chained indexing
# `dfw['Embarked'][mask] = value`, which triggers SettingWithCopyWarning and
# does not modify dfw at all when pandas Copy-on-Write is active.
# Missing or unrecognised codes become NaN and are imputed right below.
dfw['Embarked'] = dfw['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})

# fill missing values with the most frequent value of each column;
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
imputador_freq = impute.SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# the imputer works column by column; index 1 of the result is 'Embarked'
dfw['Embarked'] = imputador_freq.fit_transform(dfw[['Sex', 'Embarked']])[:, 1]

# build the training / hold-out split from the rows tagged "df"
mask_train = dfw['tipo'] == "df"
X = dfw.loc[mask_train, p_features]
y = dfw.loc[mask_train, p_target]

# 70 % of the rows for fitting, 30 % held out; fixed seed for repeatability
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


In [47]:
# fit a random forest on the training split (fit returns the estimator itself)
modelo_1 = RandomForestClassifier(random_state=1).fit(X_train, y_train)

# predictions for the hold-out split:
# y_pred_1_b holds the per-class probabilities, y_pred_1 the labels as integers
y_pred_1_b = modelo_1.predict_proba(X_test)
y_pred_1 = modelo_1.predict(X_test).astype(int)


In [49]:
# hold-out metrics for modelo_1

# probability column of the last class, used as the ranking score for the AUC
prob_ultima_clase = y_pred_1_b[:, -1]

# area under the ROC curve
pred_1_metric_c = metrics.roc_auc_score(y_test, prob_ultima_clase)
# cross-entropy (log loss) over the full probability matrix
pred_1_metric_b = metrics.log_loss(y_test, y_pred_1_b)
# accuracy of the predicted labels
pred_1_metric = metrics.accuracy_score(y_test, y_pred_1)

In [50]:
# survival table: predicted probability (last class) in bins of width 0.1,
# cross-tabulated against the actual outcome.
# The edges now start at 0.0 and include the lowest edge; with edges starting
# at 0.1 every prediction with probability <= 0.1 fell outside all bins and
# was silently dropped from the table.
limites_bins = np.arange(0, 11) / 10
pred_1_metric_d = pd.crosstab(
    pd.cut(y_pred_1_b[:, -1], limites_bins, include_lowest=True),
    y_test,
)

In [52]:
# build the production model and write the output file

# rows tagged "df_predict" are the ones to generate predictions for
X_predict = dfw.loc[dfw['tipo'] == "df_predict", p_features]

# refit on all rows of X / y (not only the training split)
modelo_final = RandomForestClassifier(random_state=1)
modelo_final = modelo_final.fit(X, y)
y_pred_final = modelo_final.predict(X_predict)

df_predict['Survived'] = y_pred_final
# astype returns a new DataFrame, so the integer cast no longer assigns into
# a slice of df_predict (which raised SettingWithCopyWarning)
df_final = df_predict[['PassengerId', 'Survived']].astype({'Survived': int})
df_final.to_csv('../output/006_random_forest_basico_1.csv', index=False)