In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

Creo una función para entrenar distintos modelos base y guardar en un diccionario la exactitud (accuracy) obtenida por cada uno, para luego comparar los resultados.

In [2]:
def modelo(X_train, X_test, y_train, y_test, m, resultados=None):
    """Train one baseline classifier and record its accuracy on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to the estimator's ``fit``.
    X_test, y_test
        Features and labels used to compute the accuracy score.
    m : str
        Selector of the baseline to build: ``'rl'`` (LogisticRegression),
        ``'tree'`` (DecisionTreeClassifier), ``'rf'`` (RandomForestClassifier)
        or ``'svm'`` (SVC with a linear kernel).
    resultados : dict, optional
        Mapping that receives the accuracy. When omitted, the module-level
        ``modelos`` dict is used, which is the original behaviour.

    Returns
    -------
    The fitted estimator, or ``None`` (after printing ``"Error."``) when
    ``m`` is not one of the selectors above.

    Notes
    -----
    Scores are stored under the keys ``'rl'``, ``'tree'``, ``'RF'`` and
    ``'svc'``. The last two differ from their selector names; they are kept
    as-is so existing readers of the results dict keep working.
    """
    # selector -> (key used in the results dict, lazy estimator factory).
    # Factories are lambdas so only the requested estimator is instantiated.
    fabricas = {
        'rl': ('rl', lambda: LogisticRegression(max_iter=1000, random_state=42)),
        'tree': ('tree', lambda: DecisionTreeClassifier(max_depth=5, random_state=42)),
        'rf': ('RF', lambda: RandomForestClassifier(random_state=42)),
        'svm': ('svc', lambda: SVC(kernel='linear', random_state=42)),
    }

    # Non-string selectors are treated like unknown ones instead of raising.
    entrada = fabricas.get(m) if isinstance(m, str) else None
    if entrada is None:
        print("Error.")
        return None

    clave, construir = entrada

    # Fall back to the notebook-level dict only when no target was supplied;
    # resolving it before fitting makes a missing `modelos` fail fast.
    if resultados is None:
        resultados = modelos

    model = construir()
    model.fit(X_train, y_train)
    resultados[clave] = accuracy_score(y_test, model.predict(X_test))
    return model

Cargo los datos y los separo en entrenamiento y test

In [3]:
# Read the training CSV first, then the Kaggle test CSV.
train, test = (pd.read_csv(ruta) for ruta in ('trainn.csv', 'testt.csv'))

# Training frame: add Flia (SibSp + Parch), cast Age to int and keep only
# the rows whose Age is strictly positive.
train = (
    train
    .assign(
        Flia=lambda df: df['SibSp'] + df['Parch'],
        Age=lambda df: df['Age'].astype(int),
    )
    .loc[lambda df: df['Age'] > 0]
)

# Test frame: only the Flia column is added here.
test = test.assign(Flia=test['SibSp'] + test['Parch'])

# Feature matrix and target vector taken from the filtered training frame.
X = train[['Pclass', 'Sex', 'Age', 'Flia']]
y = train['Survived'].to_numpy()

# Hold-out split with a fixed seed (default test proportion).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
# Same four feature columns as the training matrix, taken from the Kaggle test frame.
X_test_kaggle = test.loc[:, ['Pclass', 'Sex', 'Age', 'Flia']]

In [5]:
# Results dict filled by `modelo` (result key -> accuracy on the hold-out split).
modelos = dict()

# Selectors of the baselines to train, in order.
modelos_lista = ['rl', 'tree', 'rf', 'svm']

for i in modelos_lista:
    # `m` is rebound on every pass, so it ends up holding the last fitted estimator.
    m = modelo(X_train, X_test, y_train, y_test, m=i)

Visualizo los resultados de los modelos. Aparentemente el mejor resultado lo tiene el random forest, así que procedo a utilizar grid search para buscar mejores parámetros.

In [6]:
# Display the accuracy recorded for each baseline model.
modelos

{'rl': 0.7828054298642534,
 'tree': 0.7737556561085973,
 'RF': 0.8054298642533937,
 'svc': 0.7782805429864253}

In [6]:
# Display the accuracy recorded for each baseline model.
modelos

{'rl': 0.7937219730941704,
 'tree': 0.8026905829596412,
 'RF': 0.7847533632286996,
 'svc': 0.7847533632286996}

## Grid p/ tree

In [8]:
# Hyper-parameter grid explored for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Base estimator. The seed is fixed (same value as the baseline models) so the
# search gives the same result on every run; without it the tree's feature
# permutation / feature subsampling makes the selected parameters vary.
model = DecisionTreeClassifier(random_state=42)

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Report the best hyper-parameter combination found by the search.
print("Mejores parámetros: ", grid_search.best_params_)

Mejores parámetros:  {'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [9]:
# Take the estimator refit with the best parameters and score it on the hold-out split.
tree_grid = grid_search.best_estimator_
y_pred = tree_grid.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8071748878923767

In [10]:
# Predict on the Kaggle test features with the tuned tree.
predecir = tree_grid.predict(X_test_kaggle)

# Build the submission frame indexed by PassengerId and write it to disk.
arbol_df = pd.DataFrame(
    {'PassengerId': test.PassengerId, 'Survived': predecir}
).set_index('PassengerId')
arbol_df.to_csv('arbol.csv')

## Grid P/RF

Mejores hiperparametros: {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 150}

In [7]:
# Hyper-parameter grid explored for the random forest.
params = {
    'n_estimators': [100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 20]
}

# Base estimator. The seed is fixed (same value as the baseline models) so that
# bootstrap sampling and feature subsampling are repeatable and the search
# selects the same parameters on every run.
bosque = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(bosque, params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Report the best hyper-parameter combination found by the search.
print("Mejores hiperparametros:", grid_search.best_params_)

Mejores hiperparametros: {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 150}


In [8]:
# Take the estimator refit with the best parameters and score it on the hold-out split.
bosque_n = grid_search.best_estimator_
y_pred = bosque_n.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8054298642533937

Genero las predicciones con el modelo ajustado y las guardo en un archivo CSV

In [9]:
# Predict on the Kaggle test features with the tuned random forest.
predecir = bosque_n.predict(X_test_kaggle)

# Build the submission frame, indexed by PassengerId.
bosque_df = pd.DataFrame(
    {'PassengerId': test.PassengerId, 'Survived': predecir}
).set_index('PassengerId')

In [10]:
# Write the random-forest predictions to a CSV file.
bosque_df.to_csv('rdo_final.csv')