## Pruebo modelos

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMClassifier
import warnings

warnings.filterwarnings("ignore")

In [2]:
datos_train = pd.read_csv('data_train_clean_final.csv')
datos_test = pd.read_csv('data_test_clean_final.csv')

In [3]:
labels = datos_train['survived']
train = datos_train.drop(columns='survived')

### Random Forest Classifier

In [4]:
rf = RandomForestClassifier(random_state=22)

In [5]:
rf.fit(train, labels)

In [6]:
from sklearn.model_selection import cross_val_score

# Realiza la validación cruzada con 5 particiones
scores_rf = cross_val_score(rf, train, labels, cv=10)

precision_rf = scores_rf.mean()

print("Precisión media:", precision_rf)

Precisión media: 0.8265934065934065


### Gradient Boosting Classifier

In [7]:
gbc = GradientBoostingClassifier(random_state=22)

In [8]:
gbc.fit(train, labels)

In [9]:
from sklearn.model_selection import cross_val_score

# Realiza la validación cruzada con 5 particiones
scores_gbc = cross_val_score(gbc, train, labels, cv=10)

precision_gbc = scores_gbc.mean()

# Muestra los puntajes de validación cruzada
print("Precisión media:", precision_gbc)

Precisión media: 0.8399175824175824


### Light Gradient Boosting Machine Classifier

In [10]:
lgbm = LGBMClassifier(random_state=22)

In [11]:
lgbm.fit(train, labels)

[LightGBM] [Info] Number of positive: 500, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 356
[LightGBM] [Info] Number of data points in the train set: 1049, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476644 -> initscore=-0.093490
[LightGBM] [Info] Start training from score -0.093490


  File "C:\Users\Guillermo\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [12]:
from sklearn.model_selection import cross_val_score

# Realiza la validación cruzada con 5 particiones
scores_lgbm = cross_val_score(lgbm, train, labels, cv=10)

precision_lgbm = scores_lgbm.mean()

# Muestra los puntajes de validación cruzada
print("Precisión media:", precision_lgbm)

[LightGBM] [Info] Number of positive: 450, number of negative: 494
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 944, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476695 -> initscore=-0.093288
[LightGBM] [Info] Start training from score -0.093288
[LightGBM] [Info] Number of positive: 450, number of negative: 494
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 944, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476695 -> initscore=-0.093288
[LightGBM] [Info

### Ada Boost Classifier

In [13]:
ada = AdaBoostClassifier(random_state=22)

In [14]:
ada.fit(train, labels)

In [15]:
from sklearn.model_selection import cross_val_score

# Realiza la validación cruzada con 5 particiones
scores_ada = cross_val_score(ada, train, labels, cv=10)

precision_ada = scores_ada.mean()

# Muestra los puntajes de validación cruzada
print("Precisión media:", precision_ada)

Precisión media: 0.8265842490842491


### Logistic Regression

In [16]:
lg = LogisticRegression(random_state=22)

In [17]:
lg.fit(train, labels)

In [18]:
from sklearn.model_selection import cross_val_score

# Realiza la validación cruzada con 5 particiones
scores_lg = cross_val_score(lg, train, labels, cv=10)

precision_lg = scores_ada.mean()

# Muestra los puntajes de validación cruzada
print("Precisión media:", precision_lg)

Precisión media: 0.8265842490842491


### Ridge Classifier

In [19]:
rc = RidgeClassifier(random_state=22)

In [20]:
rc.fit(train, labels)

In [21]:
from sklearn.model_selection import cross_val_score

# Realiza la validación cruzada con 5 particiones
scores_rc = cross_val_score(rc, train, labels, cv=10)

precision_rc = scores_ada.mean()

# Muestra los puntajes de validación cruzada
print("Precisión media:", precision_rc)

Precisión media: 0.8265842490842491


### Resumen

In [22]:
precision_modelos = {
    'RF': precision_rf,
    'GBC': precision_gbc,
    'LGBM': precision_lgbm,
    'ADA': precision_ada,
    'LG': precision_lg,
    'RC': precision_rc} 

In [23]:
precision_modelos_df = pd.DataFrame.from_dict(precision_modelos, orient='index', columns=['accuracy'])
precision_modelos_df.sort_values(by='accuracy', ascending=False, inplace=True)

In [24]:
precision_modelos_df

Unnamed: 0,accuracy
LGBM,0.84565
GBC,0.839918
RF,0.826593
ADA,0.826584
LG,0.826584
RC,0.826584


#### En este caso donde pruebo los modelos con sus configuraciones por default, Light Gradient Boosting Machine y Gradient Boosting Classifier parecen ser los modelos mas adecuados para mi dataset.

#### Al igual que utilizando la librería Pycaret, la precision de los modelos mejora utilizando técnicas de balanceo de datos, en este caso SMOTE.