In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
train, test = pd.read_csv('trainn.csv'), pd.read_csv('testt.csv')

In [3]:
test.Age = test['Age'].fillna(pd.Series(np.random.choice(test['Age'].dropna(), size=len(test['Age'].isna()))))

In [4]:
train.drop(['Cabin', 'Embarked'], axis = 1, inplace = True)

In [5]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Fare',
       'Familia_completa'],
      dtype='object')

Test solamente se usa para obtener el resultado final

In [6]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'Fare', 'Cabin',
       'Embarked', 'Familia_completa'],
      dtype='object')

## Datasets a usar

In [7]:
X = train[['Pclass', 'Sex', 'Age', 'Familia_completa']]
y = train.Survived.to_numpy()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

## Regresion logistica

In [9]:
modelo = {}

In [10]:
rl = LogisticRegression(max_iter = 1000)
rl.fit(X_train, y_train)

In [11]:
y_pred = rl.predict(X_test)

In [12]:
modelo['Reg log'] = accuracy_score(y_test, y_pred)

In [13]:
X_t = test[['Pclass', 'Sex', 'Age', 'Familia_completa']]

In [14]:
pred = rl.predict(X_t)

In [15]:
rl_a = pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Survived': pred
})

In [16]:
def guardar(df, nombre):
    df.set_index('PassengerId', inplace=True)
    df.to_csv(str(nombre) +'.csv')

In [17]:
guardar(rl_a, 'rl_a')

## Arbol de desicion

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
rdos = []
for i in range(8):
    clf = DecisionTreeClassifier(max_depth = i + 1, random_state = 42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    rdos.append(accuracy_score(y_test, y_pred))

In [20]:
rdos

[0.7668161434977578,
 0.7668161434977578,
 0.7623318385650224,
 0.7488789237668162,
 0.7757847533632287,
 0.7533632286995515,
 0.7533632286995515,
 0.757847533632287]

In [21]:
clf = DecisionTreeClassifier(max_depth = 5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7757847533632287

In [22]:
modelo['ADD'] = accuracy_score(y_test, y_pred)

In [23]:
pred = rl.predict(X_t)
tree_df = pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Survived': pred
})

#guardar(tree, 'tree_a')

In [24]:
'''import matplotlib.pyplot as plt
from sklearn import tree

yy = pd.Series(y_train)
fig = plt.figure(figsize = ( 25, 20))
tree.plot_tree(clf, filled=True)
fig.savefig('tree_plot.png')'''

"import matplotlib.pyplot as plt\nfrom sklearn import tree\n\nyy = pd.Series(y_train)\nfig = plt.figure(figsize = ( 25, 20))\ntree.plot_tree(clf, filled=True)\nfig.savefig('tree_plot.png')"

In [25]:
modelo

{'Reg log': 0.7937219730941704, 'ADD': 0.7757847533632287}

## Random forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
bosque = RandomForestClassifier()
bosque.fit(X_train, y_train)
y_pred = bosque.predict(X_test)
modelo['bosque'] = accuracy_score(y_test, y_pred)

## SVM

In [28]:
from sklearn.svm import SVC

In [29]:
s_v = SVC(kernel='linear')
s_v.fit(X_train, y_train)
y_pred = s_v.predict(X_test)
modelo['s_v'] = accuracy_score(y_test, y_pred)

In [30]:
s_v = SVC()
s_v.fit(X_train, y_train)
y_pred = s_v.predict(X_test)
modelo['s_v_rbf'] = accuracy_score(y_test, y_pred)

In [31]:
modelo

{'Reg log': 0.7937219730941704,
 'ADD': 0.7757847533632287,
 'bosque': 0.7623318385650224,
 's_v': 0.7668161434977578,
 's_v_rbf': 0.6098654708520179}

## Hiperparamtreos de LR

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
rl = LogisticRegression(max_iter = 1000)
rl.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 1000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [34]:
params = {
    'penalty': ['l1', 'l2', None],
    'C': [0.1, 0.5, 1.0, 5.0, 10.0],
    'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']
   
}

grid_search = GridSearchCV(rl, params, cv=10, scoring = 'accuracy')
grid_search.fit(X_train, y_train)

# Imprimir los hiperparámetros óptimos
print("Mejores hiperparámetros:", grid_search.best_params_)







150 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\USUARIO\miniconda3\envs\jc\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\USUARIO\miniconda3\envs\jc\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\USUARIO\miniconda3\envs\jc\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 

Mejores hiperparámetros: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}


In [35]:
mejor_rl = grid_search.best_estimator_
y_pred = mejor_rl.predict(X_test)
modelo['mejor_rl'] = accuracy_score(y_test, y_pred)

In [36]:
modelo

{'Reg log': 0.7937219730941704,
 'ADD': 0.7757847533632287,
 'bosque': 0.7623318385650224,
 's_v': 0.7668161434977578,
 's_v_rbf': 0.6098654708520179,
 'mejor_rl': 0.7713004484304933}

# Eliminar atributos

In [59]:
X = train[['Pclass', 'Sex', 'Age']]
X_t = test[['Pclass', 'Sex', 'Age']]
y = train.Survived.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Sex     418 non-null    int64  
 2   Age     418 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 9.9 KB


In [66]:
bosque = RandomForestClassifier()
bosque.fit(X_train, y_train)
y_pred = bosque.predict(X_test)
modelos_n['rf'] = accuracy_score(y_test, y_pred)


In [63]:
modelos_n = {}
reg = LogisticRegression(max_iter = 1000)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
modelos_n['reg'] = accuracy_score(y_test, y_pred)

In [64]:
s_v = SVC(kernel='linear')
s_v.fit(X_train, y_train)
y_pred = s_v.predict(X_test)
modelos_n['s_v'] = accuracy_score(y_test, y_pred)

In [67]:
modelos_n

{'reg': 0.7757847533632287,
 's_v': 0.7668161434977578,
 'rf': 0.7623318385650224}

In [69]:
predcir = bosque.predict(X_t)
bosque = pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Survived': predcir
})
guardar(bosque, 'bosque_')

In [None]:
clf = DecisionTreeClassifier(max_depth = 5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
modelos_n['tree'] = accuracy_score(y_test, y_pred)


In [None]:
modelos_n, modelo

## Resultado final:
Score: 0.77990
los hiperparametros usados para random forest son Mejores hiperparámetros: {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}

las columnas 'Pclass', 'Sex', 'Age', archivo bosque_

In [75]:
X_t.columns

Index(['Pclass', 'Sex', 'Age'], dtype='object')

In [76]:
params = {
    'n_estimators': [100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 20]
}

bosque = RandomForestClassifier()

grid_search = GridSearchCV(bosque, params, cv = 10, scoring = 'accuracy')
grid_search.fit(X_train, y_train)

# Imprimir los hiperparámetros óptimos
print("Mejores hiperparámetros:", grid_search.best_params_)

Mejores hiperparámetros: {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 150}


In [77]:
bosque_ = grid_search.best_estimator_
y_pred = bosque_.predict(X_test)
modelos_n['bosque_'] = accuracy_score(y_test, y_pred)

In [78]:
predcir = bosque_.predict(X_t)
bosque = pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Survived': predcir
})
guardar(bosque, 'bosque_2')

In [79]:
modelos_n

{'reg': 0.7757847533632287,
 's_v': 0.7668161434977578,
 'rf': 0.7623318385650224,
 'bosque_': 0.7757847533632287}