# Ejemplo de Regresión Logística
<img src="https://raw.githubusercontent.com/fhernanb/fhernanb.github.io/master/docs/logo_unal_color.png" alt="drawing" width="200"/>

# Objetivo
En este ejemplo se busca crear un clasificador para saber si una persona sobrevive o no en el naufragio del Titanic.

<img src="https://raw.githubusercontent.com/fhernanb/Python-para-estadistica/master/imagenes/titanic.png" alt="drawing" width="900">

In [1]:
import pandas as pd  # Librería con las funciones read_table y read_csv
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [2]:
dt = pd.read_csv("titanic.csv", sep=",")
dt.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [3]:
dt_pclass = pd.get_dummies(dt['Pclass'])
dt_new = pd.concat([dt, dt_pclass], axis=1)
dt_new.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,1,2,3
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,0,0,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,1,0,0
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,0,0,1
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,1,0,0
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,0,0,1


In [4]:
dtnew_sex = pd.get_dummies(dt['Sex'])
dt_new = pd.concat([dt_new, dtnew_sex], axis=1)
dt_new.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,1,2,3,female,male
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,0,0,1,0,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,1,0,0,1,0
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,0,0,1,1,0
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,1,0,0,1,0
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,0,0,1,0,1


## Creación de los datos de entrenamiento (train) y de validación (test)
Para particionar los datos originales se usa la función `train_test_split`, para mayores detalles se recomienda consultar los parámetros de la función se recomienda consultar este [enlace](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [5]:
datos = dt_new[[2, 3, 'male', 'Age', 'Fare']]
respuesta = dt_new['Survived']
X_train, X_test, y_train, y_test = train_test_split(datos, respuesta, test_size=0.25)

In [6]:
mod = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
mod

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
mod.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [8]:
mod.predict(X_train)[0:14]

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0], dtype=int64)

In [9]:
mod.predict(X_test)[0:14]

array([1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [10]:
mod.predict_proba(X_train)

array([[0.75769294, 0.24230706],
       [0.41259575, 0.58740425],
       [0.5787238 , 0.4212762 ],
       ...,
       [0.64883997, 0.35116003],
       [0.77014875, 0.22985125],
       [0.7225097 , 0.2774903 ]])

In [11]:
mod.predict_proba(X_test)

array([[0.31726309, 0.68273691],
       [0.28276227, 0.71723773],
       [0.56280156, 0.43719844],
       [0.6674017 , 0.3325983 ],
       [0.51247523, 0.48752477],
       [0.74189701, 0.25810299],
       [0.33533701, 0.66466299],
       [0.78109863, 0.21890137],
       [0.74589324, 0.25410676],
       [0.68732123, 0.31267877],
       [0.42378173, 0.57621827],
       [0.50793282, 0.49206718],
       [0.53164214, 0.46835786],
       [0.74134193, 0.25865807],
       [0.70384171, 0.29615829],
       [0.79359308, 0.20640692],
       [0.36504074, 0.63495926],
       [0.34905225, 0.65094775],
       [0.79240584, 0.20759416],
       [0.22645925, 0.77354075],
       [0.36963335, 0.63036665],
       [0.83516281, 0.16483719],
       [0.75570605, 0.24429395],
       [0.28951901, 0.71048099],
       [0.53366765, 0.46633235],
       [0.26310336, 0.73689664],
       [0.68119893, 0.31880107],
       [0.87400755, 0.12599245],
       [0.74204985, 0.25795015],
       [0.48520695, 0.51479305],
       [0.

In [12]:
mod.score(X_train, y_train)

0.8150375939849624

In [13]:
mod.score(X_test, y_test)

0.7477477477477478

In [20]:
import statsmodels.api as sm
logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.475109
         Iterations 6
                        Results: Logit
Model:              Logit            No. Iterations:   6.0000  
Dependent Variable: Survived         Pseudo R-squared: 0.275   
Date:               2018-10-03 18:39 AIC:              641.8953
No. Observations:   665              BIC:              664.3942
Df Model:           4                Log-Likelihood:   -315.95 
Df Residuals:       660              LL-Null:          -436.00 
Converged:          1.0000           Scale:            1.0000  
-----------------------------------------------------------------
        Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
-----------------------------------------------------------------
2       0.5764     0.2359     2.4429   0.0146    0.1139    1.0388
3      -0.3213     0.1811    -1.7741   0.0760   -0.6762    0.0337
male   -2.1421     0.1992   -10.7552   0.0000   -2.5324   -1.7517
Age     0.0042 

In [25]:
result.summary2()

0,1,2,3
Model:,Logit,No. Iterations:,6.0
Dependent Variable:,Survived,Pseudo R-squared:,0.275
Date:,2018-10-03 18:39,AIC:,641.8953
No. Observations:,665,BIC:,664.3942
Df Model:,4,Log-Likelihood:,-315.95
Df Residuals:,660,LL-Null:,-436.0
Converged:,1.0000,Scale:,1.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
2,0.5764,0.2359,2.4429,0.0146,0.1139,1.0388
3,-0.3213,0.1811,-1.7741,0.0760,-0.6762,0.0337
male,-2.1421,0.1992,-10.7552,0.0000,-2.5324,-1.7517
Age,0.0042,0.0057,0.7411,0.4586,-0.0070,0.0155
Fare,0.0153,0.0030,5.1307,0.0000,0.0095,0.0212


In [None]:
train_cols = data.columns[1:]
# Index([gre, gpa, prestige_2, prestige_3, prestige_4], dtype=object)

logit = sm.Logit(data['admit'], data[train_cols])

# fit the model
result = logit.fit()