# Ejemplo de Regresión Logística
<img src="https://raw.githubusercontent.com/fhernanb/fhernanb.github.io/master/docs/logo_unal_color.png" alt="drawing" width="200"/>

# Objetivo
En este ejemplo se busca crear un clasificador para saber si una persona sobrevive o no en el naufragio del Titanic.

<img src="https://raw.githubusercontent.com/fhernanb/Python-para-estadistica/master/imagenes/titanic.png" alt="drawing" width="900">

In [1]:
import pandas as pd  # Librería con las funciones read_table y read_csv
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [2]:
dt = pd.read_csv("titanic.csv", sep=",")
dt.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [3]:
dt_pclass = pd.get_dummies(dt['Pclass'])
dt_new = pd.concat([dt, dt_pclass], axis=1)
dt_new.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,1,2,3
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,0,0,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,1,0,0
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,0,0,1
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,1,0,0
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,0,0,1


In [4]:
dtnew_sex = pd.get_dummies(dt['Sex'])
dt_new = pd.concat([dt_new, dtnew_sex], axis=1)
dt_new.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,1,2,3,female,male
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,0,0,1,0,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,1,0,0,1,0
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,0,0,1,1,0
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,1,0,0,1,0
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,0,0,1,0,1


## Creación de los datos de entrenamiento (train) y de validación (test)
Para particionar los datos originales se usa la función `train_test_split`, para mayores detalles se recomienda consultar los parámetros de la función se recomienda consultar este [enlace](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [5]:
datos = dt_new[[2, 3, 'male', 'Age', 'Fare']]
respuesta = dt_new['Survived']
X_train, X_test, y_train, y_test = train_test_split(datos, respuesta, test_size=0.25)

# Regresión logística usando `sklearn`

In [6]:
mod = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
mod

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
mod.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [8]:
mod.predict(X_train)[0:14]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

In [9]:
mod.predict(X_test)[0:14]

array([1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1], dtype=int64)

In [10]:
mod.predict_proba(X_train)

array([[0.77517676, 0.22482324],
       [0.71468786, 0.28531214],
       [0.73470519, 0.26529481],
       ...,
       [0.43880954, 0.56119046],
       [0.67515343, 0.32484657],
       [0.78153304, 0.21846696]])

In [11]:
mod.predict_proba(X_test)

array([[0.37809001, 0.62190999],
       [0.75206657, 0.24793343],
       [0.5180465 , 0.4819535 ],
       [0.41870855, 0.58129145],
       [0.73810861, 0.26189139],
       [0.429638  , 0.570362  ],
       [0.63798494, 0.36201506],
       [0.47183963, 0.52816037],
       [0.40414233, 0.59585767],
       [0.75205909, 0.24794091],
       [0.76026119, 0.23973881],
       [0.18089323, 0.81910677],
       [0.63318162, 0.36681838],
       [0.22348295, 0.77651705],
       [0.7621709 , 0.2378291 ],
       [0.16813571, 0.83186429],
       [0.65963331, 0.34036669],
       [0.84269732, 0.15730268],
       [0.19600784, 0.80399216],
       [0.4539221 , 0.5460779 ],
       [0.83516424, 0.16483576],
       [0.26573373, 0.73426627],
       [0.23459814, 0.76540186],
       [0.7784048 , 0.2215952 ],
       [0.73084488, 0.26915512],
       [0.7311363 , 0.2688637 ],
       [0.6995819 , 0.3004181 ],
       [0.42524281, 0.57475719],
       [0.74531555, 0.25468445],
       [0.53422567, 0.46577433],
       [0.

In [12]:
mod.score(X_train, y_train)

0.8135338345864662

In [13]:
mod.score(X_test, y_test)

0.7612612612612613

# Regresión logística usando `statsmodels`

In [22]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.482121
         Iterations 6
                        Results: Logit
Model:              Logit            No. Iterations:   6.0000  
Dependent Variable: Survived         Pseudo R-squared: 0.277   
Date:               2018-10-04 07:36 AIC:              651.2204
No. Observations:   665              BIC:              673.7193
Df Model:           4                Log-Likelihood:   -320.61 
Df Residuals:       660              LL-Null:          -443.18 
Converged:          1.0000           Scale:            1.0000  
-----------------------------------------------------------------
        Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
-----------------------------------------------------------------
2       0.6824     0.2370     2.8798   0.0040    0.2180    1.1469
3      -0.1548     0.1798    -0.8608   0.3894   -0.5071    0.1976
male   -2.2384     0.1982   -11.2916   0.0000   -2.6269   -1.8498
Age     0.0061 

In [23]:
result.summary2()

0,1,2,3
Model:,Logit,No. Iterations:,6.0
Dependent Variable:,Survived,Pseudo R-squared:,0.277
Date:,2018-10-04 07:36,AIC:,651.2204
No. Observations:,665,BIC:,673.7193
Df Model:,4,Log-Likelihood:,-320.61
Df Residuals:,660,LL-Null:,-443.18
Converged:,1.0000,Scale:,1.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
2,0.6824,0.2370,2.8798,0.0040,0.2180,1.1469
3,-0.1548,0.1798,-0.8608,0.3894,-0.5071,0.1976
male,-2.2384,0.1982,-11.2916,0.0000,-2.6269,-1.8498
Age,0.0061,0.0056,1.0964,0.2729,-0.0048,0.0171
Fare,0.0142,0.0028,5.1182,0.0000,0.0087,0.0196


In [24]:
train_cols = data.columns[1:]
# Index([gre, gpa, prestige_2, prestige_3, prestige_4], dtype=object)

logit = sm.Logit(data['admit'], data[train_cols])

# fit the model
result = logit.fit()

NameError: name 'data' is not defined

In [35]:
mod3 = smf.logit(formula='Survived ~ Age + Fare + Sex + Pclass', data=dt).fit()
print(mod3.params)

Optimization terminated successfully.
         Current function value: 0.451853
         Iterations 6
Intercept      4.839682
Sex[T.male]   -2.586525
Age           -0.034239
Fare           0.000320
Pclass        -1.220037
dtype: float64


In [39]:
mod3.summary2()

0,1,2,3
Model:,Logit,No. Iterations:,6.0
Dependent Variable:,Survived,Pseudo R-squared:,0.322
Date:,2018-10-04 07:49,AIC:,811.588
No. Observations:,887,BIC:,835.5273
Df Model:,4,Log-Likelihood:,-400.79
Df Residuals:,882,LL-Null:,-591.38
Converged:,1.0000,Scale:,1.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,4.8397,0.5252,9.2153,0.0000,3.8104,5.8690
Sex[T.male],-2.5865,0.1877,-13.7827,0.0000,-2.9543,-2.2187
Age,-0.0342,0.0072,-4.7710,0.0000,-0.0483,-0.0202
Fare,0.0003,0.0020,0.1568,0.8754,-0.0037,0.0043
Pclass,-1.2200,0.1417,-8.6098,0.0000,-1.4978,-0.9423
