# logistic regression
$p(y_i = 1 \mid x_i) = \mathrm{sigmoid}(W \cdot x_i)$  
$W$: coefficient vector  
$x_i$: input vector for sample $i$ (same length as $W$)  
$W$ is determined by maximum likelihood.

In [None]:
# import required modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module path
    from sklearn.cross_validation import train_test_split

In [2]:
# Load the "fair" dataset bundled with statsmodels as a pandas DataFrame
df = (
    sm.datasets
    .fair
    .load_pandas()
    .data
)

In [3]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [4]:
# One-hot encode the two occupation codes.
# The indicator columns are renamed positionally, so this relies on each
# variable having exactly six distinct codes (pandas raises otherwise).
occ_dummies = pd.get_dummies(df["occupation"])
occ_dummies.columns = list(map("oc{}".format, range(1, 7)))

hus_occ_dummies = pd.get_dummies(df["occupation_husb"])
hus_occ_dummies.columns = list(map("hoc{}".format, range(1, 7)))

In [5]:
# The raw occupation codes are superseded by the dummy columns built above
df = df.drop(labels=["occupation", "occupation_husb"], axis="columns")

In [6]:
# Append both sets of dummy columns to the right of the remaining features
df = pd.concat(
    objs=[df, occ_dummies, hus_occ_dummies],
    axis="columns",
)

In [7]:
# Binary target: True for every row whose `affairs` value is positive
Y = df["affairs"].gt(0)

In [8]:
# Drop the first dummy of each occupation group (it becomes the reference
# level; presumably done so the dummies are not perfectly collinear) and
# the `affairs` column, which was already turned into the label Y.
df = df.drop(labels=["oc1", "hoc1", "affairs"], axis="columns")

In [9]:
# Preview the first five rows of the feature matrix used for fitting
df.head(n=5)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,oc2,oc3,oc4,oc5,oc6,hoc2,hoc3,hoc4,hoc5,hoc6
0,3.0,32.0,9.0,3.0,3.0,17.0,1,0,0,0,0,0,0,0,1,0
1,3.0,27.0,13.0,3.0,1.0,14.0,0,1,0,0,0,0,0,1,0,0
2,4.0,22.0,2.5,0.0,1.0,16.0,0,1,0,0,0,0,0,0,1,0
3,4.0,37.0,16.5,4.0,3.0,16.0,0,0,0,1,0,0,0,0,1,0
4,5.0,27.0,9.0,1.0,1.0,14.0,0,1,0,0,0,0,0,1,0,0


In [10]:
# Preview the first five labels
Y.head(n=5)

0    True
1    True
2    True
3    True
4    True
Name: affairs, dtype: bool

In [12]:
# Flatten the label Series into a plain 1-D NumPy array for scikit-learn
Y = np.asarray(Y).ravel()

In [17]:
# Fit a logistic regression on the full data set.
# The score below is mean accuracy on the same rows the model was fitted on,
# i.e. training accuracy, not an estimate of out-of-sample performance.
logistic_model = LogisticRegression().fit(df, Y)
logistic_model.score(df, Y)

0.7258875274897895

In [22]:
# Show each feature name next to its fitted coefficient (one row per feature)
pd.DataFrame(
    data=[df.columns, logistic_model.coef_[0]],
).transpose()

Unnamed: 0,0,1
0,rate_marriage,-0.697926
1,age,-0.0563656
2,yrs_married,0.10392
3,children,0.0181619
4,religious,-0.368519
5,educ,0.00875341
6,oc2,0.298057
7,oc3,0.607994
8,oc4,0.346228
9,oc5,0.942511


In [24]:
# hold-out evaluation: a single random train/test split (this is not k-fold
# cross validation). random_state is fixed so that the split, and therefore the
# accuracy reported below, is identical on every Restart & Run All; the split
# proportions are left at scikit-learn's default.
x_train, x_test, y_train, y_test = train_test_split(df, Y, random_state=0)

# fit on the training rows only, then predict the rows the model has not seen
logistic_crossmodel = LogisticRegression()
logistic_crossmodel.fit(x_train, y_train)
prediction = logistic_crossmodel.predict(x_test)

# accuracy on the held-out rows
metrics.accuracy_score(y_test, prediction)

0.7236180904522613