In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
%matplotlib inline

  from pandas.core import datetools


In [2]:
dta = sm.datasets.fair.load_pandas().data

In [5]:
dta.shape
dta.info()
dta.isnull().sum(axis=0)
dta.affairs.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6366 entries, 0 to 6365
Data columns (total 9 columns):
rate_marriage      6366 non-null float64
age                6366 non-null float64
yrs_married        6366 non-null float64
children           6366 non-null float64
religious          6366 non-null float64
educ               6366 non-null float64
occupation         6366 non-null float64
occupation_husb    6366 non-null float64
affairs            6366 non-null float64
dtypes: float64(9)
memory usage: 447.6 KB


count    6366.000000
mean        0.705374
std         2.203374
min         0.000000
25%         0.000000
50%         0.000000
75%         0.484848
max        57.599991
Name: affairs, dtype: float64

In [6]:
# add "affair" column: 1 represents having affairs, 0 represents not
dta['affair'] = (dta.affairs > 0).astype(int)
y, X = dmatrices('affair~rate_marriage+age+yrs_married+children+religious+educ+C(occupation)+C(occupation_husb)',
dta, return_type="dataframe")
X = X.rename(columns = {'C(occupation)[T.2.0]':'occ_2','C(occupation)[T.3.0]':'occ_3','C(occupation)[T.4.0]':'occ_4',\
'C(occupation)[T.5.0]':'occ_5','C(occupation)[T.6.0]':'occ_6','C(occupation_husb)[T.2.0]':'occ_husb_2',\
'C(occupation_husb)[T.3.0]':'occ_husb_3','C(occupation_husb)[T.4.0]':'occ_husb_4','C(occupation_husb)[T.5.0]':'occ_husb_5',\
'C(occupation_husb)[T.6.0]':'occ_husb_6'})
y = np.ravel(y)

In [7]:
X.head()

Unnamed: 0,Intercept,occ_2,occ_3,occ_4,occ_5,occ_6,occ_husb_2,occ_husb_3,occ_husb_4,occ_husb_5,occ_husb_6,rate_marriage,age,yrs_married,children,religious,educ
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,32.0,9.0,3.0,3.0,17.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,27.0,13.0,3.0,1.0,14.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,22.0,2.5,0.0,1.0,16.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,37.0,16.5,4.0,3.0,16.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,27.0,9.0,1.0,1.0,14.0


AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [9]:
# LogisticRegression with X and y
model=LogisticRegression()
model=model.fit(X,y)

In [10]:
#testing on training dataset
model.score(X,y)

0.7258875274897895

In [11]:
# Model coefficient of all features
pd.DataFrame(list(zip(X.columns,np.transpose(model.coef_))))

Unnamed: 0,0,1
0,Intercept,[1.4898361980412742]
1,occ_2,[0.18806630047913203]
2,occ_3,[0.49894789517697424]
3,occ_4,[0.2506682569227064]
4,occ_5,[0.8390079138437434]
5,occ_6,[0.8339083831094645]
6,occ_husb_2,[0.1906356800608893]
7,occ_husb_3,[0.2978324105777254]
8,occ_husb_4,[0.16140858815236364]
9,occ_husb_5,[0.18777060533734802]


In [12]:
#Training and Testing datasplit
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [13]:
model2=LogisticRegression()
model2.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
#Prediction of testing data
predicted=model2.predict(X_test)
predicted

array([1., 0., 0., ..., 0., 0., 0.])

In [17]:
#Probabilities for 0 and 1
prob=model2.predict_proba(X_test)
prob

array([[0.35146325, 0.64853675],
       [0.90955083, 0.09044917],
       [0.72567332, 0.27432668],
       ...,
       [0.55727381, 0.44272619],
       [0.8120705 , 0.1879295 ],
       [0.74734599, 0.25265401]])

In [28]:
# Model Validation metrics on Test data -----
print("Accuracy Score :\n",metrics.accuracy_score (y_test,predicted))
print("\nROC AUC Score :\n",metrics.roc_auc_score(y_test,prob[:,1]))
print("\nClassification Report :\n",metrics.classification_report(y_test,predicted))
print("Confusion Matrix :\n",metrics.confusion_matrix(y_test,predicted))

Accuracy Score :
 0.7298429319371728

ROC AUC Score :
 0.745950606950631

Classification Report :
              precision    recall  f1-score   support

        0.0       0.75      0.90      0.82      1303
        1.0       0.63      0.37      0.47       607

avg / total       0.71      0.73      0.71      1910

Confusion Matrix :
 [[1169  134]
 [ 382  225]]
