In [2]:
#!/usr/bin/env python
# multinomial_logistic_regression.py
# Author : Saimadhu Polamuri
# Date: 05-May-2017
# About: Multinomial logistic regression model implementation
 
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split
 
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.graph_objs import *
%matplotlib inline
py.sign_in('marab', 'aVB8eKCue1UMOMzWxkVx')
 
# Dataset Path
DATASET_PATH = "./Input/Debugging.txt"



In [65]:

 
 
def scatter_with_color_dimension_graph(feature, target, layout_labels):
    """
    Scatter with color dimension graph to visualize the density of the
    Given feature with target
    :param feature:
    :param target:
    :param layout_labels:
    :return:
    """
    trace1 = go.Scatter(
        y=feature,
        mode='markers',
        marker=dict(
            size='16',
            color=target,
            colorscale='Viridis',
            showscale=True
        )
    )
    layout = go.Layout(
        title=layout_labels[2],
        xaxis=dict(title=layout_labels[0]), yaxis=dict(title=layout_labels[1]))
    data = [trace1]
    fig = Figure(data=data, layout=layout)
    # plot_url = py.plot(fig)
    py.image.save_as(fig, filename=layout_labels[1] + '_Density.png')
 
 
def create_density_graph(dataset, features_header, target_header):
    """
    Create density graph for each feature with target
    :param dataset:
    :param features_header:
    :param target_header:
    :return:
    """
    for feature_header in features_header:
        print "Creating density graph for feature:: {} ".format(feature_header)
        layout_headers = ["Number of Observation", feature_header + " & " + target_header,
                          feature_header + " & " + target_header + " Density Graph"]
        scatter_with_color_dimension_graph(dataset[feature_header], dataset[target_header], layout_headers)
 
 
def main():
    glass_data_headers = ["Expertise2","Guided2","Outcome2"]
    glass_data = pd.read_csv(DATASET_PATH, names=glass_data_headers)
 
    print "Number of observations :: ", len(glass_data.index)
    print "Number of columns :: ", len(glass_data.columns)
    print "Headers :: ", glass_data.columns.values
    print "Target :: ", glass_data[glass_data_headers[-1]]
#     Train , Test data split
 
#     print "glass_data_Expertise :: ", list(glass_data["Expertise2"][:0])
#     print "glass_data_target :: ", list(glass_data["Outcome2"][:2])
    graph_labels = ["Number of Observations", "expertise & Glass Type", "Sample Expertise - Glass Type Density Graph"]
    # scatter_with_color_dimension_graph(list(glass_data["RI"][:10]),
    #                                    np.array([1, 1, 1, 2, 2, 3, 4, 5, 6, 7]), graph_labels)
 
    # print "glass_data_headers[:-1] :: ", glass_data_headers[:-1]
    # print "glass_data_headers[-1] :: ", glass_data_headers[-1]
    # create_density_graph(glass_data, glass_data_headers[1:-1], glass_data_headers[-1])
    print "expertise::" ,list(glass_data[glass_data_headers[0]])
    print "Guided::", list(glass_data[glass_data_headers[1]])
    print "whats this::", list(glass_data[glass_data_headers[:-1]])
    print list(glass_data[glass_data_headers[-1]])
    train_x, test_x, train_y, test_y = train_test_split(glass_data[glass_data_headers[:-2]],
                                                        glass_data[glass_data_headers[-1]], train_size=0.9)
#     train_x, test_x, train_y, test_y = train_test_split(glass_data[glass_data_headers[:-2]],
#                                                         glass_data[glass_data_headers[-1]], train_size=0.9)
    # Train multi-classification model with logistic regression
    lr = linear_model.LogisticRegression()
    lr.fit(train_x, train_y)
 
    # Train multinomial logistic regression model
    mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(train_x, train_y)
 
    print "Logistic regression Train Accuracy :: ", metrics.accuracy_score(train_y, lr.predict(train_x))
    print "Logistic regression Test Accuracy :: ", metrics.accuracy_score(test_y, lr.predict(test_x))
 
    print "Multinomial Logistic regression Train Accuracy :: ", metrics.accuracy_score(train_y, mul_lr.predict(train_x))
    print "Multinomial Logistic regression Test Accuracy :: ", metrics.accuracy_score(test_y, mul_lr.predict(test_x))
 
 
if __name__ == "__main__":
    main()

Number of observations ::  27
Number of columns ::  3
Headers ::  ['Expertise2' 'Guided2' 'Outcome2']
Target ::  0     0
1     0
2     1
3     0
4     1
5     0
6     1
7     2
8     2
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    2
17    0
18    0
19    0
20    2
21    0
22    0
23    2
24    0
25    1
26    0
Name: Outcome2, dtype: int64
expertise:: [7, 7, 6, 7, 8, 5, 4, 7, 4, 7, 5, 5, 6, 6, 6, 6, 5, 5, 5, 6, 9, 6, 7, 6, 7, 7, 7]
Guided:: [0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0]
whats this:: ['Expertise2', 'Guided2']
[0, 0, 1, 0, 1, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 1, 0]
Logistic regression Train Accuracy ::  0.625
Logistic regression Test Accuracy ::  1.0
Multinomial Logistic regression Train Accuracy ::  0.625
Multinomial Logistic regression Test Accuracy ::  1.0


In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

  from pandas.core import datetools


In [3]:
# load dataset
DATASET_PATH = "./Input/Debugging.txt"
data_headers = ["Expertise2","Guided2","Outcome2"]
dta = pd.read_csv(DATASET_PATH, names=data_headers)
# add "Outcome2" column: 1 represents having affairs, 0 represents not
dta['Outcome2'] = dta[data_headers[-1]]
dta.groupby('Outcome2').mean()

Unnamed: 0_level_0,Expertise2,Guided2
Outcome2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6.111111,0.277778
1,6.25,0.75
2,6.2,1.0


In [4]:
dta.groupby('Expertise2').mean()


Unnamed: 0_level_0,Guided2,Outcome2
Expertise2,Unnamed: 1_level_1,Unnamed: 2_level_1
4,1.0,1.5
5,0.666667,0.333333
6,0.5,0.375
7,0.222222,0.333333
8,0.0,1.0
9,1.0,2.0


In [7]:
y, X = dmatrices('Outcome2 ~ Expertise2',
                  dta, return_type="dataframe")
print X.columns

Index([u'Intercept', u'Expertise2'], dtype='object')


In [90]:
y = np.ravel(y)

In [91]:
model = LogisticRegression()
model = model.fit(X, y)

# check the accuracy on the training set
model.score(X, y)

0.66666666666666663

In [92]:
y.mean()

0.51851851851851849

In [93]:
# examine the coefficients
pd.DataFrame(zip(X.columns, np.transpose(model.coef_)))

Unnamed: 0,0,1
0,Intercept,"[0.180135020076, -0.245137207404, -0.214524403..."
1,Expertise2,"[0.0493669437572, -0.196133379204, -0.16453337..."


In [94]:
DATASET_PATH = "./Input/Debugging.txt"
data_headers = ["Expertise2","Guided2","Outcome2"]
dta = pd.read_csv(DATASET_PATH, names=data_headers)



In [123]:
from sklearn.linear_model import LogisticRegression
dta['Guided2'] = dta['Guided2'].astype('str')

df = pd.get_dummies(dta,columns=['Guided2'],drop_first = True)
lr = LogisticRegression()
lr.fit(df[['Expertise2','Guided2_1']],df['Outcome2'])
lr.predict(df[['Expertise2','Guided2_1']])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [134]:
import statsmodels.api as sm
X = df[['Expertise2','Guided2_1']]
X = sm.add_constant(X)
mod = sm.OLS(df['Outcome2'], X)

In [135]:
res = mod.fit()

In [136]:
res.summary()

0,1,2,3
Dep. Variable:,Outcome2,R-squared:,0.404
Model:,OLS,Adj. R-squared:,0.354
Method:,Least Squares,F-statistic:,8.134
Date:,"Thu, 19 Apr 2018",Prob (F-statistic):,0.00201
Time:,20:01:14,Log-Likelihood:,-24.872
No. Observations:,27,AIC:,55.74
Df Residuals:,24,BIC:,59.63
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.0525,0.763,-1.380,0.180,-2.627,0.522
Expertise2,0.1729,0.114,1.512,0.143,-0.063,0.409
Guided2_1,1.0549,0.262,4.026,0.000,0.514,1.596

0,1,2,3
Omnibus:,0.285,Durbin-Watson:,1.644
Prob(Omnibus):,0.867,Jarque-Bera (JB):,0.268
Skew:,0.205,Prob(JB):,0.875
Kurtosis:,2.734,Cond. No.,39.9


In [138]:
from statsmodels.regression.linear_model import RegressionResults

In [139]:
A = np.identity(len(res.params))
A = A[1:,:]

In [148]:
RegressionResults.wald_test(np.array([1,2,3]))

TypeError: unbound method wald_test() must be called with RegressionResults instance as first argument (got ndarray instance instead)

In [145]:
np.array(res.params)

array([-1.05251339,  0.17291415,  1.05493177])