In [1]:
##############################################
# Logistic regression, using statsmodels and sklearn
# https://www.geeksforgeeks.org/logistic-regression-using-statsmodels/
# https://www.youtube.com/watch?v=bSXIbCZNBw0&list=PL2vnxMe6QbfKpOxM-F3sNvGPOdzlRcZ7L&index=1

In [2]:
import statsmodels.api as sm 
import pandas as pd  
from sklearn.metrics import (confusion_matrix, accuracy_score) 

In [3]:
# dataset  
sup_reg_part_df = pd.read_csv("/Users/anm/code/RegressionMECO/data/MECO/sup_reg_part_df.csv", sep='\t')

In [4]:
sup_reg_part_df

Unnamed: 0,trialid,ianum,ia,surprisal,uniform_id,reg.out
0,4.0,2.0,thylacine,13.792913,en_10,1
1,4.0,2.0,thylacine,13.792913,en_101,1
2,4.0,2.0,thylacine,13.792913,en_102,1
3,4.0,2.0,thylacine,13.792913,en_11,1
4,4.0,2.0,thylacine,13.792913,en_14,1
...,...,...,...,...,...,...
62522,12.0,164.0,organizations.,1.011428,en_94,1
62523,12.0,164.0,organizations.,1.011428,en_95,1
62524,12.0,164.0,organizations.,1.011428,en_97,1
62525,12.0,164.0,organizations.,1.011428,en_98,1


In [5]:
# training dataset
training_df = sup_reg_part_df[(sup_reg_part_df['trialid'] < 10)]

In [6]:
training_df

Unnamed: 0,trialid,ianum,ia,surprisal,uniform_id,reg.out
0,4.0,2.0,thylacine,13.792913,en_10,1
1,4.0,2.0,thylacine,13.792913,en_101,1
2,4.0,2.0,thylacine,13.792913,en_102,1
3,4.0,2.0,thylacine,13.792913,en_11,1
4,4.0,2.0,thylacine,13.792913,en_14,1
...,...,...,...,...,...,...
43238,9.0,200.0,venom.,3.098823,en_94,0
43239,9.0,200.0,venom.,3.098823,en_95,1
43240,9.0,200.0,venom.,3.098823,en_97,1
43241,9.0,200.0,venom.,3.098823,en_98,0


In [7]:
# test dataset
test_df = sup_reg_part_df[(sup_reg_part_df['trialid'] >= 10)]

In [8]:
test_df

Unnamed: 0,trialid,ianum,ia,surprisal,uniform_id,reg.out
43243,10.0,2.0,national,12.362928,en_10,1
43244,10.0,2.0,national,12.362928,en_101,1
43245,10.0,2.0,national,12.362928,en_102,1
43246,10.0,2.0,national,12.362928,en_11,1
43247,10.0,2.0,national,12.362928,en_14,1
...,...,...,...,...,...,...
62522,12.0,164.0,organizations.,1.011428,en_94,1
62523,12.0,164.0,organizations.,1.011428,en_95,1
62524,12.0,164.0,organizations.,1.011428,en_97,1
62525,12.0,164.0,organizations.,1.011428,en_98,1


In [9]:
# defining the dependent and independent variables 
Xtrain = training_df[['surprisal']] 
ytrain = training_df[['reg.out']] 

In [10]:
# building the model and fitting the data 
log_reg = sm.Logit(ytrain, Xtrain).fit()

Optimization terminated successfully.
         Current function value: 0.577159
         Iterations 5


In [11]:
# printing the summary table with coef and pvalue
print(log_reg.summary()) 

                           Logit Regression Results                           
Dep. Variable:                reg.out   No. Observations:                43243
Model:                          Logit   Df Residuals:                    43242
Method:                           MLE   Df Model:                            0
Date:                Tue, 19 Mar 2024   Pseudo R-squ.:                 0.05600
Time:                        17:39:27   Log-Likelihood:                -24958.
converged:                       True   LL-Null:                       -26439.
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
surprisal      0.1808      0.002     85.299      0.000       0.177       0.185


In [12]:
# Explanation of some of the terms in the summary table:
# 
# coef : the coefficients of the independent variables in the regression equation.
# Log-Likelihood : the natural logarithm of the Maximum Likelihood Estimation(MLE) function. MLE is the optimization process of finding the set of parameters that result in the best fit.
# LL-Null : the value of log-likelihood of the model when no independent variable is included(only an intercept is included).
# Pseudo R-squ. : a substitute for the R-squared value in Least Squares linear regression. It is the ratio of the log-likelihood of the null model to that of the full model.

In [13]:
# Predicting on test data

# loading the testing dataset   
# df = pd.read_csv('logit_test1.csv', index_col = 0) 
# here it will be already done as the test_df

In [14]:
# defining the dependent and independent variables 
Xtest = test_df[['surprisal']] 
ytest = test_df['reg.out'] 

In [15]:
# performing predictions on the test dataset 
yhat = log_reg.predict(Xtest) 
# prediction = list(map(round, yhat)) #from original script
prediction = (yhat > 0.6).values.astype(int) #adapted to this dataset

In [16]:
print(yhat)

43243    0.903351
43244    0.903351
43245    0.903351
43246    0.903351
43247    0.903351
           ...   
62522    0.545586
62523    0.545586
62524    0.545586
62525    0.545586
62526    0.545586
Length: 19284, dtype: float64


In [17]:
# comparing original and predicted values of y 
print('Actual values', list(ytest.values)) 

Actual values [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0

In [18]:
print('Predictions :', prediction) 

Predictions : [1 1 1 ... 0 0 0]


In [19]:
# confusion matrix 
cm = confusion_matrix(ytest, prediction)  
print ("Confusion Matrix : \n", cm)  

Confusion Matrix : 
 [[2902 3257]
 [4065 9060]]


In [20]:
# accuracy score of the model 
print('Test accuracy = ', accuracy_score(ytest, prediction))

Test accuracy =  0.6203069902509852
