In [1]:
import numpy as np
import pandas as pd

### Prompt

In [2]:
inp=input('Please enter the name of the data file (e.g. sample1.csv): ')

Please enter the name of the data file (e.g. sample1.csv): sample1.csv


In [3]:
data = pd.read_csv(inp)

In [4]:
data

Unnamed: 0,x1,x2,y
0,1.76405,-0.37,1.0
1,0.40016,-0.24,0.0
2,0.97874,1.10,1.0
3,2.24089,0.66,1.0
4,1.86756,0.64,1.0
...,...,...,...
195,-0.17155,-0.76,0.0
196,0.77179,0.86,1.0
197,0.82350,1.14,1.0
198,2.16324,1.47,1.0


In [5]:
x=data[['x1','x2']]
y=data['y']

### Logistic Regression Analysis via Gradient Descent Algorithm

In [6]:
#initial coefficients
np.random.seed(0)
b = pd.Series(np.random.uniform(low=-1, high=1, size=3))
b

0    0.097627
1    0.430379
2    0.205527
dtype: float64

In [7]:
alpha = 0.5 #learning rate
epochs = 1000 #number of iterations
n = len(x) #size

In [8]:
x

Unnamed: 0,x1,x2
0,1.76405,-0.37
1,0.40016,-0.24
2,0.97874,1.10
3,2.24089,0.66
4,1.86756,0.64
...,...,...
195,-0.17155,-0.76
196,0.77179,0.86
197,0.82350,1.14
198,2.16324,1.47


In [9]:
z = pd.DataFrame(np.hstack((np.ones((n,1)),x))) #add constant
z

Unnamed: 0,0,1,2
0,1.0,1.76405,-0.37
1,1.0,0.40016,-0.24
2,1.0,0.97874,1.10
3,1.0,2.24089,0.66
4,1.0,1.86756,0.64
...,...,...,...
195,1.0,-0.17155,-0.76
196,1.0,0.77179,0.86
197,1.0,0.82350,1.14
198,1.0,2.16324,1.47


In [10]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [11]:
for i in range(epochs):
    b[0]=b[0]-alpha*((sigmoid(z.dot(b.T))-y)*z[0]).mean()
    b[1]=b[1]-alpha*((sigmoid(z.dot(b.T))-y)*z[1]).mean()
    b[2]=b[2]-alpha*((sigmoid(z.dot(b.T))-y)*z[2]).mean()

In [12]:
#result
b=np.round(b,4)
b

0    0.1682
1    2.8210
2    2.8184
dtype: float64

### Statsmodels

In [13]:
import statsmodels.formula.api as sm

In [14]:
log_reg = sm.logit('y ~ x1+x2', data = data).fit()

Optimization terminated successfully.
         Current function value: 0.283800
         Iterations 8


In [15]:
print(log_reg.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  200
Model:                          Logit   Df Residuals:                      197
Method:                           MLE   Df Model:                            2
Date:                Sat, 01 Oct 2022   Pseudo R-squ.:                  0.5904
Time:                        19:22:37   Log-Likelihood:                -56.760
converged:                       True   LL-Null:                       -138.59
Covariance Type:            nonrobust   LLR p-value:                 2.897e-36
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1683      0.240      0.702      0.482      -0.301       0.638
x1             2.8218      0.444      6.348      0.000       1.951       3.693
x2             2.8193      0.470      5.995      0.0

In [16]:
log_reg.params=np.round(log_reg.params, 4)
log_reg.params

Intercept    0.1683
x1           2.8218
x2           2.8193
dtype: float64

### Write File

In [17]:
f = open("HW4_output.txt",'w')
text = "Coefficients by Gradient Descent Method\n-------------\nConstant: "+str(b[0])+"\n"
for i in range(len(x.columns)):
    text += "Beta"+str(i+1)+": "+str(b[i+1])+"\n"
text += "\nCoefficients by Statmodels\n-------------\nConstant: "+str(log_reg.params[0])+"\n"
for i in range(len(x.columns)):
    text += "Beta"+str(i+1)+": "+str(log_reg.params[i+1])+"\n"
f.write(text)
f.close()