In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

### Prompt for the train and test data file names

In [2]:
# Ask the user for the path of the training data file.
train_prompt = 'Please enter the name of the train data file (e.g. veh.dat): '
inp_train = input(train_prompt)

Please enter the name of the train data file (e.g. veh.dat): veh.dat


In [3]:
# Ask the user for the path of the test data file.
test_prompt = 'Please enter the name of the test data file (e.g. vehtest.dat): '
inp_test = input(test_prompt)

Please enter the name of the test data file (e.g. vehtest.dat): vehtest.dat


In [4]:
# Read both files with identical options; header=None makes pandas
# number the columns 0, 1, 2, ... instead of consuming the first row.
read_opts = dict(header=None)
train = pd.read_csv(inp_train, **read_opts)
test = pd.read_csv(inp_test, **read_opts)

In [5]:
# Predictors: the first 18 columns (positions 0-17) of the training frame.
n_features = 18
x_train = train.iloc[:, :n_features]
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,93,35,72,172,62,7,149,44,19,124,169,334,125,62,5,30,203,210
1,95,57,104,228,74,10,212,31,24,175,224,670,223,74,0,4,186,193
2,103,54,107,189,56,11,223,30,25,174,225,729,200,70,0,29,187,201
3,79,40,80,133,55,7,147,47,19,135,172,311,144,76,8,30,181,193
4,99,46,105,209,64,11,197,34,23,152,212,575,159,65,0,33,194,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,83,37,49,112,55,5,122,55,17,128,144,219,146,85,8,16,180,184
421,86,36,70,143,61,9,133,50,18,130,153,266,127,66,2,10,194,202
422,83,40,53,114,53,6,132,53,18,140,142,247,157,86,8,7,176,183
423,88,43,84,136,55,11,154,44,19,150,174,350,164,73,6,2,185,196


In [6]:
# Class label: the column at position 18 of the training frame.
label_position = 18
y_train = train.iloc[:, label_position]
y_train

0      1
1      1
2      1
3      1
4      1
      ..
420    4
421    4
422    4
423    4
424    4
Name: 18, Length: 425, dtype: int64

In [7]:
# Put an intercept column ("const") in front of the predictors so the
# logistic models fitted later include a bias term.
x_train = sm.add_constant(x_train, prepend=True)
x_train

Unnamed: 0,const,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.0,93,35,72,172,62,7,149,44,19,124,169,334,125,62,5,30,203,210
1,1.0,95,57,104,228,74,10,212,31,24,175,224,670,223,74,0,4,186,193
2,1.0,103,54,107,189,56,11,223,30,25,174,225,729,200,70,0,29,187,201
3,1.0,79,40,80,133,55,7,147,47,19,135,172,311,144,76,8,30,181,193
4,1.0,99,46,105,209,64,11,197,34,23,152,212,575,159,65,0,33,194,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,1.0,83,37,49,112,55,5,122,55,17,128,144,219,146,85,8,16,180,184
421,1.0,86,36,70,143,61,9,133,50,18,130,153,266,127,66,2,10,194,202
422,1.0,83,40,53,114,53,6,132,53,18,140,142,247,157,86,8,7,176,183
423,1.0,88,43,84,136,55,11,154,44,19,150,174,350,164,73,6,2,185,196


### One-vs-rest

In [8]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    Parameters
    ----------
    x : scalar, numpy array or pandas Series
        Value(s) of the linear predictor.

    Returns
    -------
    The element-wise logistic transform of ``x``. Array and Series
    inputs keep their shape (and index); results lie in [0, 1].
    """
    # logaddexp(0, -x) == log(1 + exp(-x)) but never materialises exp(-x),
    # so a very negative x no longer triggers an overflow RuntimeWarning.
    return np.exp(-np.logaddexp(0, -x))

In [9]:
def ovr(x_train,y_train):
    """Fit one binary logistic regression per class (one-vs-rest).

    Parameters
    ----------
    x_train : pandas.DataFrame
        Design matrix passed unchanged to ``sm.Logit``.
    y_train : pandas.Series
        Class label of every training row.

    Returns
    -------
    list
        Fitted ``sm.Logit`` results, one per distinct label, ordered by
        ascending label value (so labels 1..n map to positions 0..n-1).
    """
    models=[]

    # Take the labels from the data instead of assuming they are 1..n.
    for label in sorted(y_train.unique()):
        # 1 for rows of the current class, 0 for every other class ("rest").
        y_current=(y_train==label).astype(int)
        model_current = sm.Logit(y_current,x_train).fit() #fit binary logistic regression model
        models.append(model_current)

    return models #return models of each class as a list

In [10]:
# Fit the one-vs-rest models on the training data and keep the list of
# fitted results (one entry per class).
models_list = ovr(x_train=x_train, y_train=y_train)

Optimization terminated successfully.
         Current function value: 0.360581
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.327391
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.042271
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.027253
         Iterations 16


In [11]:
# Regression summary of the first fitted model (class 1 vs. rest).
first_model = models_list[0]
first_model.summary()

0,1,2,3
Dep. Variable:,18,No. Observations:,425.0
Model:,Logit,Df Residuals:,406.0
Method:,MLE,Df Model:,18.0
Date:,"Fri, 07 Oct 2022",Pseudo R-squ.:,0.355
Time:,22:11:44,Log-Likelihood:,-153.25
converged:,True,LL-Null:,-237.61
Covariance Type:,nonrobust,LLR p-value:,1.615e-26

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,63.3740,40.875,1.550,0.121,-16.740,143.488
0,-0.2447,0.046,-5.368,0.000,-0.334,-0.155
1,0.7207,0.188,3.827,0.000,0.352,1.090
2,-0.0163,0.035,-0.460,0.646,-0.086,0.053
3,0.1138,0.031,3.690,0.000,0.053,0.174
4,-0.3534,0.089,-3.969,0.000,-0.528,-0.179
5,0.0537,0.126,0.426,0.670,-0.193,0.301
6,0.3649,0.205,1.779,0.075,-0.037,0.767
7,0.3689,0.270,1.369,0.171,-0.159,0.897


### Probabilities and Predictions Using Test Data

In [12]:
# Test predictors: first 18 columns of the test frame, with the same
# intercept column that was added to the training predictors.
x_test = sm.add_constant(test.iloc[:, :18])
x_test

Unnamed: 0,const,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.0,100,45,100,209,65,8,201,32,23,147,231,611,189,72,5,5,189,195
1,1.0,100,51,109,231,70,11,220,30,25,163,238,722,206,73,11,19,189,198
2,1.0,102,43,96,197,63,10,185,36,22,142,202,513,139,65,8,12,195,204
3,1.0,80,34,42,110,57,3,114,59,17,119,131,191,121,87,4,7,179,183
4,1.0,104,58,103,230,69,11,219,30,25,176,231,716,246,71,7,4,187,196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,1.0,89,47,80,131,54,11,160,43,20,163,175,369,174,77,1,7,182,193
332,1.0,86,43,68,150,64,9,138,48,18,143,161,285,174,69,6,0,192,201
333,1.0,86,34,62,140,61,7,122,54,17,127,141,223,112,64,2,14,200,208
334,1.0,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207


In [13]:
# Actual class labels of the test rows (column at position 18).
test_label_position = 18
y_test = test.iloc[:, test_label_position]
y_test

0      1
1      1
2      1
3      1
4      1
      ..
331    4
332    4
333    4
334    4
335    4
Name: 18, Length: 336, dtype: int64

In [14]:
#probabilities
probs=[]
for i in range(4):
    probs.append(np.round(sigmoid(x_test.dot(models_list[i].params)),4))
probs

[0      0.0445
 1      0.2571
 2      0.4213
 3      0.0225
 4      0.6114
         ...  
 331    0.0312
 332    0.0148
 333    0.0154
 334    0.0525
 335    0.0111
 Length: 336, dtype: float64,
 0      0.8522
 1      0.3302
 2      0.7555
 3      0.0315
 4      0.5863
         ...  
 331    0.0371
 332    0.0391
 333    0.0151
 334    0.0087
 335    0.0101
 Length: 336, dtype: float64,
 0      0.0000
 1      0.0000
 2      0.0000
 3      0.0002
 4      0.0000
         ...  
 331    0.0000
 332    0.0180
 333    0.0073
 334    0.0000
 335    0.0067
 Length: 336, dtype: float64,
 0      0.0000
 1      0.0000
 2      0.0000
 3      0.9869
 4      0.0000
         ...  
 331    1.0000
 332    0.7833
 333    1.0000
 334    1.0000
 335    1.0000
 Length: 336, dtype: float64]

In [15]:
#predictions
predict=[]
for i in range(len(test)):
    predict.append(np.argmax(np.array((probs[0][i],probs[1][i],probs[2][i],probs[3][i])))+1)
predict

[2,
 2,
 2,
 4,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 3,
 1,
 2,
 2,
 2,
 1,
 2,
 3,
 2,
 1,
 2,
 2,
 2,
 3,
 1,
 2,
 2,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 4,
 1,
 2,
 3,
 3,
 1,
 2,
 3,
 2,
 3,
 2,
 4,
 3,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 1,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 2,
 2,
 2,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 3,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 3,
 4,
 2,
 2,
 1,
 4,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 4,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 1,
 4,
 4,
 4,
 4,
 4,


In [16]:
#standardize probabilities
sums = sum(probs)
for i in range(4):
    probs[i]=np.round(probs[i]/sums,2)
probs

[0      0.05
 1      0.44
 2      0.36
 3      0.02
 4      0.51
        ... 
 331    0.03
 332    0.02
 333    0.01
 334    0.05
 335    0.01
 Length: 336, dtype: float64,
 0      0.95
 1      0.56
 2      0.64
 3      0.03
 4      0.49
        ... 
 331    0.03
 332    0.05
 333    0.01
 334    0.01
 335    0.01
 Length: 336, dtype: float64,
 0      0.00
 1      0.00
 2      0.00
 3      0.00
 4      0.00
        ... 
 331    0.00
 332    0.02
 333    0.01
 334    0.00
 335    0.01
 Length: 336, dtype: float64,
 0      0.00
 1      0.00
 2      0.00
 3      0.95
 4      0.00
        ... 
 331    0.94
 332    0.92
 333    0.96
 334    0.94
 335    0.97
 Length: 336, dtype: float64]

In [17]:
# Write a preview of the test predictions (first three and last three
# rows) to HW5_output1.txt.
def _format_row(i):
    """Return the report line for test row ``i`` (0-based position)."""
    fields = [i + 1, y_test.iloc[i], *(p.iloc[i] for p in probs), predict[i]]
    return ", ".join(str(value) for value in fields) + "\n"

n_rows = len(test)
text = "ID, Actual class, Class 1, Class 2, Class 3, Class 4, Final prediction\n------------------------------------------------------\n"
text += "".join(_format_row(i) for i in range(3))
# Marker line (Korean): "only the first 3 and last 3 lines are printed".
text += "(skip: 처음 3 줄과 마지막 3 줄만 출력시킴)\n"
text += "".join(_format_row(i) for i in range(n_rows - 3, n_rows))

# Build the text first and open the file last, inside a context manager,
# so a formatting error cannot leave an open or truncated file behind.
# NOTE(review): no encoding is passed, so the non-ASCII marker line is
# written in the platform default encoding - confirm that is intended.
with open("HW5_output1.txt",'w') as f:
    f.write(text)

### Confusion Matrix and Accuracy

In [18]:
#Confusion matrix
# Confusion matrix: rows are the actual class, columns the predicted
# class; class k is stored at position k-1.
n_classes = 4
conf_arr_test = np.zeros((n_classes, n_classes), dtype=int)

for row in range(len(test)):
    actual, guessed = y_test[row], predict[row]
    # Only labels 1..4 are counted; anything else is ignored.
    if 1 <= actual <= n_classes and 1 <= guessed <= n_classes:
        conf_arr_test[actual - 1, guessed - 1] += 1

print(conf_arr_test)

[[49 34  1  2]
 [22 47 11  5]
 [ 0  0 85  1]
 [ 2  0  2 75]]


In [19]:
#Accuracy
# Accuracy: correctly classified rows (the diagonal of the confusion
# matrix) divided by the number of test rows, rounded to 3 decimals.
n_correct = np.trace(conf_arr_test)
acc_test = np.round(n_correct / len(test), 3)
acc_test

0.762

In [20]:
# Write the test confusion matrix and the overall accuracy to
# HW5_output2.txt.
text2 = "Confusion Matrix (Test)\n----------------------------\n"
text2 += "                  Predicted Class\n"
text2 += "                   1      2      3      4\n"
# Left margin of each matrix row: the first two rows carry the words
# "Actual" / "Class", the remaining rows are padded with blanks.
row_prefixes = {0: "Actual    ", 1: "Class     "}
for i in range(4):
    text2 += row_prefixes.get(i, "            ")
    # Class labels are 1-based, matching the column header above
    # (previously the rows were labelled 0..3).
    text2 += str(i+1)+"    "
    for j in range(4):
        text2 += str(conf_arr_test[i,j])+"      "
    text2 += "\n"
text2 += "\nModel Summary (Test)\n--------------------------\n"
text2 += "Overall accuracy = "+str(acc_test)+"\n\n\n"

# The context manager closes the file even if the write fails.
with open("HW5_output2.txt",'w') as f:
    f.write(text2)