In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix

## Read Data

In [2]:
def _load_features_and_response(path, sep, res_pos, has_header):
    """Read one delimited file and split it into (features, response).

    Parameters
    ----------
    path : str
        File to read with ``pd.read_csv``.
    sep : str
        Field separator passed to ``pd.read_csv``.
    res_pos : int
        Zero-based position of the response column.
    has_header : bool
        True when the first row holds column names.

    Returns
    -------
    (DataFrame, Series)
        The frame without the response column, and the response column.

    Raises
    ------
    IndexError
        If ``res_pos`` does not address a column of the file.
    """
    frame = pd.read_csv(path, sep=sep, header='infer' if has_header else None)
    n_cols = frame.shape[1]
    # Reject out-of-range positions explicitly; a negative value would
    # otherwise silently select a column counted from the end.
    if not 0 <= res_pos < n_cols:
        raise IndexError(
            f'Response column position {res_pos + 1} is outside 1..{n_cols} for {path!r}'
        )
    # Looking the label up by position works for named and unnamed columns alike.
    res_col = frame.columns[res_pos]
    return frame.drop(res_col, axis=1), frame[res_col]


def read_data():
    """Interactively load the train and test sets and split off the response.

    Prompts for the two file names, the separator style, the 1-based position
    of the response column, whether the files carry a header row, and finally
    the name of the output file.

    Returns
    -------
    tuple
        ``(tr_feature, tr_response, tst_feature, tst_response, out_name)``.

    Raises
    ------
    ValueError
        If a numeric prompt is answered with a non-integer.
    IndexError
        If the response position is outside the columns of a file.
    """
    trdata_name = input('Enter the name of train data file [(ex) veh.dat]: ')
    tstdata_name = input('Enter the name of test data file [(ex) vehtest.dat]: ')
    coding_fm = int(input("Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): "))
    # Format 1 is space separated; any other answer falls back to commas.
    separator_fm = ' ' if coding_fm == 1 else ','
    # The prompt is 1-based; convert to a zero-based position.
    res_pos = int(input('Enter the column position of the response variable : [from 1 to p]:')) - 1
    header = input('Does the data have column header? (y/n):')
    has_header = header.strip().lower() == 'y'

    # Both files go through the same loader so the two paths cannot drift apart
    # (the header branch previously referenced an undefined name for the train set).
    tr_feature, tr_response = _load_features_and_response(
        trdata_name, separator_fm, res_pos, has_header
    )
    tst_feature, tst_response = _load_features_and_response(
        tstdata_name, separator_fm, res_pos, has_header
    )

    out_name = input('Enter the output file name to export [(ex) result.txt]:')
    return tr_feature, tr_response, tst_feature, tst_response, out_name

In [3]:
# Prompt for the data files and options, then unpack the train/test features,
# the train/test responses, and the name of the report file to write.
tr_x, tr_y, tst_x, tst_y, out_name = read_data()

Enter the name of train data file [(ex) veh.dat]: veh.dat
Enter the name of test data file [(ex) vehtest.dat]: vehtest.dat
Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): 2
Enter the column position of the response variable : [from 1 to p]:19
Does the data have column header? (y/n):n
Enter the output file name to export [(ex) result.txt]:result.txt


## Multiclass Logistic Regression

In [4]:
class MultiLogisticReg:
    """Small helper around a statsmodels ``Logit`` fit.

    The object keeps two pieces of state: ``y_dum`` (set by
    ``one_hot_encoding``) and ``lr`` (the fitted result set by ``fit``).
    """

    def __init__(self):
        # Nothing to initialise; attributes are created by the other methods.
        pass

    def one_hot_encoding(self, y):
        """Store the dummy (indicator) encoding of ``y`` on ``self.y_dum``."""
        self.y_dum = pd.get_dummies(y)

    def fit(self, x, y):
        """Fit a Logit model of ``y`` on ``x`` plus an added constant column."""
        design = sm.add_constant(x)
        model = sm.Logit(y, design)
        self.lr = model.fit()

    def predict(self, x):
        """Return the fitted model's predictions for ``x`` plus a constant column."""
        return self.lr.predict(sm.add_constant(x))

    def accuracy(self, y, pred_y):
        """Return the fraction of entries where ``pred_y`` equals ``y.values``."""
        return np.mean(pred_y == y.values)

## Train

In [5]:
# Single model wrapper; the training loop refits it once per class.
mlr = MultiLogisticReg()

In [6]:
# One-vs-rest training: fit one binary Logit per class, collect each model's
# predictions on the train and test sets, then pick the highest-scoring class.
#
# NOTE(review): the recorded output of this cell reports
# "Current function value: inf" together with warnings from the sigmoid and
# log-likelihood, which suggests numerical overflow during fitting - possibly
# caused by unscaled features. Consider standardising the inputs; TODO confirm.

# The dummy encoding does not depend on the loop index, so build it once.
mlr.one_hot_encoding(tr_y)
# Column labels of the dummy frame are the actual class labels, in the same
# order as the per-class models fitted below.
class_labels = mlr.y_dum.columns.to_numpy()
class_num = len(class_labels)

tr_pred_lst = []
tst_pred_lst = []

for i in range(class_num):
    # Newer pandas returns boolean dummies; hand the model a numeric 0/1 response.
    y_ovr = mlr.y_dum.iloc[:, i].astype(float)
    mlr.fit(tr_x, y_ovr)

    tr_pred_lst.append(mlr.predict(tr_x))
    tst_pred_lst.append(mlr.predict(tst_x))

# Stack to (class_num, n_rows) arrays so the axis-0 reductions below are explicit.
tr_scores = np.asarray(tr_pred_lst)
tst_scores = np.asarray(tst_pred_lst)

# Map the winning row index back to its class label rather than assuming the
# labels are exactly 1..class_num (identical result when they are).
tr_y_pred = class_labels[np.argmax(tr_scores, axis=0)]
tr_y_pred_per = tr_scores / np.sum(tr_scores, axis=0)

tst_y_pred = class_labels[np.argmax(tst_scores, axis=0)]
tst_y_pred_per = tst_scores / np.sum(tst_scores, axis=0)

tr_acc = mlr.accuracy(tr_y, tr_y_pred)
tst_acc = mlr.accuracy(tst_y, tst_y_pred)

Optimization terminated successfully.
         Current function value: inf
         Iterations 8
Optimization terminated successfully.
         Current function value: inf
         Iterations 9
Optimization terminated successfully.
         Current function value: inf
         Iterations 13
Optimization terminated successfully.
         Current function value: inf
         Iterations 16


  x = pd.concat(x[::order], 1)
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


## Output

In [7]:
# Report header: column titles followed by a separator rule.
text = (
    'ID, Actual class, Class 1, Class 2, Class 3, Class 4, Final prediction\n'
    '----------------------------------------------------------------------\n'
)

In [8]:
# First rows of the report: ID, actual class, normalised per-class scores, prediction.
# np.round replaces np.round_, which was removed in NumPy 2.0. .iloc makes the
# lookup positional so it lines up with the positional prediction arrays, and
# the min() guard keeps the loop valid when the test set has fewer than 3 rows.
for i in range(min(3, len(tst_y))):
    text += f'{i+1}, {tst_y.iloc[i]}, {np.round(tst_y_pred_per[:, i], 1)}, {tst_y_pred[i]}\n'

In [9]:
# Marker line in the report; the message says that only the first 3 and
# last 3 rows are printed. No interpolation is needed, so a plain string is used.
text += '(skip: 처음 3줄과 마지막 3줄만 출력시킴)\n'

In [10]:
# Last rows of the report, in the same format as the leading rows.
# np.round replaces np.round_, which was removed in NumPy 2.0. .iloc keeps the
# lookup positional, and the start index is clamped at 0 so a test set with
# fewer than 3 rows does not produce negative positions.
n_test_rows = len(tst_y)
for i in range(max(n_test_rows - 3, 0), n_test_rows):
    text += f'{i+1}, {tst_y.iloc[i]}, {np.round(tst_y_pred_per[:, i], 1)}, {tst_y_pred[i]}\n'

In [11]:
# Append the test-set confusion matrix (actual labels first, predictions
# second) and the overall test accuracy to the report.
test_confusion = confusion_matrix(tst_y.values, tst_y_pred)
summary_section = f'''
Confusion Matrix (Test)
-----------------------
{test_confusion}

Model Summary (Test)
--------------------
Overall accuracy = {tst_acc:.3f}
'''
text += summary_section

In [12]:
# Write the report. The context manager closes the handle even if write()
# raises, and the explicit UTF-8 encoding keeps the non-ASCII text in the
# report writable regardless of the platform's default encoding.
with open(out_name, "w", encoding="utf-8") as file:
    file.write(text)