In [1]:
import pandas as pd
import numpy as np

## 1. Gradient descent

In [2]:
def read_data():
    """Interactively load a delimited data set and split off the response column.

    Prompts, in order, for the data file name, the delimiter style (1 means
    space-separated, anything else means comma-separated), the 1-based position
    of the response column, whether the file has a header row, and the name of
    the output file.

    Returns:
        tuple: ``(features, response, out_name)`` where ``features`` is the
        DataFrame without the response column, ``response`` is that column as
        a Series, and ``out_name`` is the output file name exactly as typed.
    """
    path = input('Enter the name of data file [(ex) harris.dat]: ')
    fmt_choice = int(input("Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): "))
    delimiter = ' ' if fmt_choice == 1 else ','
    # The user counts columns from 1; pandas positions start at 0.
    target_pos = int(input('Enter the column position of the response variable : [from 1 to p]:')) - 1
    has_header = input('Does the data have column header? (y/n):') == 'y'

    if has_header:
        frame = pd.read_csv(path, sep=delimiter)
        # With a header the column is addressed by its name.
        target_key = frame.columns[target_pos]
    else:
        frame = pd.read_csv(path, sep=delimiter, header=None)
        # Without a header pandas labels the columns 0..p-1.
        target_key = target_pos

    response = frame[target_key]
    features = frame.drop(target_key, axis=1)

    out_name = input('Enter the output file name to export [(ex) result.txt]:')
    return features, response, out_name

In [3]:
# Interactive step: prompts for the regression data file, its format and the
# output file name, then binds the feature frame, response series and out path.
tr_x, tr_y, out_name = read_data()

Enter the name of data file [(ex) harris.dat]: harris.dat
Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): 1
Enter the column position of the response variable : [from 1 to p]:1
Does the data have column header? (y/n):n
Enter the output file name to export [(ex) result.txt]:reg_result.txt


In [4]:
# Prepend a column of ones to the features so the first coefficient of the
# linear model plays the role of the intercept.
ones = np.ones((len(tr_x), 1))
tr_xc = np.hstack((ones, tr_x))

In [5]:
def forward(x, beta):
    """Return the linear-model prediction, i.e. the product of ``x`` and ``beta``.

    Args:
        x: design matrix, one row per observation.
        beta: coefficient column vector with one row per column of ``x``.

    Returns:
        The result of ``np.dot(x, beta)``.
    """
    return np.dot(x, beta)

In [6]:
def gradient(x, y, y_hat):
    """Return the mean of ``(y_hat - y) * x`` over observations as a column vector.

    Args:
        x: design matrix, one row per observation.
        y: observed responses; must expose ``.values`` (e.g. a pandas Series).
        y_hat: predictions as an ``(n, 1)`` column, as produced by ``forward``.

    Returns:
        Array of shape ``(x.shape[1], 1)``: the residual-weighted column means.
    """
    # Reshape y into a column so the subtraction lines up with y_hat row by row.
    residual = y_hat - y.values.reshape(-1, 1)
    weighted = residual * x
    column_means = np.mean(weighted, axis=0)
    return column_means.reshape(-1, 1)

In [7]:
# Hyper-parameters of the batch gradient descent, named so they can be tuned in one place.
SEED = 0                   # fixes the random start so Restart & Run All is reproducible
LEARNING_RATE = 1e-4       # step size applied to every gradient
N_ITERATIONS = 10_000_000  # fixed number of full-batch updates (no early stopping)

# One starting coefficient per column of the design matrix, drawn uniformly from [0, 1).
beta = np.random.default_rng(SEED).random((tr_xc.shape[1], 1))

for _ in range(N_ITERATIONS):
    y_hat = forward(tr_xc, beta)
    grad = gradient(tr_xc, tr_y, y_hat)
    # In-place update: step against the gradient.
    beta -= LEARNING_RATE * grad

In [8]:
# Fitted coefficients as a column vector; row 0 belongs to the prepended ones column.
beta

array([[3.52642211e+03],
       [9.00203109e+01],
       [1.26899001e+00],
       [2.34062358e+01],
       [7.22460671e+02]])

### Scikit-learn

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Reference model from scikit-learn; the intercept is estimated by the model itself.
lr = LinearRegression(fit_intercept=True)

In [11]:
# Fit on the raw features (tr_x, not tr_xc): no ones column is needed because
# fit_intercept=True was requested above.
lr.fit(tr_x, tr_y)

LinearRegression()

In [12]:
# Intercept from scikit-learn; the counterpart of beta[0] from gradient descent.
lr.intercept_

3526.4221106890072

In [13]:
# Slope coefficients from scikit-learn; the counterparts of beta[1:] from gradient descent.
lr.coef_

array([ 90.02031094,   1.26899001,  23.40623577, 722.46067138])

### output

In [14]:
# Start the report with the gradient-descent constant. beta is a column vector,
# so the scalar is indexed directly: calling float() on a one-element array is
# deprecated since NumPy 1.25.
text = f'''Coefficients by Gradient Descent Method
---------------------------------------
Constant: {beta[0, 0]:.3f}
'''

In [15]:
# Append one "BetaK" line per remaining row of beta, numbering from 1
# (row 0 was already reported as the constant).
text += ''.join(
    f'Beta{k}: {row[0]:.3f}\n' for k, row in enumerate(beta[1:], start=1)
)

In [16]:
# Second report section. The reference values come from scikit-learn's
# LinearRegression (`lr`), so the heading names that library.
text += f'''
Coefficients by scikit-learn
----------------------------
Constant: {lr.intercept_:.3f}
'''

In [17]:
# Append one "BetaK" line per scikit-learn slope coefficient, numbering from 1.
text += ''.join(
    f'Beta{k}: {coef:.3f}\n' for k, coef in enumerate(lr.coef_, start=1)
)

In [18]:
# Write the report to the user-chosen path; the context manager closes the
# handle even if the write raises.
with open(out_name, "w") as file:
    file.write(text)

## 2. LDA

In [19]:
def _split_response(path, sep, res_pos, has_header):
    """Read one delimited file and return ``(features, response)``."""
    if has_header:
        frame = pd.read_csv(path, sep=sep)
        # With a header the response is addressed by this file's own column name.
        res_col = frame.columns[res_pos]
    else:
        frame = pd.read_csv(path, sep=sep, header=None)
        # Without a header pandas labels the columns 0..p-1.
        res_col = res_pos
    return frame.drop(res_col, axis=1), frame[res_col]


def read_data():
    """Interactively load a train/test pair and split off the response column.

    Prompts, in order, for the training file, the test file, the delimiter
    style (1 means space-separated, anything else means comma-separated), the
    1-based position of the response column, whether the files have a header
    row, and the output file name. The same delimiter, position and header
    setting are applied to both files.

    Returns:
        tuple: ``(tr_feature, tr_response, tst_feature, tst_response, out_name)``.
    """
    trdata_name = input('Enter the name of train data file [(ex) veh.dat]: ')
    tstdata_name = input('Enter the name of test data file [(ex) vehtest.dat]: ')
    coding_fm = int(input("Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): "))
    separator_fm = ' ' if coding_fm == 1 else ','
    # The user counts columns from 1; pandas positions start at 0.
    res_pos = int(input('Enter the column position of the response variable : [from 1 to p]:')) - 1
    has_header = input('Does the data have column header? (y/n):') == 'y'

    # Both files go through the same helper, so the header branch can no longer
    # refer to a column name that was never defined.
    tr_feature, tr_response = _split_response(trdata_name, separator_fm, res_pos, has_header)
    tst_feature, tst_response = _split_response(tstdata_name, separator_fm, res_pos, has_header)

    out_name = input('Enter the output file name to export [(ex) result.txt]:')
    return tr_feature, tr_response, tst_feature, tst_response, out_name

In [20]:
# Interactive step for the LDA section. Note: this rebinds tr_x, tr_y and
# out_name, which section 1 used for the regression data and its output file.
tr_x, tr_y, tst_x, tst_y, out_name = read_data()

Enter the name of train data file [(ex) veh.dat]: veh.dat
Enter the name of test data file [(ex) vehtest.dat]: vehtest.dat
Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): 2
Enter the column position of the response variable : [from 1 to p]:19
Does the data have column header? (y/n):n
Enter the output file name to export [(ex) result.txt]:lda_result.txt


In [21]:
class LDA:
    """Linear discriminant analysis with a pooled within-class covariance.

    Typical use is ``fit`` then ``predict`` then ``accuracy``. ``accuracy``
    scores the labels stored by the most recent ``predict`` call.
    """

    def __init__(self):
        return

    def fit(self, x, y):
        """Estimate class means, priors and the pooled covariance.

        Args:
            x: DataFrame of features, one row per observation.
            y: class labels aligned with the rows of ``x``. Any sortable label
                set is accepted; labels need not be ``1..K``.
        """
        labels = np.asarray(y).ravel()
        # Sorted distinct labels; every per-class list below follows this order.
        self.classes = np.unique(labels)
        groups = [x[labels == label] for label in self.classes]
        sizes = [len(group) for group in groups]

        # Pooled covariance: per-class covariances weighted by their degrees of
        # freedom (n_k - 1) and divided by the total degrees of freedom (n - K).
        weighted = sum((size - 1) * group.cov() for size, group in zip(sizes, groups))
        self.pooled_cov = weighted / (sum(sizes) - len(groups))
        self.class_mean = [group.mean() for group in groups]
        self.class_p = np.array(sizes) / sum(sizes)

    def predict(self, x, y=None):
        """Assign each row of ``x`` to the class with the highest discriminant score.

        Args:
            x: features with the same columns, in the same order, as in ``fit``.
            y: unused; accepted only so existing ``predict(x, y)`` calls keep working.

        Returns:
            Array of predicted labels (also stored as ``self.c_pred``).
        """
        features = np.asarray(x, dtype=float)
        # Invert the pooled covariance once instead of once per class.
        precision = np.linalg.inv(np.asarray(self.pooled_cov, dtype=float))

        scores = []
        for mean, prior in zip(self.class_mean, self.class_p):
            centered = features - np.asarray(mean, dtype=float)
            # Row-wise squared Mahalanobis distance, without building the n x n
            # matrix whose diagonal holds the same numbers.
            distance = np.sum((centered @ precision) * centered, axis=1)
            # Compare in the log domain: exp(-distance / 2) underflows to 0 for
            # far-away points, which would make every class tie.
            scores.append(-0.5 * distance + np.log(prior))

        self.c_pred = self.classes[np.argmax(np.array(scores), axis=0)]
        return self.c_pred

    def accuracy(self, x, y):
        """Return the share of labels in ``y`` matched by the last ``predict`` result.

        Args:
            x: the features that were passed to ``predict``; only ``len(x)`` is used.
            y: true labels; must expose ``.values`` (e.g. a pandas Series).
        """
        self.acc = np.sum(y.values.flatten() == self.c_pred) / len(x)
        return self.acc

### 회귀 분석 평가 지표

In [22]:
def R_squre(y, y_hat):
    """Return the coefficient of determination of ``y_hat`` against ``y``.

    Computed as one minus the ratio of the residual sum of squares to the
    total sum of squares around the mean of ``y``.
    """
    deviations = y - y.mean()
    residuals = y - y_hat
    total_ss = np.sum(deviations ** 2)
    residual_ss = np.sum(residuals ** 2)
    return 1 - residual_ss / total_ss

### For train

In [23]:
def train_MAE(y, y_hat):
    """Return the sum of absolute errors divided by ``len(y) - 1``."""
    abs_errors = np.abs(y - y_hat)
    denominator = len(y) - 1
    return np.sum(abs_errors) / denominator

In [24]:
def train_MAPE(y, y_hat):
    """Return the sum of absolute relative errors divided by ``len(y) - 1``.

    Errors are relative to ``y`` and expressed as a fraction, not a percentage.
    """
    relative_errors = np.abs((y - y_hat) / y)
    denominator = len(y) - 1
    return np.sum(relative_errors) / denominator

In [25]:
def train_MSE(y, y_hat):
    """Return the sum of squared errors divided by ``len(y) - 1``."""
    squared_errors = (y - y_hat) ** 2
    denominator = len(y) - 1
    return np.sum(squared_errors) / denominator

In [26]:
def train_RMSE(y, y_hat):
    """Return the square root of the squared-error sum divided by ``len(y) - 1``."""
    squared_errors = (y - y_hat) ** 2
    mean_square = np.sum(squared_errors) / (len(y) - 1)
    return np.sqrt(mean_square)

### For test

In [27]:
def test_MAE(y, y_hat):
    return np.sum(np.abs(y - y_hat)) / len(y)

In [28]:
def test_MAPE(y, y_hat):
    return np.sum(np.abs((y - y_hat) / y)) / len(y)

In [29]:
def test_MSE(y, y_hat):
    return np.sum((y - y_hat) ** 2) / len(y)

In [30]:
def test_RMSE(y, y_hat):
    return np.sqrt(np.sum((y - y_hat) ** 2) / len(y))

### confusion matrix

In [31]:
def conf_matrix(y, c_pred):
    """Build a confusion matrix with true classes as rows and predictions as columns.

    The row and column labels are the sorted distinct labels found in ``y`` or
    ``c_pred``, so the table works for any number of classes and still has a
    row/column for a class that occurs on only one side.

    Args:
        y: true class labels.
        c_pred: predicted class labels, aligned with ``y`` by position.

    Returns:
        pandas.DataFrame: entry ``[t, p]`` counts the observations whose true
        label is ``t`` and whose predicted label is ``p``.
    """
    actual = np.asarray(y).ravel()
    predicted = np.asarray(c_pred).ravel()
    labels = np.unique(np.concatenate([actual, predicted])).tolist()
    counts = [
        [int(np.sum((actual == true_label) & (predicted == pred_label)))
         for pred_label in labels]
        for true_label in labels
    ]
    return pd.DataFrame(counts, index=labels, columns=labels)

In [32]:
def _classification_report(train_x, train_y, test_x, test_y):
    """Fit LDA on the training split and format confusion matrices and accuracies."""
    model = LDA()
    # One fit is enough: both evaluations use the same training data.
    model.fit(train_x, train_y)

    # accuracy() scores the labels stored by the latest predict() call, so each
    # predict/accuracy pair has to run back to back.
    tr_c_pred = model.predict(train_x, train_y)
    tr_acc = model.accuracy(train_x, train_y)
    tst_c_pred = model.predict(test_x, test_y)
    tst_acc = model.accuracy(test_x, test_y)

    report = [
        'Confusion Matrix (Training)',
        '--------------------------',
        f'{conf_matrix(train_y, tr_c_pred)}',
        '        ',  # whitespace-only line kept so the report layout is unchanged
        'Model Summary (Training)',
        '------------------------',
        f'overall accuracy = {tr_acc:.3f}',
        '',
        'Confusion Matrix (Test)',
        '----------------------',
        f'{conf_matrix(test_y, tst_c_pred)}',
        '',
        'Model Summary (Test)',
        '--------------------',
        f'overall accuracy = {tst_acc:.3f}',
        '',
    ]
    return '\n'.join(report)


def _regression_report(train_x, train_y, test_x, test_y):
    """Fit least squares with an intercept and format coefficients and metrics."""
    # Prepend a ones column so beta_hat[0] really is the constant term that the
    # report labels it as.
    train_design = np.concatenate([np.ones((len(train_x), 1)), train_x], axis=1)
    test_design = np.concatenate([np.ones((len(test_x), 1)), test_x], axis=1)

    # Solve the least-squares problem directly instead of inverting X'X.
    beta_hat = np.linalg.lstsq(train_design, np.asarray(train_y, dtype=float), rcond=None)[0]

    tr_y_hat = np.dot(train_design, beta_hat)
    tst_y_hat = np.dot(test_design, beta_hat)

    report = ['coefficients', '------------', f'constant: {beta_hat[0]:.3f}']
    report += [f'Beta{k}: {coef:.3f}' for k, coef in enumerate(beta_hat[1:], start=1)]
    report += [
        '',
        'Model Summary',
        '-------------',
        f'R-Square = {R_squre(train_y, tr_y_hat):.4f}',
        f'MSE = {train_MSE(train_y, tr_y_hat):.4f}',
        '',
        'Prediction Performance',
        '----------------------',
        f'Predictive R-Square = {R_squre(test_y, tst_y_hat):.4f}',
        f'MAE = {test_MAE(test_y, tst_y_hat):.4f}',
        f'MAPE = {test_MAPE(test_y, tst_y_hat):.4f}',
        f'RMSE = {test_RMSE(test_y, tst_y_hat):.4f}',
        '',
    ]
    return '\n'.join(report)


def analysis():
    """Prompt for the analysis type and return the matching text report.

    Reads the module-level ``tr_x``, ``tr_y``, ``tst_x`` and ``tst_y``.
    Entering ``1`` runs the LDA classification report; any other integer runs
    the linear-regression report.

    Returns:
        str: the formatted report, ready to be written to a file.

    Raises:
        ValueError: if the typed answer is not an integer.
    """
    ana_method = int(input('Enter the anaysis method [(ex) 1 = classification or 2 = regression]: '))

    if ana_method == 1:
        return _classification_report(tr_x, tr_y, tst_x, tst_y)
    return _regression_report(tr_x, tr_y, tst_x, tst_y)

In [33]:
# Interactive step: asks for the analysis type and rebinds `text`, which
# section 1 used for the regression report, to the new report.
text = analysis()

Enter the anaysis method [(ex) 1 = classification or 2 = regression]: 1


In [34]:
# Write the report to the user-chosen path; the context manager closes the
# handle even if the write raises.
with open(out_name, "w") as file:
    file.write(text)