In [1]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix

In [2]:
def read_data():
    """Interactively load the train/test data sets and split off the response.

    Prompts (in this order) for the train file name, the test file name, the
    field separator (1 = space, anything else = comma), the 1-based column
    position of the response variable, whether the files have a header row,
    and finally the output file name. Both files are read with the same
    separator, header setting and response position.

    Returns
    -------
    tuple
        ``(tr_feature, tr_response, tst_feature, tst_response, out_name)``
        where the feature objects are DataFrames without the response column,
        the response objects are the response column as Series, and
        ``out_name`` is the output file name typed by the user.

    Raises
    ------
    ValueError
        If a numeric prompt is not an integer, or the response position is
        outside ``1..p`` for one of the files.
    """
    trdata_name = input('Enter the name of train data file [(ex) veh.dat]: ')
    tstdata_name = input('Enter the name of test data file [(ex) vehtest.dat]: ')
    coding_fm = int(input("Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): "))
    separator_fm = ' ' if coding_fm == 1 else ','
    res_pos = int(input('Enter the column position of the response variable : [from 1 to p]:')) - 1
    header = input('Does the data have column header? (y/n):')

    # Accept 'Y', ' y ', 'yes' ... so a stray space or capital letter does not
    # silently turn the header row into a data row.
    has_header = header.strip().lower() in ('y', 'yes')

    read_kwargs = {'sep': separator_fm}
    if not has_header:
        read_kwargs['header'] = None

    def _load_and_split(path):
        """Read one file and return (features, response)."""
        data = pd.read_csv(path, **read_kwargs)
        if not 0 <= res_pos < data.shape[1]:
            raise ValueError(
                f'response column position must be between 1 and {data.shape[1]} '
                f'for {path!r}, got {res_pos + 1}'
            )
        # With header=None the column labels are 0..p-1, so looking the label
        # up by position works for both the header and the no-header case.
        res_col = data.columns[res_pos]
        return data.drop(res_col, axis=1), data[res_col]

    tr_feature, tr_response = _load_and_split(trdata_name)
    tst_feature, tst_response = _load_and_split(tstdata_name)

    out_name = input('Enter the output file name to export [(ex) result.txt]:')
    return tr_feature, tr_response, tst_feature, tst_response, out_name

In [3]:
class LdaBagging():
    """Bagged ensemble of LDA classifiers with OOB permutation importance.

    ``fit`` trains ``B`` ``LinearDiscriminantAnalysis`` models on bootstrap
    resamples of the training rows, stores each model's predictions for the
    test set and returns one importance score per feature. ``predict``
    majority-votes the stored predictions and ``accuracy`` scores the vote.
    """

    def __init__(self):
        # Set by fit(): one array of test-set predictions per fitted model.
        self.tst_pred_lst = None
        # Set by predict(): majority-vote label for each test row.
        self.tst_pred = None

    @staticmethod
    def _resolve_rng(random_state):
        """Return an object exposing ``randint`` / ``permutation``.

        ``None`` keeps using numpy's global random state (so a prior
        ``np.random.seed`` call still applies); an int seeds a private
        ``RandomState``; an existing ``RandomState`` is used as is.
        """
        if random_state is None:
            return np.random
        if isinstance(random_state, np.random.RandomState):
            return random_state
        return np.random.RandomState(random_state)

    @staticmethod
    def _importance(diff_lst, n_cols):
        """Turn per-replicate error increases into mean / sd scores."""
        if not diff_lst:
            return np.zeros(n_cols)
        diffs = np.asarray(diff_lst, dtype=float)
        d = diffs.mean(axis=0)
        if len(diffs) > 1:
            sd = diffs.std(axis=0, ddof=1)
        else:
            sd = np.zeros_like(d)
        # A feature whose permutation never changes the error has zero spread;
        # report 0 for it instead of dividing by zero (nan/inf).
        return np.divide(d, sd, out=np.zeros_like(d), where=sd > 0)

    def fit(self, x, y, tst_x, B = 101, random_state = None):
        """Fit ``B`` bootstrap LDA models and compute feature importance.

        Parameters
        ----------
        x : pandas.DataFrame
            Training features (any index; rows are addressed by position).
        y : array-like
            Training labels, aligned with the rows of ``x`` by position.
        tst_x : pandas.DataFrame
            Test features; every fitted model's predictions for these rows are
            stored in ``self.tst_pred_lst`` for later voting.
        B : int
            Number of bootstrap replicates.
        random_state : None, int or numpy.random.RandomState
            Source of randomness; ``None`` uses numpy's global state.

        Returns
        -------
        numpy.ndarray
            One score per column of ``x``: the mean over replicates of
            (OOB error with that column permuted - OOB error), divided by the
            sample standard deviation of that difference (0 where the
            standard deviation is 0).
        """
        rng = self._resolve_rng(random_state)
        n_rows, n_cols = x.shape
        y_values = np.asarray(y)

        diff_lst = []
        tst_pred_lst = []

        for _ in range(B):
            # Draw row POSITIONS so the resample is correct for any index
            # (the labels of x need not be 0..n-1 or even unique).
            row_pos = rng.randint(0, n_rows, size=n_rows)
            in_bag = np.zeros(n_rows, dtype=bool)
            in_bag[row_pos] = True

            oob_x = x.iloc[~in_bag]
            oob_y = y_values[~in_bag]

            lda = LinearDiscriminantAnalysis()
            lda.fit(x.iloc[row_pos], y_values[row_pos])

            tst_pred_lst.append(lda.predict(tst_x))

            if len(oob_y) == 0:
                # Every row was drawn: no OOB sample to score this replicate.
                continue

            oob_error = 1 - np.mean(lda.predict(oob_x) == oob_y)

            error_lst = []
            for i in range(n_cols):
                oob_x_perm = oob_x.copy()
                oob_x_perm.iloc[:, i] = rng.permutation(oob_x.iloc[:, i].to_numpy())
                per_error = 1 - np.mean(lda.predict(oob_x_perm) == oob_y)
                error_lst.append(per_error - oob_error)

            diff_lst.append(error_lst)

        self.tst_pred_lst = tst_pred_lst

        return self._importance(diff_lst, n_cols)

    def predict(self, x, B = 101):
        """Majority-vote the test predictions stored by ``fit``.

        ``x`` is only used for its length: the votes come from the
        predictions ``fit`` stored for its ``tst_x`` argument. At most ``B``
        models are used, capped at the number actually fitted, so the default
        works whatever ``B`` was passed to ``fit``. Ties go to the label that
        appears first among the models' votes.

        Returns the list of voted labels (also stored in ``self.tst_pred``).
        """
        stored = getattr(self, 'tst_pred_lst', None)
        if not stored:
            raise RuntimeError('fit() must be called before predict()')

        n_models = min(B, len(stored))
        tst_pred = []

        for j in range(len(x)):
            votes = [stored[i][j] for i in range(n_models)]
            tst_pred.append(max(votes, key=votes.count))

        self.tst_pred = tst_pred
        return self.tst_pred

    def accuracy(self, y):
        """Return the fraction of voted labels from ``predict`` equal to ``y``.

        The comparison is positional, so the index of ``y`` is irrelevant.
        """
        if getattr(self, 'tst_pred', None) is None:
            raise RuntimeError('predict() must be called before accuracy()')
        tst_acc = np.mean(np.asarray(self.tst_pred) == np.asarray(y))
        return tst_acc

In [4]:
class RandomFeatureEnsemble:
    """Bagged LDA ensemble where each model sees a random subset of features.

    Each of the ``B`` models is fitted on a bootstrap resample of the rows
    restricted to ``m`` randomly chosen columns. ``fit`` stores every model's
    test-set predictions and returns one importance score per original
    feature; ``predict`` majority-votes and ``accuracy`` scores the vote.
    """

    def __init__(self):
        # Number of columns drawn per model; set by get_col_num() or by fit().
        self.m = None
        # Set by fit(): one array of test-set predictions per fitted model.
        self.tst_pred_lst = None
        # Set by predict(): majority-vote label for each test row.
        self.tst_pred = None

    @staticmethod
    def _resolve_rng(random_state):
        """Return an object exposing ``randint`` / ``choice`` / ``permutation``.

        ``None`` keeps using numpy's global random state (so a prior
        ``np.random.seed`` call still applies); an int seeds a private
        ``RandomState``; an existing ``RandomState`` is used as is.
        """
        if random_state is None:
            return np.random
        if isinstance(random_state, np.random.RandomState):
            return random_state
        return np.random.RandomState(random_state)

    @staticmethod
    def _importance(diff_lst, n_cols):
        """Turn per-replicate error increases into mean / sd scores."""
        if not diff_lst:
            return np.zeros(n_cols)
        diffs = np.asarray(diff_lst, dtype=float)
        d = diffs.mean(axis=0)
        if len(diffs) > 1:
            sd = diffs.std(axis=0, ddof=1)
        else:
            sd = np.zeros_like(d)
        # A feature whose permutation never changes the error has zero spread;
        # report 0 for it instead of dividing by zero (nan/inf).
        return np.divide(d, sd, out=np.zeros_like(d), where=sd > 0)

    def get_col_num(self, x):
        """Set ``self.m`` to half the number of columns of ``x`` (rounded down).

        At least one column is always kept so a single-feature frame does not
        produce an empty feature subset.
        """
        p = len(x.columns)
        self.m = max(1, p // 2)

    def fit(self, x, y, tst_x, B = 101, random_state = None):
        """Fit ``B`` random-feature bootstrap LDA models and score features.

        Parameters
        ----------
        x : pandas.DataFrame
            Training features (any index and any column labels).
        y : array-like
            Training labels, aligned with the rows of ``x`` by position.
        tst_x : pandas.DataFrame
            Test features with the same column labels as ``x``; each model's
            predictions for these rows are stored in ``self.tst_pred_lst``.
        B : int
            Number of models.
        random_state : None, int or numpy.random.RandomState
            Source of randomness; ``None`` uses numpy's global state.

        Returns
        -------
        numpy.ndarray
            One score per column of ``x`` (in column order): the mean over
            replicates of (OOB error with that column permuted - OOB error),
            divided by its sample standard deviation. A replicate that did not
            draw a column contributes 0 for it; the score is 0 where the
            standard deviation is 0.
        """
        rng = self._resolve_rng(random_state)
        n_rows, n_cols = x.shape
        if getattr(self, 'm', None) is None:
            # Allow fit() without a prior get_col_num() call.
            self.get_col_num(x)
        m = int(self.m)
        y_values = np.asarray(y)

        diff_lst = []
        tst_pred_lst = []

        for _ in range(B):
            # Column POSITIONS, so the scores can be written back into a plain
            # array regardless of what the column labels are.
            col_pos = rng.choice(n_cols, size=m, replace=False)
            rf_tr_x = x.iloc[:, col_pos]
            rf_tst_x = tst_x[rf_tr_x.columns]

            # True bootstrap by row position: duplicates are kept and rows are
            # identified by position, never by their feature values.
            row_pos = rng.randint(0, n_rows, size=n_rows)
            in_bag = np.zeros(n_rows, dtype=bool)
            in_bag[row_pos] = True

            oob_x = rf_tr_x.iloc[~in_bag]
            oob_y = y_values[~in_bag]

            rf_lda = LinearDiscriminantAnalysis()
            rf_lda.fit(rf_tr_x.iloc[row_pos], y_values[row_pos])

            tst_pred_lst.append(rf_lda.predict(rf_tst_x))

            if len(oob_y) == 0:
                # Every row was drawn: no OOB sample to score this replicate.
                continue

            oob_error = 1 - np.mean(rf_lda.predict(oob_x) == oob_y)

            diff = np.zeros(n_cols)
            for k in range(m):
                oob_x_perm = oob_x.copy()
                oob_x_perm.iloc[:, k] = rng.permutation(oob_x.iloc[:, k].to_numpy())
                per_error = 1 - np.mean(rf_lda.predict(oob_x_perm) == oob_y)
                diff[col_pos[k]] = per_error - oob_error

            diff_lst.append(diff)

        self.tst_pred_lst = tst_pred_lst

        return self._importance(diff_lst, n_cols)

    def predict(self, x, B = 101):
        """Majority-vote the test predictions stored by ``fit``.

        ``x`` is only used for its length: the votes come from the
        predictions ``fit`` stored for its ``tst_x`` argument. At most ``B``
        models are used, capped at the number actually fitted, so the default
        works whatever ``B`` was passed to ``fit``. Ties go to the label that
        appears first among the models' votes.

        Returns the list of voted labels (also stored in ``self.tst_pred``).
        """
        stored = getattr(self, 'tst_pred_lst', None)
        if not stored:
            raise RuntimeError('fit() must be called before predict()')

        n_models = min(B, len(stored))
        tst_pred = []

        for j in range(len(x)):
            votes = [stored[i][j] for i in range(n_models)]
            tst_pred.append(max(votes, key=votes.count))

        self.tst_pred = tst_pred
        return self.tst_pred

    def accuracy(self, y):
        """Return the fraction of voted labels from ``predict`` equal to ``y``.

        The comparison is positional, so the index of ``y`` is irrelevant.
        """
        if getattr(self, 'tst_pred', None) is None:
            raise RuntimeError('predict() must be called before accuracy()')
        tst_acc = np.mean(np.asarray(self.tst_pred) == np.asarray(y))
        return tst_acc

In [5]:
# Prompt for the file names and options, then load the train/test features,
# the train/test responses and the name of the report file to write.
tr_x, tr_y, tst_x, tst_y, out_name = read_data()

Enter the name of train data file [(ex) veh.dat]: veh.dat
Enter the name of test data file [(ex) vehtest.dat]: vehtest.dat
Select the data coding format(1 = 'a b c' or 2 = 'a,b,c'): 2
Enter the column position of the response variable : [from 1 to p]:19
Does the data have column header? (y/n):n
Enter the output file name to export [(ex) result.txt]:result.txt


In [6]:
# Bagged LDA ensemble.
bag = LdaBagging()

In [7]:
# Fit the ensemble (default B) and keep the per-feature importance scores;
# fit() also stores every model's predictions for tst_x inside `bag`.
bag_var_imp = bag.fit(tr_x, tr_y, tst_x)

In [8]:
# Majority vote over the test-set predictions stored by fit().
bag_pred = bag.predict(tst_x)

In [9]:
# Share of voted labels that match the test responses.
bag_acc = bag.accuracy(tst_y)

In [10]:
# LDA ensemble in which each model is fitted on a random subset of columns.
rfe = RandomFeatureEnsemble()

In [11]:
# Set rfe.m, the number of columns drawn per model, from the training frame.
rfe.get_col_num(tr_x)

In [12]:
# Fit the ensemble (default B) and keep the per-feature importance scores;
# fit() also stores every model's predictions for tst_x inside `rfe`.
rfe_var_imp = rfe.fit(tr_x, tr_y, tst_x)

In [13]:
# Majority vote over the test-set predictions stored by fit().
rfe_pred = rfe.predict(tst_x)

In [14]:
# Share of voted labels that match the test responses.
rfe_acc = rfe.accuracy(tst_y)

In [15]:
# Start the report: heading for the bagging section. The cells below append to
# `text` with +=, so re-run from this cell to rebuild the report from scratch.
text = f'''LDA - bagging

variable Importance:
'''

In [16]:
# Append one "X<k>: <score>" line per training column (k is 1-based),
# using the bagging importance scores.
text += ''.join(
    f'X{i+1}: {bag_var_imp[i]:.3f}\n' for i in range(len(tr_x.columns))
)

In [17]:
# Append the confusion matrix (true test labels vs. voted labels) and the
# overall test accuracy of the bagging ensemble.
text += f'''
Confusion Matrix (LDA - bagging)
--------------------------------
{confusion_matrix(tst_y, bag_pred)}

Model Summary (LDA - bagging)
-----------------------------
overall accuracy = {bag_acc:.3f}
'''

In [18]:
# Append the heading for the random-feature-ensemble section of the report.
text += f'''
LDA - random feature ensemble

variable Importance:
'''

In [19]:
# Append one "X<k>: <score>" line per training column (k is 1-based),
# using the random-feature-ensemble importance scores.
text += ''.join(
    f'X{i+1}: {rfe_var_imp[i]:.3f}\n' for i in range(len(tr_x.columns))
)

In [20]:
# Append the confusion matrix (true test labels vs. voted labels) and the
# overall test accuracy of the random-feature ensemble.
text += f'''
Confusion Matrix (LDA - random feature)
---------------------------------------
{confusion_matrix(tst_y, rfe_pred)}

Model Summary (LDA - random feature)
-----------------------------
overall accuracy = {rfe_acc:.3f}
'''

In [21]:
# Write the assembled report to the file name chosen at the read_data() prompt.
# The with-block closes the handle even if write() raises.
with open(out_name, "w") as file:
    file.write(text)