In [1]:
import numpy as np
import pandas as pd

## Problem #1

### Read Data

In [2]:
# Load the Harris data set: single-space separated values with no header row,
# so pandas numbers the columns 0, 1, 2, ...
harris = pd.read_csv("harris.dat", sep = " ", header=None)

In [3]:
harris

Unnamed: 0,0,1,2,3,4
0,3900,12,0.0,1,0
1,4020,10,44.0,7,0
2,4290,12,5.0,30,0
3,4380,8,6.2,7,0
4,4380,8,7.5,6,0
...,...,...,...,...,...
88,6600,15,215.5,16,1
89,6840,15,41.5,7,1
90,6900,12,175.0,10,1
91,6900,15,132.0,24,1


In [4]:
# Column 0 is the response; every other column is a predictor.
y = harris[0]
x_col = harris.columns.delete(0)
# Transposing, resetting the index and transposing back renumbers the
# predictor columns from 0 so they line up with the weight vector's index.
x = harris[x_col].T.reset_index(drop=True).T

In [5]:
n = len(y)
n

93

### Gradient Descent Algorithm

In [6]:
# Random starting point for gradient descent, drawn uniformly from [-1, 1).
# The global NumPy RNG is seeded so the start is reproducible; the draw order
# (four slope weights first, then the intercept) is part of that.
np.random.seed(0)
w = pd.Series(np.random.uniform(-1, 1, 4))  # weights, one per predictor
b = np.random.uniform(-1, 1)  # constant (intercept)
w

0    0.097627
1    0.430379
2    0.205527
3    0.089766
dtype: float64

In [7]:
# Batch gradient descent for the linear model y_hat = x.dot(w) + b.
# NOTE(review): the recorded output stops at iteration 630000 with the error
# still decreasing, so the weights reported afterwards are probably not
# converged - confirm before comparing them with the OLS fit.
alpha = 0.00004  # learning rate
for i in range(10000000):
    y_hat = x.dot(w) + b
    resid = y_hat - y
    error = np.abs(resid).mean()  # mean absolute error; only used for the progress log

    # All updates use the residuals computed above, i.e. before any parameter
    # of this step has been changed.
    for j in range(4):
        w[j] = w[j] - alpha*((resid*x[j]).mean())
    b = b - alpha*(resid.mean())

    if i%10000 == 0:
        print(i, np.round(error,4))

0 5372.304
10000 600.8853
20000 594.287
30000 588.0187
40000 582.0563
50000 576.3774
60000 571.1226
70000 566.4574
80000 562.0319
90000 557.8043
100000 553.7865
110000 549.8937
120000 546.2068
130000 543.0014
140000 540.0374
150000 537.1371
160000 534.2971
170000 531.5145
180000 528.7862
190000 526.1407
200000 523.6453
210000 521.2043
220000 518.8842
230000 516.7091
240000 514.6884
250000 512.6823
260000 510.691
270000 508.715
280000 506.7543
290000 504.8093
300000 502.88
310000 501.053
320000 499.3603
330000 497.7065
340000 496.0703
350000 494.4624
360000 492.868
370000 491.2873
380000 489.7203
390000 488.1691
400000 486.6916
410000 485.2285
420000 483.862
430000 482.5146
440000 481.1796
450000 479.8571
460000 478.5469
470000 477.2491
480000 475.9636
490000 474.6904
500000 473.4545
510000 472.2404
520000 471.0384
530000 469.8483
540000 468.6702
550000 467.5038
560000 466.3491
570000 465.2059
580000 464.0743
590000 462.9539
600000 461.8449
610000 460.7469
620000 459.6601
630000 458.584

In [8]:
#weights result
w=np.round(w,3)
w

0    179.468
1      1.858
2     29.011
3    631.722
dtype: float64

In [9]:
#constant result
b=np.round(b,3)
b

2253.282

### Python Package

In [10]:
import statsmodels.api as sm

In [11]:
# Fit the same linear model with statsmodels OLS as a reference for the
# gradient-descent estimates.
# The design matrix gets its own name so that `x` keeps only the predictor
# columns: rebinding `x` here would leave an extra 'const' column behind and
# break a re-run of the gradient-descent cell, whose weight vector has one
# entry per predictor.
x_ols = sm.add_constant(x, prepend = False)  # constant is appended as the last column
model = sm.OLS(y, x_ols)
res = model.fit()
# Rounded in place because the file-writing cell reads the values from res.params.
res.params = np.round(res.params, 3)
res.params

0          90.020
1           1.269
2          23.406
3         722.461
const    3526.422
dtype: float64

### Write File

In [12]:
# Build the coefficient report for both estimation methods, then write it out.
text1 = "Coefficients by Gradient Descent Method\n---------------\n"
text1 += "Constant:   "+str(b)+"\n"
for i in range(1,5):
    text1 += "Beta"+str(i)+":   "+str(w.iloc[i-1])+"\n"
text1 += "\nCoefficients by Statsmodels\n---------------\n"
text1 += "Constant:   "+str(res.params['const'])+"\n"
for i in range(1,5):
    # Positional access: res.params mixes integer labels with the string
    # 'const', and the four slopes come first because the constant was appended.
    text1 += "Beta"+str(i)+":   "+str(res.params.iloc[i-1])+"\n"

# The context manager closes the file even if the write raises.
with open("HW3_output_descent.txt",'w') as f:
    f.write(text1)

## Problem #2

### Prompt

In [13]:
train=input('Please enter the name of the training data file (e.g. veh.dat): ')

Please enter the name of the training data file (e.g. veh.dat): veh.dat


In [14]:
df_train=pd.read_csv(train, header=None)
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,93,35,72,172,62,7,149,44,19,124,169,334,125,62,5,30,203,210,1
1,95,57,104,228,74,10,212,31,24,175,224,670,223,74,0,4,186,193,1
2,103,54,107,189,56,11,223,30,25,174,225,729,200,70,0,29,187,201,1
3,79,40,80,133,55,7,147,47,19,135,172,311,144,76,8,30,181,193,1
4,99,46,105,209,64,11,197,34,23,152,212,575,159,65,0,33,194,205,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,83,37,49,112,55,5,122,55,17,128,144,219,146,85,8,16,180,184,4
421,86,36,70,143,61,9,133,50,18,130,153,266,127,66,2,10,194,202,4
422,83,40,53,114,53,6,132,53,18,140,142,247,157,86,8,7,176,183,4
423,88,43,84,136,55,11,154,44,19,150,174,350,164,73,6,2,185,196,4


In [15]:
test=input('Please enter the name of the test data file: (e.g. vehtest.dat): ')

Please enter the name of the test data file: (e.g. vehtest.dat): vehtest.dat


In [16]:
df_test=pd.read_csv(test, header=None)
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,100,45,100,209,65,8,201,32,23,147,231,611,189,72,5,5,189,195,1
1,100,51,109,231,70,11,220,30,25,163,238,722,206,73,11,19,189,198,1
2,102,43,96,197,63,10,185,36,22,142,202,513,139,65,8,12,195,204,1
3,80,34,42,110,57,3,114,59,17,119,131,191,121,87,4,7,179,183,1
4,104,58,103,230,69,11,219,30,25,176,231,716,246,71,7,4,187,196,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,89,47,80,131,54,11,160,43,20,163,175,369,174,77,1,7,182,193,4
332,86,43,68,150,64,9,138,48,18,143,161,285,174,69,6,0,192,201,4
333,86,34,62,140,61,7,122,54,17,127,141,223,112,64,2,14,200,208,4
334,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,4


In [17]:
method=input('Please choose between regression and classification (1 = regression or 2 = classification): ')

Please choose between regression and classification (1 = regression or 2 = classification): 2


### Regression

In [18]:
def regression(y,x):
    """Fit a multiple linear regression with an intercept by least squares.

    Parameters
    ----------
    y : array-like of length n
        Response values.
    x : DataFrame or Series with n rows
        Predictor values, one column per predictor. Rows are matched to ``y``
        by position, not by index label.

    Returns
    -------
    b_hat : DataFrame, shape (p + 1, 1)
        Coefficients rounded to 3 decimals; row 0 is the intercept and rows
        1..p follow the column order of ``x``.
    y_hat : DataFrame, shape (n, 1)
        Fitted values computed from the rounded coefficients, indexed like ``x``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of observations.
    """
    x_frame = pd.DataFrame(x)
    y_vec = np.asarray(y, dtype=float).reshape(-1)
    n = len(y_vec)
    if len(x_frame) != n:
        raise ValueError("x and y must have the same number of observations")

    # Build the design matrix positionally. Concatenating a default-indexed
    # column of ones with x would align on index labels and inject NaN rows
    # whenever x does not carry a 0..n-1 index.
    design = np.column_stack([np.ones(n), x_frame.to_numpy(dtype=float)])
    z = pd.DataFrame(design, index=x_frame.index)

    # Least-squares solve instead of explicitly inverting Z'Z: same estimate
    # for a full-rank design, but numerically more stable. For a rank-deficient
    # design lstsq returns the minimum-norm solution.
    coef, *_ = np.linalg.lstsq(design, y_vec, rcond=None)
    b_hat = pd.DataFrame(coef).round(3)

    y_hat = z.dot(b_hat)

    return b_hat, y_hat

In [19]:
if method == '1':
    # Prompt for the data file, its layout and the output file name.
    import_name=input("Please enter the name of the data file: ")
    encode_sep=int(input("Please select the separator used in the data file (1 = whitespace or 2 = comma): "))
    separator = " " if encode_sep == 1 else ","  # anything other than 1 means comma
    res_pos =int(input("Please enter the position of the response variable column (select from 1 to p): "))
    header=input("Does the data file include a column header? (y/n) : ")
    export_name=input("Please enter the name for the file to be exported (e.g. result.txt) : ")

    # Read the data; "infer" is the read_csv default used when a header row exists.
    data=pd.read_csv(import_name, sep=separator, header="infer" if header=="y" else None)

    # Without this check a position of 0 would silently select the LAST
    # column through negative indexing.
    if not 1 <= res_pos <= data.shape[1]:
        raise ValueError("response position must be between 1 and " + str(data.shape[1]))

    Y = data.iloc[:,res_pos-1] #response variable
    Z_col = data.columns.delete(res_pos-1) #columns of predictor variables

    # Multiple linear regression through the shared helper instead of a second
    # inline copy of the same matrix algebra.
    B_hat, Y_hat = regression(Y, data[Z_col])
    Y_hat=round(Y_hat,3)

    #Calculate R^2
    SSE = ((Y-Y_hat[0])**2).sum()
    SST = ((Y-Y.mean())**2).sum()
    R_square = round(1-SSE/SST,4)

    #Calculate MSE
    n = len(data)
    p = len(B_hat) # number of estimated coefficients, intercept included
    MSE = round(SSE/(n-p),4)

    text2 = "Coefficients\n-------------\nConstant: "+str(B_hat.iloc[0,0])+"\n"
    for i in range(1,p):
        text2 += "Beta"+str(i)+": "+str(B_hat.iloc[i,0])+"\n"
    text2 += "\nID, Actual values, Fitted values\n--------------------------------\n"
    # Preview at most five observations; a shorter data set no longer raises.
    for j in range(min(5, n)):
        text2 += str(j+1) + ", " + str(Y.iloc[j]) + ", " + str(Y_hat.iloc[j,0]) + "\n"
    text2 += "(continue)\n\nModel Summary\n-------------\nR-square = " + str(R_square) + "\n"
    text2 += "MSE = " + str(MSE)

    # The context manager closes the file even if the write raises.
    with open(export_name,'w') as f:
        f.write(text2)

### Classification

In [20]:
def lda(x_lda, y_lda, x_new=None):
    """Classify observations with linear discriminant analysis (equal priors).

    Class means and the pooled within-class covariance are estimated from
    ``x_lda`` / ``y_lda``. Each observation is then assigned to the class
    whose linear discriminant score is largest.

    Parameters
    ----------
    x_lda : DataFrame (n rows, p columns)
        Feature values used to estimate the rule.
    y_lda : array-like of length n
        Class label of each row of ``x_lda``; matched to rows by position.
    x_new : DataFrame (m rows, p columns), optional
        Observations to classify. When omitted, the rows of ``x_lda`` are
        classified (the original behaviour).

    Returns
    -------
    DataFrame
        One column, 'Classified into TYPE', holding the predicted label of
        each classified row, indexed 0..m-1.

    Raises
    ------
    ValueError
        If the inputs disagree in length or feature count, or if there are
        not more observations than classes.
    numpy.linalg.LinAlgError
        If the pooled covariance matrix is singular.
    """
    train = np.asarray(pd.DataFrame(x_lda), dtype=float)
    labels = np.asarray(y_lda).reshape(-1)
    if train.shape[0] != labels.shape[0]:
        raise ValueError("x_lda and y_lda must have the same number of rows")

    # Sorted distinct labels. Group sizes come from the groups themselves:
    # unpacking value_counts(sort=False) depends on the order in which labels
    # appear and mislabels the sizes when the data are not sorted by class.
    classes = np.unique(labels)
    g = len(classes)
    n = train.shape[0]
    if n <= g:
        raise ValueError("need more observations than classes to pool the covariance")
    p = 1/g #Assume equal priors

    means = np.vstack([train[labels == c].mean(axis=0) for c in classes])

    # Pooled covariance: the within-class scatter of group k equals
    # (n_k - 1) * S_k, so summing the scatters and dividing by (n - g)
    # reproduces the usual pooled estimate.
    scatter = np.zeros((train.shape[1], train.shape[1]))
    for c, mu in zip(classes, means):
        centered = train[labels == c] - mu
        scatter += centered.T @ centered
    Sp = scatter / (n - g)

    # Row k of coef is mean_k' Sp^-1, solved once for all classes instead of
    # inverting Sp for every observation.
    coef = np.linalg.solve(Sp, means.T).T
    intercept = -0.5 * np.sum(coef * means, axis=1) + np.log(p)

    targets = train if x_new is None else np.asarray(pd.DataFrame(x_new), dtype=float)
    if targets.shape[1] != train.shape[1]:
        raise ValueError("x_new must have the same number of columns as x_lda")

    # Linear discriminant score of every observation for every class;
    # argmax keeps the first class on ties.
    scores = targets @ coef.T + intercept
    predicted = classes[np.argmax(scores, axis=1)]

    return pd.DataFrame({'Classified into TYPE': predicted})

#### Train data

In [21]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,93,35,72,172,62,7,149,44,19,124,169,334,125,62,5,30,203,210,1
1,95,57,104,228,74,10,212,31,24,175,224,670,223,74,0,4,186,193,1
2,103,54,107,189,56,11,223,30,25,174,225,729,200,70,0,29,187,201,1
3,79,40,80,133,55,7,147,47,19,135,172,311,144,76,8,30,181,193,1
4,99,46,105,209,64,11,197,34,23,152,212,575,159,65,0,33,194,205,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,83,37,49,112,55,5,122,55,17,128,144,219,146,85,8,16,180,184,4
421,86,36,70,143,61,9,133,50,18,130,153,266,127,66,2,10,194,202,4
422,83,40,53,114,53,6,132,53,18,140,142,247,157,86,8,7,176,183,4
423,88,43,84,136,55,11,154,44,19,150,174,350,164,73,6,2,185,196,4


In [22]:
# Training features: the first 18 columns (positions 0-17) of the training frame.
x_lda_train = df_train.iloc[:,:18]
x_lda_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,93,35,72,172,62,7,149,44,19,124,169,334,125,62,5,30,203,210
1,95,57,104,228,74,10,212,31,24,175,224,670,223,74,0,4,186,193
2,103,54,107,189,56,11,223,30,25,174,225,729,200,70,0,29,187,201
3,79,40,80,133,55,7,147,47,19,135,172,311,144,76,8,30,181,193
4,99,46,105,209,64,11,197,34,23,152,212,575,159,65,0,33,194,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,83,37,49,112,55,5,122,55,17,128,144,219,146,85,8,16,180,184
421,86,36,70,143,61,9,133,50,18,130,153,266,127,66,2,10,194,202
422,83,40,53,114,53,6,132,53,18,140,142,247,157,86,8,7,176,183
423,88,43,84,136,55,11,154,44,19,150,174,350,164,73,6,2,185,196


In [23]:
# Training class labels: the column labelled 18 of the training frame.
y_lda_train = df_train[18]
y_lda_train

0      1
1      1
2      1
3      1
4      1
      ..
420    4
421    4
422    4
423    4
424    4
Name: 18, Length: 425, dtype: int64

#### Test data

In [24]:
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,100,45,100,209,65,8,201,32,23,147,231,611,189,72,5,5,189,195,1
1,100,51,109,231,70,11,220,30,25,163,238,722,206,73,11,19,189,198,1
2,102,43,96,197,63,10,185,36,22,142,202,513,139,65,8,12,195,204,1
3,80,34,42,110,57,3,114,59,17,119,131,191,121,87,4,7,179,183,1
4,104,58,103,230,69,11,219,30,25,176,231,716,246,71,7,4,187,196,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,89,47,80,131,54,11,160,43,20,163,175,369,174,77,1,7,182,193,4
332,86,43,68,150,64,9,138,48,18,143,161,285,174,69,6,0,192,201,4
333,86,34,62,140,61,7,122,54,17,127,141,223,112,64,2,14,200,208,4
334,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,4


In [25]:
# Test features: the first 18 columns (positions 0-17) of the test frame.
x_lda_test = df_test.iloc[:,:18]
x_lda_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,100,45,100,209,65,8,201,32,23,147,231,611,189,72,5,5,189,195
1,100,51,109,231,70,11,220,30,25,163,238,722,206,73,11,19,189,198
2,102,43,96,197,63,10,185,36,22,142,202,513,139,65,8,12,195,204
3,80,34,42,110,57,3,114,59,17,119,131,191,121,87,4,7,179,183
4,104,58,103,230,69,11,219,30,25,176,231,716,246,71,7,4,187,196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,89,47,80,131,54,11,160,43,20,163,175,369,174,77,1,7,182,193
332,86,43,68,150,64,9,138,48,18,143,161,285,174,69,6,0,192,201
333,86,34,62,140,61,7,122,54,17,127,141,223,112,64,2,14,200,208
334,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207


In [26]:
# Test class labels: the column labelled 18 of the test frame.
y_lda_test = df_test[18]
y_lda_test

0      1
1      1
2      1
3      1
4      1
      ..
331    4
332    4
333    4
334    4
335    4
Name: 18, Length: 336, dtype: int64

### LDA

In [27]:
def _confusion_matrix(actual, predicted, n_classes=4):
    """Count (actual, predicted) label pairs for classes 1..n_classes.

    Entry [a-1, p-1] is the number of observations whose actual class is ``a``
    and whose predicted class is ``p``. Pairs containing a label outside
    1..n_classes are ignored.
    """
    valid = range(1, n_classes + 1)
    counts = np.zeros((n_classes, n_classes), dtype=int)
    for a, p in zip(actual, predicted):
        if a in valid and p in valid:
            counts[int(a) - 1, int(p) - 1] += 1
    return counts


if method == '2':
    #Classification using LDA
    train_result = lda(x_lda_train, y_lda_train)
    # NOTE(review): lda() estimates the class means and pooled covariance from
    # the data it is given, so this call refits the rule on the test set
    # instead of applying the rule learned from the training set. The "test"
    # accuracy below is therefore an in-sample figure - confirm whether that
    # is intended.
    test_result = lda(x_lda_test, y_lda_test)

    #Confusion matrix (rows = actual class, columns = predicted class)
    conf_arr_train = _confusion_matrix(y_lda_train.to_numpy(),
                                       train_result.iloc[:,0].to_numpy())
    conf_arr_test = _confusion_matrix(y_lda_test.to_numpy(),
                                      test_result.iloc[:,0].to_numpy())

    #Accuracy = share of observations on the diagonal
    #Train data
    acc_train = np.trace(conf_arr_train)/len(df_train)
    acc_train = np.round(acc_train,3)

    #Test data
    acc_test = np.trace(conf_arr_test)/len(df_test)
    acc_test = np.round(acc_test,3)

In [28]:
#Confusion matrix of train data
print(conf_arr_train)

[[ 76  23   2   4]
 [ 30  76   1   3]
 [  2   1 106   1]
 [  1   0   0  99]]


In [29]:
#Accuracy of train data
print(acc_train)

0.84


In [30]:
#Confusion matrix of test data
print(conf_arr_test)

[[59 24  1  2]
 [32 45  3  5]
 [ 0  1 84  1]
 [ 0  2  1 76]]


In [31]:
#Accuracy of test data
print(acc_test)

0.786


### Write File

In [33]:
def _confusion_report(label, conf_arr, accuracy):
    """Format one 4x4 confusion matrix and its overall accuracy as report text.

    ``label`` is the data-set name shown in the two headings (for example
    "Training"). Rows are labelled with the actual class 1..4, and every count
    is right-aligned under its predicted-class column header.
    """
    row_tags = ["Actual", "Class", "", ""]
    text = "Confusion Matrix (" + label + ")\n----------------------------\n"
    text += "                  Predicted Class\n"
    text += "                   1      2      3      4\n"
    for i in range(4):
        # Class labels are 1-based to match the header (they were printed as
        # 0..3), and every row uses the same 10-character prefix.
        text += row_tags[i].ljust(10) + str(i+1)
        # Header digits sit at columns 19, 26, 33 and 40.
        text += str(conf_arr[i,0]).rjust(9)
        for j in range(1,4):
            text += str(conf_arr[i,j]).rjust(7)
        text += "\n"
    text += "\nModel Summary (" + label + ")\n--------------------------\n"
    text += "Overall accuracy = "+str(accuracy)+"\n\n\n"
    return text


# The confusion matrices only exist when classification was selected.
if method == '2':
    text3 = _confusion_report("Training", conf_arr_train, acc_train)
    text3 += _confusion_report("Test", conf_arr_test, acc_test)
    # The context manager closes the file even if the write raises.
    with open("HW3_output_LDA.txt",'w') as f:
        f.write(text3)