In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

# Question 1

In [None]:
def read_file(filename):
    return pd.read_csv(os.getcwd()+"/data/"+filename, sep=" ", header=0, names=["x0","x1","y"])

In [None]:
df = read_file("D2z.txt")

In [None]:
ax1 = df.plot.scatter(x="x0",y="x1",c="y",colormap="plasma")
plt.xticks(np.arange(-2,2, 0.5))
plt.yticks(np.arange(-2,2,0.5))
plt.show()

In [None]:
def get_NN(df,v):
    df["distance_"] = np.sqrt((df["x0"]-v[0])**2 + (df["x1"]-v[1])**2)
    return df.iloc[[df.idxmin()["distance_"]]]

In [None]:
# pts = []
# for x0 in np.arange(-2,2.1,0.1):
#     for x1 in np.arange(-2,2.1,0.1):
#         pts.append([x0,x1,get_NN(df,[x0,x1]).iloc[0]["y"]])

# for pt in pts:
#     if pt[2]:
#         plt.plot(pt[0],pt[1],"o",c="blue")
#     else:
#         plt.plot(pt[0],pt[1],"o",c="red")

# plt.show()

In [None]:
pts = []
for x0 in np.arange(-2,2.1,0.1):
    for x1 in np.arange(-2,2.1,0.1):
        pts.append([x0,x1,get_NN(df,[x0,x1]).iloc[0]["y"]])
        
nn_df = pd.DataFrame(pts)
nn_df.columns = ["x0","x1","y"]

fig, ax = plt.subplots()

df.plot.scatter(x="x0",y="x1",c="y",colormap="plasma", ax=ax)
nn_df.plot.scatter(x="x0",y="x1",c="y",colormap="plasma", alpha=0.2, ax=ax)

# Question 2

In [2]:
# returns dataframe of data after dropping email No. column
def read_file(filename):
    df = pd.read_csv(os.getcwd()+"/data/"+filename, sep=",")
    return df.drop(df.columns[0], axis=1)

# return data with indexes between a and b as test_df and all other data as data_df
# drops the prediction column from both dataframes
def split_data(df, a, b):    
    train_df = df.drop(df.index[range(a,b)])
    train_df = train_df.drop(train_df.columns[-1], axis=1)
    
    test_df = df.drop(df.index[range(0,a)]).drop(df.index[range(b,len(df.index))])
    test_df = test_df.drop(test_df.columns[-1], axis=1)
    return train_df, test_df

#returns the index of the k nearest neighbors as an array
def get_nn(train_df, test_df, k):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute').fit(train_df)
    distances, nns = nbrs.kneighbors(test_df)
    
    return nns

def get_prediction(df, nns, i):
    pos_predictions = 0
    for j in range(len(nns[i])):
        if df.iloc[nns[i][j]]["y"]:
            pos_predictions += 1
    if pos_predictions/len(nns[i]) >= 0.5:
        prediction = 1
    else:
        prediction = 0
        
    return prediction

# numerator: (TP + TN)
# denominator: (all datapoints)
def get_accuracy(df, nns, a):

    num_correct = 0
    for i in range(len(nns)):
        real_val = df.iloc[i+a]["y"]
        prediction = get_prediction(df, nns, i)
        
        if real_val and prediction:
            num_correct += 1
        if not real_val and not prediction:
            num_correct += 1

    return num_correct / 1000

# numerator: (TP)
# denominator: (TP + FP)
def get_precision(df, nns, a):
    
    tp = 0
    fp = 0
    for i in range(len(nns)):
        real_val = df.iloc[i+a]["y"]
        prediction = get_prediction(df, nns, i)
        
        if real_val and prediction:
            tp += 1
        if not real_val and prediction:
            fp += 1
    
    return tp/(tp+fp)

# numerator: (tp)
# denominator: (tp + fn)
def get_recall(df, nns, a):
    
    tp = 0
    fn = 0
    for i in range(len(nns)):
        real_val = df.iloc[i+a]["y"]
        prediction = get_prediction(df, nns, i)
        
        if real_val and prediction:
            tp += 1
        if real_val and not prediction:
            fn += 1
    
    return tp/(tp + fn)

In [None]:
df = read_file("emails.csv")
folds = [[0,1000],[1000,2000],[2000,3000],[3000,4000],[4000,5000]]

k=1
for fold in folds:
    train_df, test_df = split_data(df, fold[0], fold[1])
    nns = get_nn(train_df, test_df, k)

    
    accuracy = get_accuracy(df, nns, fold[0])
    precision = get_precision(df, nns, fold[0])
    recall = get_recall(df, nns, fold[0])
    
    print("accuracy: " + str(accuracy))
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("---")

# Question 4

In [None]:
df = read_file("emails.csv")
folds = [[0,1000],[1000,2000],[2000,3000],[3000,4000],[4000,5000]]
ks = [1,3,5,7,10]

ave_accuracies = []
for k in ks:
    
    accuracies = []
    for fold in folds:
        train_df, test_df = split_data(df, fold[0], fold[1])
        nns = get_nn(train_df, test_df, k)

        accuracies.append(get_accuracy(df, nns, fold[0]))
    
    ave_accuracies.append(np.array(accuracies).mean())

In [None]:
for accuracy in ave_accuracies:
    print(accuracy)

# Question 3

In [19]:
#ONLY WORKS ON FIRST SPLIT
def get_y(df, a):
    
#     y = (df.loc[np.r_[0:a, a+1000:5000]:, df.columns == 'Prediction']).to_numpy()
    y = (df.loc[np.r_[0:a, a+1000:5000], df.columns == 'Prediction']).to_numpy()
    
    return y
    
    y = df.loc[1000:, df.columns == 'Prediction']
    return y.to_numpy()

def sigmoid(theta, x):
    return 1/(1+np.exp(-x@theta))

def take_step(theta, eta, x, y):
    n = train_df.shape[0]
    return theta - 1/n * eta*np.matmul(x.T,(sigmoid(theta,x) - y))

def train_regression(df, train_df, eta, steps, a):
    theta = np.zeros((3001,1))
    x = train_df.to_numpy()
    y = get_y(df,a)
    
    for i in range(steps):
        theta = take_step(theta, eta, x, y)
        
    return theta

def make_predictions(theta, test_df):
    x = test_df.to_numpy()
    return (sigmoid(theta, x) > 0.5)*1

def get_accuracy(df, predictions, a):
    
    num_correct = 0
    for i in range(len(predictions)):
                
        real_val = df.iloc[i+a]["Prediction"]
        prediction = predictions[i][0]
    
        if real_val and prediction:
            num_correct += 1
        if not real_val and not prediction:
            num_correct += 1

    return num_correct / 1000

In [22]:
df = read_file("emails.csv")
folds = [[0,1000],[1000,2000],[2000,3000],[3000,4000],[4000,5000]]
eta = 0.0001
steps = 10


accuracies = []
precisions = []
recalls = []
for i in range(len(folds)):
    a = folds[i][0]
    b = folds[i][1]
    
    train_df, test_df = split_data(df, a, b)
    train_df["b_"] = 1
    test_df["b_"] = 1
    
    theta = train_regression(df, train_df, eta, steps, a)
    predictions = make_predictions(theta, test_df)
    
    accuracies.append(get_accuracy(df, predictions, a))
    precisions.append(1)
    recalls.append(1)
    
for a in accuracies:
    print(a)

0.72
0.741
0.824
0.713
0.7


In [None]:
theta = train_regression(df, train_df, eta, steps)

for i in theta:
    print(i)
print(theta.shape)

In [32]:
predictions = make_predictions(theta, test_df)
print(predictions)
print(get_accuracy(df, predictions, 0))

1000
0.707


[0 1 2]
[4 5]


(5,)
