In [None]:
from numpy.core.numeric import identity
import numpy as np

def import_data(filepath):
  """Load a whitespace-delimited text file of (x1, x2, y) rows.

  Each non-blank line must hold at least three numeric fields: the first two
  become one feature pair and the third becomes the label.

  Args:
    filepath: path of the text file to read.

  Returns:
    A tuple (X, y): X is an array with one (x1, x2) row per data line and y is
    the array of the corresponding labels.

  Raises:
    ValueError: if one of the first three fields of a line is not a number.
    IndexError: if a non-blank line has fewer than three fields.
  """
  X = []
  y = []
  # The context manager closes the file even when a row fails to parse.
  with open(filepath) as f:
    for lines in f:
      line = lines.split()
      if not line:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
      X.append((float(line[0]), float(line[1])))
      y.append(float(line[2]))
  return (np.array(X), np.array(y))

def phi(x1, x2):
  """Map the point (x1, x2) to its 8-term nonlinear feature tuple.

  The terms, in order, are:
  1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|.
  """
  difference = x1 - x2
  total = x1 + x2
  features = (
      1,
      x1,
      x2,
      x1**2,
      x2**2,
      x1*x2,
      abs(difference),
      abs(total),
  )
  return features

# Keep only the first K + 1 terms of phi (phi_0 through phi_K) for every point.
def transformation(X, K):
  """Apply phi to each (x1, x2) point in X, truncated to its first K + 1 terms.

  Args:
    X: iterable of 2-element points.
    K: index of the last phi term to keep.

  Returns:
    An array with one truncated feature row per input point.
  """
  return np.array([phi(x1, x2)[:K + 1] for x1, x2 in X])

def predict(w, x):
  """Classify x with weights w: the sign of their dot product."""
  signal = np.dot(w, x)
  return np.sign(signal)

def w_lin(Z, y):
  """Return the least-squares linear-regression weights w for Z w ~= y.

  The weights are computed as pinv(Z) . y. For a full-column-rank Z this equals
  (Z^T Z)^-1 Z^T y, but it avoids explicitly inverting Z^T Z, which loses
  precision when Z is ill-conditioned and raises LinAlgError when Z is
  rank-deficient. In the rank-deficient case the minimum-norm least-squares
  solution is returned instead.

  Args:
    Z: 2-D array of feature rows.
    y: 1-D array of targets, one per row of Z.

  Returns:
    1-D array of weights, one per column of Z.
  """
  return np.dot(np.linalg.pinv(Z), y)

def err(w, X, y):
  """Return the fraction of rows of X whose prediction differs from y.

  Args:
    w: weight vector passed to predict.
    X: sequence of feature rows.
    y: sequence of labels, indexed in step with X.
  """
  total = len(X)
  mistakes = sum(1 for i in range(total) if y[i] != predict(w, X[i]))
  return float(mistakes) / total


def train_validate_split(data,  N):
  """Split data into its first N items and everything after them."""
  head = data[:N]
  tail = data[N:]
  return head, tail

def sim(K):
  """Fit linear regression on the first 25 points of in.txt with features 0..K.

  The remaining in.txt points are held out for validation and out.txt is used
  as the out-of-sample set.

  Returns:
    (err_in, err_val, err_out): classification error on the training points,
    on the held-out validation points, and on the out.txt points.
  """
  split_at = 25
  X_in, y_in = import_data("in.txt")
  X_out, y_out = import_data("out.txt")

  # Transform the in-sample data, then split features and labels identically.
  Z_in = transformation(X_in, K)
  train_Z, validation_Z = train_validate_split(Z_in, split_at)
  train_y, validation_y = train_validate_split(y_in, split_at)
  weights = w_lin(train_Z, train_y)

  Z_out = transformation(X_out, K)

  return (
      err(weights, train_Z, train_y),
      err(weights, validation_Z, validation_y),
      err(weights, Z_out, y_out),
  )

# Candidate values of K: the model for each K uses the features phi_0 .. phi_K.
K_vals = [3, 4, 5, 6, 7]

def lowest_val_error(K_vals):
  """Print the validation error of sim(k) for every k and report the lowest.

  The search starts from the defaults k = 3 and error = 1, so only an error
  strictly below 1 replaces them, and ties keep the earliest k.
  """
  best_k, best_err = 3, 1
  for k in K_vals:
    _err_in, val_err, _err_out = sim(k)
    print(val_err)
    if val_err < best_err:
      best_k, best_err = k, val_err
  print("The model with the best validation error is k = ", best_k,
        " with an error of: ", best_err, sep="")

# Print the validation error for every candidate K, then the best K.
lowest_val_error(K_vals)

0.3
0.5
0.2
0.0
0.1
The model with the best validation error is k = 6 with an error of: 0.0


In [None]:
def lowest_err_out(K_vals):
  """Print the out-of-sample error of sim(k) for every k and report the lowest.

  The search starts from the defaults k = 3 and error = 1, so only an error
  strictly below 1 replaces them, and ties keep the earliest k.
  """
  best_k, best_err = 3, 1
  for k in K_vals:
    _err_in, _err_val, out_err = sim(k)
    print(out_err)
    if out_err < best_err:
      best_k, best_err = k, out_err
  print("The model with the best error out is k = ", best_k,
        " with an error of: ", best_err, sep="")

# Print the out-of-sample error for every candidate K, then the best K.
lowest_err_out(K_vals)

0.42
0.416
0.188
0.084
0.072
The model with the best error out is k = 7 with an error of: 0.072


In [None]:
def sim_reversed_train_validate(K):
  """Same experiment as sim, but with the two halves of in.txt swapped.

  The weights are fit on the points after the first 25 and validated on the
  first 25; out.txt is still the out-of-sample set.

  Returns:
    (err_in, err_val, err_out): classification error on the points used for
    fitting, on the first 25 points (now the validation set), and on out.txt.
  """
  split_at = 25
  X_in, y_in = import_data("in.txt")
  X_out, y_out = import_data("out.txt")

  Z_in = transformation(X_in, K)
  first_Z, rest_Z = train_validate_split(Z_in, split_at)
  first_y, rest_y = train_validate_split(y_in, split_at)
  # Roles are reversed: fit on the tail of the data instead of the head.
  weights = w_lin(rest_Z, rest_y)

  Z_out = transformation(X_out, K)

  return (
      err(weights, rest_Z, rest_y),
      err(weights, first_Z, first_y),
      err(weights, Z_out, y_out),
  )

def lowest_val_error_reversed(K_vals):
  """Print the validation error of the reversed-split run for every k.

  Uses sim_reversed_train_validate. The search starts from the defaults k = 3
  and error = 1, so only an error strictly below 1 replaces them, and ties
  keep the earliest k.
  """
  best_k, best_err = 3, 1
  for k in K_vals:
    _err_in, val_err, _err_out = sim_reversed_train_validate(k)
    print(val_err)
    if val_err < best_err:
      best_k, best_err = k, val_err
  print("The model with the best validation error is k = ", best_k,
        " with an error of: ", best_err, sep="")

# Repeat the validation-error search with the train and validation sets swapped.
lowest_val_error_reversed(K_vals)

0.28
0.36
0.2
0.08
0.12
The model with the best validation error is k = 6 with an error of: 0.08


In [None]:
def lowest_err_out_reversed(K_vals):
  """Print the out-of-sample error of the reversed-split run for every k.

  Uses sim_reversed_train_validate. The search starts from the defaults k = 3
  and error = 1, so only an error strictly below 1 replaces them, and ties
  keep the earliest k.
  """
  best_k, best_err = 3, 1
  for k in K_vals:
    _err_in, _err_val, out_err = sim_reversed_train_validate(k)
    print(out_err)
    if out_err < best_err:
      best_k, best_err = k, out_err
  print("The model with the best error out is k = ", best_k,
        " with an error of: ", best_err, sep="")

# Repeat the out-of-sample-error search with the train and validation sets swapped.
lowest_err_out_reversed(K_vals)

0.396
0.388
0.284
0.192
0.196
The model with the best error out is k = 6 with an error of: 0.192


In [None]:
import random
# Monte Carlo estimate of the means of e1, e2 and min(e1, e2), where e1 and e2
# are independent draws from random.uniform(0, 1).
N = 100000
# Running averages: every sample adds value / N, so after the loop each
# variable holds the mean over all N samples.
e1 = 0
e2 = 0
e = 0
for i in range(N):
  e1_i = random.uniform(0,1)
  e2_i = random.uniform(0,1)
  e1 += e1_i / N
  e2 += e2_i / N
  e += min(e1_i, e2_i) / N

# Report the three sample means.
print("e1: " + str(e1) + "\n e2: " + str(e2) + "\n e: " + str(e) + "\n")

e1: 0.5016911936892019
 e2: 0.49915115453052683
 e: 0.33338727548378866



# SVM vs. PLA

In [None]:
from sklearn import svm
from sklearn.linear_model import Perceptron
import math

# Bounds of the interval each coordinate of a random point (and of the two
# points defining the target line) is drawn from.
min_val = -1
max_val = 1
# NOTE(review): d is not referenced in the visible cells; presumably the input
# dimensionality. Confirm before relying on or removing it.
d = 2
def create_points(N):
  """Draw N random 2-D points, each coordinate from uniform(min_val, max_val).

  Returns:
    An array with one (x, y) row per point.
  """
  # The first coordinate is drawn before the second, point by point.
  samples = [
      (random.uniform(min_val, max_val), random.uniform(min_val, max_val))
      for _ in range(N)
  ]
  return np.array(samples)

def get_target_function():
  """Pick a random line through two points drawn from the sampling square.

  Returns:
    (m, b): slope and intercept of the line y = m * x + b through both points.
  """
  # Draw order is x1, y1, x2, y2 so a seeded RNG is consumed the same way.
  x1, y1 = random.uniform(min_val, max_val), random.uniform(min_val, max_val)
  x2, y2 = random.uniform(min_val, max_val), random.uniform(min_val, max_val)
  slope = (y1 - y2) / (x1 - x2)
  intercept = y1 - (slope * x1)
  return (slope, intercept)

def create_y(X, target_func):
  """Label each point of X by which side of the target line it falls on.

  Args:
    X: array of (x, y) rows.
    target_func: (slope, intercept) of the target line.

  Returns:
    An array with +1 for points strictly below the line and -1 otherwise.
  """
  slope, intercept = target_func[0], target_func[1]
  labels = [
      1 if X[i][0] * slope + intercept > X[i][1] else -1
      for i in range(X.shape[0])
  ]
  return np.array(labels)


def calculate_disagreement(N, sims=1000, num_points=300):
  """Compare sklearn's Perceptron with a linear SVM on random linear targets.

  Each run draws a random target line, labels N training points and
  num_points test points with it, fits both classifiers on the training set
  and measures their error rate on the test set.

  Args:
    N: number of training points per run.
    sims: number of independent runs to average over (must be > 0).
    num_points: number of test points per run.

  Returns:
    A tuple (pla_err, svm_err, avg_support_vectors, svm_better):
      pla_err: mean test error rate of the perceptron over all runs.
      svm_err: mean test error rate of the SVM over all runs.
      avg_support_vectors: mean number of SVM support vectors per run.
      svm_better: fraction of runs in which the SVM's test error was strictly
        lower than the perceptron's.
  """
  total = 0.0
  total_svm = 0.0
  svm_count = 0
  svm_more_accurate = 0

  for _ in range(sims):
    target_func = get_target_function()
    X_test = create_points(num_points)
    y_test = create_y(X_test, target_func)
    X_train = create_points(N)
    y_train = create_y(X_train, target_func)
    # Redraw the training set until it contains more than one class.
    while len(set(y_train.flatten())) == 1:
      X_train = create_points(N)
      y_train = create_y(X_train, target_func)

    pla = Perceptron()
    pla.fit(X_train, y_train)
    # A very large finite C is used because np.inf was found not to work here.
    clf = svm.SVC(C= 999999999999, kernel='linear')
    clf.fit(X_train, y_train)
    svm_count += len(clf.support_vectors_)

    # score() is the mean accuracy on the test set, so 1 - score is already a
    # per-run error *rate*; it must not be divided by num_points again.
    incorrect = 1 - pla.score(X_test, y_test)
    incorrect_svm = 1 - clf.score(X_test, y_test)
    if(incorrect_svm < incorrect):
      svm_more_accurate += 1

    total += incorrect
    total_svm += incorrect_svm

  pla_err = total / sims
  svm_err = total_svm / sims

  return pla_err, svm_err, (svm_count/ sims), (svm_more_accurate / sims)

# Run the PLA-vs-SVM simulation with N = 10 training points per run.
pla_acc, svm_acc, svm_count, svm_better = calculate_disagreement(10)

In [None]:
# svm_better is the fraction of runs in which the SVM's test error was
# strictly lower than the perceptron's.
print("For N = 10, SVM is better by: " + str(svm_better))

For N = 10, SVM is better by: 0.637


In [None]:
# Run the PLA-vs-SVM simulation again with N = 100 training points per run.
pla_acc, svm_acc, svm_count, svm_better = calculate_disagreement(100)

In [None]:
# svm_better is the fraction of runs in which the SVM's test error was
# strictly lower than the perceptron's.
print("For N = 100, SVM is better by: " + str(svm_better))

For N = 100, SVM is better by: 0.695


In [None]:
# NOTE(review): svm_count is the mean number of support vectors per run (sum
# of len(clf.support_vectors_) divided by the number of runs); the label below
# calls them "vector machines".
print("For N = 100 the average number of vector machines is: " + str(svm_count))

For N = 100 the average number of vector machines is: 2.997
