# Dataset

1. load csv file (panda, numpy)
2. split dataset. Example code:()
   ```
   random.shuffle(data) # change if you are using pandas dataframe
   training = data[:int(len(data)*0.8)]
   test = data[int(len(data)*0.8):]

   fold5 = KFold(5) # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
   for train_idx, val_idx in fold5.split(training):
      sub_val = training[val_idx]
      sub_train = training[train_idx]
      clf = model(sub_train, sub_val, ...) # training the model, and evaluate it on validation dataset
      performance(clf, test) # test the model on test dataset
   ```

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

data = pd.read_csv("spambase.csv")

data.sample(frac=1) # change if you are using pandas dataframe
training = data[:int(len(data)*0.8)]
test = data[int(len(data)*0.8):]

fold5 = KFold(5) # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

for train_idx, val_idx in fold5.split(training):
   sub_val = training[val_idx]
   sub_train = training[train_idx]
   clf = model(sub_train, sub_val, ...) # training the model, and evaluate it on validation dataset
   performance(clf, test) # test the model on test dataset

#Naive bayes

1. model learning:

   Note:

   features: remove attributes that is not related to word (the last four attributes)

   labels: the last column

   count P(c) -> how many samples are positive, and how many are negtive

   if freq_word>0, then this word exists. You could use this to calculate P(a|c) -> for each class, what is the prob of each word

   remember to use laplace smoothing.

2. model evaluation (on val dataset -> performance(model, val)):
   
   for each new sample, $\prod{P(a|c)}P(c)$ if word is in the email(freq_word > 0); and find the maximum class
   

   

In [17]:
from math import exp

def separate_by_class(data):
   sep_list = dict()
   for i in range(len(data)-4):
      arr = data.iloc[i]
      class_val = arr[-1]
   if (class_val not in sep_list):
      sep_list[class_val] = list()
      sep_list[class_val].append(arr)
   return sep_list

def stdev(data):
    avg = sum(data)/len(data.index)
    z = 0
    for i in data:
      z += (i-avg) ** 2
    z /= float(data.shape[0]-1)
    return z ** 0.5

def summarize_dataset(data):
   sum_list = []
   for col in zip(*data):
      sum_list.append(sum(data)/len(data.index), stdev(col), len(col.index))
   return sum_list

def summarize_by_class(dataset):
   sep_list = separate_by_class(dataset)
   sum_list = dict()
   for class_val, rows in sep_list.items():
      sum_list[class_val] = summarize_dataset(rows)
   return sum_list

def calculate_probability(x, avg, stdev):
   exponent = exp(-((x-avg)**2 / (2 * stdev ** 2 )))
   return exponent * (1 / ((2 * 3.1415926535) ** 0.5) * stdev)

def calculate_class_probabilities(row, sum_list):
   for i in sum_list:
      rows += sum_list[i][0][2]
   prob_list = dict()
   for class_val, sum_list in sum_list.items():
     prob_list[class_val] = sum_list[class_val][0][2]/rows
   for i in range(len(sum_list)):
     avg, stdev, count = sum_list[i]
     prob_list[class_val] *= calculate_probability(row[i], avg, stdev)
   return prob_list

def predict(row, sum_list):
   prob_list = calculate_class_probabilities(sum_list, row)
   best_label, best_prob = None, -1
   for class_value, probability in prob_list.items():
      if best_label is None or probability > best_prob:
         best_prob = probability
   best_label = class_value
   return best_label

def naive_model(training_data, test_data):
   sum_list = summarize_by_class(training_data)
   predictions = list()
   for row in test:
      output = predict(row, sum_list)
      predictions.append(output)
   return(predictions)

print(naive_model(training, test))

TypeError: object of type 'builtin_function_or_method' has no len()

# KNN
1. model learning: None

2. model evaluation(on val dataset): You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.

   ```
   Note:
   parallel programing
   numpy.cos() to calcuate the similarity
   ```

In [21]:
def euc_dist(row1, row2):
   dist = 0
   for i in range(len(row1)-1):
      dist += (row1[i] - row2[i]) ** 2
   return dist ** 0.5

def get_neighbors(training_data, test_row, num_neighbors):
   dist_list = list()
   for training_row in training_data:
      dist = euc_dist(test_row, training_row)
      dist_list.append((training_row, dist))
      dist_list.sort(key=lambda tup: tup[1])
   neighbor_list = list()
   for i in range(num_neighbors):
      neighbor_list.append(dist_list[i][0])
   return neighbor_list

def predict_classification(training_data, test_row, num_neighbors):
   neighbors = get_neighbors(training_data, test_row, num_neighbors)
   outputs = []
   for i in neighbors:
      outputs.append(i[-1])
   prediction = max(set(outputs), key=outputs.count)
   return prediction

def k_nearest_neighbors(training_data, test_data, num_neighbors):
   prob_list = list()
   for row in test_data:
      output = predict_classification(training_data, row, num_neighbors)
      prob_list.append(output)
   return(prob_list)

# LR

1. model learning: You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.
    
    $y = sigmoid(MX)$

step 1: add one more column (all value is 1) in X -> X' = np.c_[np.ones((len(X), 1)), X]

step 2:vector M = np.random.randn(len(X[0])+1, 1);

key formula for step 3 (Note: n is the size of the TRAINING dataset; $cdot$ is dot production ):

1. $pred_y = sigmoid(M\cdot X')$

2. $loss = -\sum(y\cdot log(pred_y)+(1-y)\cdot log(1-pred_y))/n$

3. $gm=X'\cdot (pred_y - y)*2/n$

Step 3 example code:
   ```
   #Step 3: performing gradient descent on whole dataset:
   best_model = M
   best_performace = 0
   for i in range(epoch):
     pred_y = ...
     gm = ...
     _p = performace(model, val)
     if _p > best_performance:
        best_model = M
        best_performance = _p
     M = M - learning_rate*gm
   ```

2. model evaluation(on val dataset):
  
   calculate pred_y, if more than 0.5, then the predicted label is 1.

In [22]:
def predict(row, coefficients):
   predY = coefficients[0]
   for i in range(len(row)-1):
      predY += coefficients[i + 1] * row[i]
   return predY

def coefficients_sgd(training_data, l_rate, n_epoch):
   coeff = []
   for i in range(len(training_data[0])):
      coeff.append(0)
   for epoch in range(n_epoch):
      sum_error = 0
      for row in training_data:
         yhat = predict(row, coeff)
      error = yhat - row[-1]
      sum_error += error**2
   coeff[0] = coeff[0] - l_rate * error
   for i in range(len(row)-1):
      coeff[i + 1] = coeff[i + 1] - l_rate * error * row[i]
   return coeff

def linear_regression_sgd(training_data, test_data, l_rate, n_epoch):
   predictions = list()
   coeff = coefficients_sgd(training_data, l_rate, n_epoch)
   for row in test_data:
      predY = predict(row, coeff)
      predictions.append(predY)
   return(predictions)

# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

In [23]:
def performance(model, data):
  pred_list = []
  other_list = []
  result = 0
  for i in range(len(data.index)):
     pred_list.append(predict(i))

  for i in range(len(pred_list)):
     if(pred_list[i] == other_list[i]):
        result = result + 1
  return result/len(pred_list)