# Machine Learning Tools

In [3]:
%%capture
import random
import import_ipynb
from gradient_descent_and_data_tools import *

These are a few routines for common ML tasks, such as splitting data into training and test sets, along with quality metrics for assessing how good a model is (accuracy, precision, recall, and F1).

In [3]:
def split_data(data, perc):
    '''Randomly partition the rows of data into two lists.

    Each row is assigned independently: it lands in the first list with
    probability perc and in the second list otherwise, so the resulting
    sizes are only approximately [perc, 1-perc] of the input.

    Returns a tuple (first, second) of lists; row order is preserved
    within each list.
    '''
    chosen, remainder = [], []
    for item in data:
        # one uniform draw from [0.0, 1.0) per row decides its bucket
        if random.random() < perc:
            chosen.append(item)
        else:
            remainder.append(item)
    return chosen, remainder

In [4]:
def train_test_split(x, y, test_pct):
    '''Randomly split paired inputs x and outputs y into train and test sets.

    x and y are paired element by element with zip (so any surplus items in
    the longer one are dropped), and each pair is sent to the test set with
    probability test_pct, otherwise to the training set. Pairs stay together,
    so x_train[i] still corresponds to y_train[i].

    Returns (x_train, x_test, y_train, y_test), each a tuple. If a partition
    ends up empty (e.g. test_pct of 0 or 1, or a very small data set) its two
    tuples are empty instead of raising an error.
    '''
    pairs = zip(x, y)                                # pair values
    train, test = split_data(pairs, 1. - test_pct)   # split data into sets of pairs

    # zip(*[]) yields nothing, so unpacking an empty partition into two names
    # would raise ValueError; fall back to a pair of empty tuples instead.
    x_train, y_train = zip(*train) if train else ((), ())
    x_test, y_test = zip(*test) if test else ((), ())

    return x_train, x_test, y_train, y_test

In [5]:
def accuracy(tp, fp, fn, tn):
    '''Fraction of all predictions that were correct.

    tp, fp, fn, tn are the counts of true positives, false positives,
    false negatives and true negatives. Raises ZeroDivisionError when
    all four counts are zero.
    '''
    return (tp + tn) / (tp + fp + fn + tn)

def precision(tp, fp, fn, tn):
    '''Fraction of positive predictions that were actually positive: tp / (tp + fp).

    fn and tn are accepted so all metrics share one signature; they are not used.
    Returns 0.0 when there were no positive predictions (tp + fp == 0)
    rather than dividing by zero.
    '''
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # precision is undefined here; report 0.0 by convention
        return 0.0
    return tp / predicted_positive

def recall(tp, fp, fn, tn):
    '''Fraction of actual positives that were identified: tp / (tp + fn).

    fp and tn are accepted so all metrics share one signature; they are not used.
    Returns 0.0 when there were no actual positives (tp + fn == 0)
    rather than dividing by zero.
    '''
    actual_positive = tp + fn
    if actual_positive == 0:
        # recall is undefined here; report 0.0 by convention
        return 0.0
    return tp / actual_positive

def f1_score(tp, fp, fn, tn):
    '''Harmonic mean of precision and recall: 2*p*r / (p + r).

    Returns 0.0 when precision and recall are both zero (for example
    tp == 0 with some false positives and false negatives) rather than
    dividing by zero.
    '''
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    if p + r == 0:
        # harmonic mean is undefined here; report 0.0 by convention
        return 0.0
    return 2. * p * r / (p + r)

In [1]:
def total_sum_of_squares(y):
    '''Sum of the squares of the values that de_mean(y) returns.

    NOTE(review): de_mean comes from the star import above; presumably it
    subtracts the mean of y from every element - confirm against that module.
    '''
    deviations = de_mean(y)
    return sum(dev ** 2 for dev in deviations)

def r_squared(alpha, beta, x, y):
    '''coefficient of determination or the fraction of variation in y captured by model'''
    r2 = 1.0-(sum_of_squared_errors(alpha, beta, x, y)/total_sum_of_squares(y))