# Machine Learning

In [1]:
import random
from typing import TypeVar, List, Tuple

In [2]:
X = TypeVar('X')  # generic type to represent a data point

def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly split data into two lists of fractions [prob, 1 - prob].

    The input list is left untouched: a shallow copy is shuffled and then cut.
    The first returned list holds int(len(data) * prob) elements and the
    second holds the remainder.

    Raises:
        ValueError: if prob is not between 0 and 1 (inclusive).
    """
    # A negative prob would turn the cutoff into a from-the-end slice index,
    # and a prob above 1 would silently put every element in the first list,
    # so reject both instead of returning a misleading split.
    if not 0 <= prob <= 1:
        raise ValueError(f"prob must be between 0 and 1, got {prob}")

    shuffled = data[:]                    # Copy, because shuffle works in place
    random.shuffle(shuffled)
    cut = int(len(shuffled) * prob)       # Index where the first part ends
    return shuffled[:cut], shuffled[cut:]

data = list(range(1000))
train, test = split_data(data, 0.75)

# A 75/25 split of 1000 points must yield 750 and 250 elements
assert (len(train), len(test)) == (750, 250)

# Together the two parts must hold exactly the original elements
assert sorted(train + test) == data

In [3]:
Y = TypeVar('Y')  # generic type to represent output variables

def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    """Split paired inputs xs and outputs ys into training and test sets.

    The split is done on indices, so xs[i] and ys[i] always land in the same
    set. About test_pct of the points go to the test set.

    Returns:
        A tuple (x_train, x_test, y_train, y_test).

    Raises:
        ValueError: if xs and ys do not have the same length.
    """
    # Without this check a longer ys would be silently truncated and a shorter
    # one would fail later with an opaque IndexError.
    if len(xs) != len(ys):
        raise ValueError(
            f"xs and ys must have the same length, got {len(xs)} and {len(ys)}")

    # Generate the indices and split them.
    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)

    return ([xs[i] for i in train_idxs],  # x_train
            [xs[i] for i in test_idxs],   # x_test
            [ys[i] for i in train_idxs],  # y_train
            [ys[i] for i in test_idxs])   # y_test

We want to make sure our code works correctly:

In [7]:
xs = list(range(1000))        # xs are 0 ... 999
ys = [x * 2 for x in xs]      # each y_i is twice x_i
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)

# A 25% test split of 1000 points leaves 750 for training and 250 for testing
assert (len(x_train), len(y_train)) == (750, 750)
assert (len(x_test), len(y_test)) == (250, 250)

# Each x must still be paired with its own y after the split
assert all(2 * x == y for x, y in zip(x_train, y_train))
assert all(2 * x == y for x, y in zip(x_test, y_test))

After which we can do something like

In [None]:
# Illustrative sketch only: SomeKindOfModel is not defined in the code shown
# here, so this cell is not expected to run as-is.
model = SomeKindOfModel()
# Hold out 33% of the data as the test set (test_pct=0.33).
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.33)
# Presumably fits the model on the training portion only - confirm against the
# actual model class used.
model.train(x_train, y_train)
# Presumably scores the model on the held-out portion it has not seen.
performance = model.test(x_test, y_test)

## Correctness

Given a set of labeled data and such a predictive model, every data point lies in one of four categories:
    
* True positive
   - “This message is spam, and we correctly predicted spam.”
    
* False positive (Type 1 error)
   - “This message is not spam, but we predicted spam.”
    
* False negative (Type 2 error)
   - “This message is spam, but we predicted not spam.”
    
* True negative
   - “This message is not spam, and we correctly predicted not spam.”


**Accuracy** is defined as the fraction of correct predictions:

In [9]:
def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of all predictions that were correct.

    Correct predictions are the true positives plus the true negatives;
    the denominator is the sum of all four counts.
    """
    return (tp + tn) / (tp + fp + fn + tn)

assert accuracy(70, 4930, 13930, 981070) == 0.98114

It’s common to look at the combination of **precision** and **recall**. **Precision** measures how accurate our **positive** predictions were.

In [10]:
def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of positive predictions that were actually positive.

    Only tp and fp are used; fn and tn are accepted so that all the metric
    functions share the same signature.
    """
    predicted_positives = tp + fp
    return tp / predicted_positives

assert precision(70, 4930, 13930, 981070) == 0.014

**Recall** measures what fraction of the actual positives our model identified.

In [11]:
def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of actual positives that were predicted positive.

    Only tp and fn are used; fp and tn are accepted so that all the metric
    functions share the same signature.
    """
    actual_positives = tp + fn
    return tp / actual_positives

assert recall(70, 4930, 13930, 981070) == 0.005

Sometimes precision and recall are combined into the **F1 score**, which is defined as the harmonic mean of precision and recall:

In [12]:
def f1_score(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the F1 score, the harmonic mean of precision and recall.

    Returns 0.0 when precision and recall are both zero (for example when
    there are no true positives but some false positives and false negatives),
    where the harmonic-mean formula would otherwise divide by zero.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)

    # Both terms zero means the model found no true positives; the score is 0
    # rather than an undefined 0 / 0.
    if p + r == 0:
        return 0.0

    return 2 * p * r / (p + r)