# Intro to Machine Learning

In [None]:
!pip install pandas seaborn kaggle statsmodels

In [None]:
import os
import umap

from zipfile import ZipFile

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import fetch_openml

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC as BlackBoxClassifier
from sklearn.cluster import DBSCAN as BlackBoxClustering
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import statsmodels.api as sm
BlackBoxForecasting = sm.tsa.statespace.SARIMAX

import matplotlib.pyplot as plt
from pylab import rcParams

import pandas as pd
import numpy as np
import seaborn as sns

# Print NumPy floats with two digits of precision in all later cell output.
np.set_printoptions(precision=2)

# Credentials must not be committed to the notebook. The Kaggle CLI picks
# them up from the KAGGLE_USERNAME / KAGGLE_KEY environment variables or from
# ~/.kaggle/kaggle.json, so provide them through one of those channels.
# NOTE(review): the API key that used to be hard-coded here is exposed in the
# file history and should be revoked from the Kaggle account settings.
kaggle_config_path = os.path.join(os.path.expanduser("~"), ".kaggle", "kaggle.json")
has_env_credentials = bool(
    os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY")
)
if not (has_env_credentials or os.path.exists(kaggle_config_path)):
    # Warn early instead of failing later inside the `!kaggle` download cell.
    print("Kaggle credentials not found: set KAGGLE_USERNAME and KAGGLE_KEY "
          "or create ~/.kaggle/kaggle.json before downloading datasets.")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix and draw it as an annotated heatmap.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        Ground-truth and predicted labels. Each label is used as an index
        into `classes`.
    classes : sequence
        Display name for every label, indexed by label value.
    normalize : bool
        When True, every row is divided by its row total.
    title : str or None
        Axes title; a default depending on `normalize` is used when falsy.
    cmap : matplotlib colormap
        Colormap passed to the heatmap.

    Returns
    -------
    The matplotlib axes the heatmap was drawn on.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Only use the labels that appear in the data, and pass them explicitly
    # so the matrix rows/columns and the tick labels share one ordering.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    classes = np.asarray(classes)[labels]

    if normalize:
        # A label that only occurs in y_pred has an all-zero row; leave that
        # row at 0 instead of dividing by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    # Integer format for raw counts, two decimals for normalized rates.
    sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
                ax=ax, cmap=cmap)

    # Heatmap cells are centred on half-integers; one tick per class.
    tick_positions = np.arange(len(classes)) + 0.5
    ax.set(xticks=tick_positions,
           yticks=tick_positions,
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    fig.tight_layout()
    return ax

## What is ML?

[[Image Placeholder]]

A popular definition for machine learning, given by [Mitchell 1997](http://www.cs.cmu.edu/afs/cs.cmu.edu/user/mitchell/ftp/mlbook.html), is

> A computer program is said to learn from experience, E, with respect to some class of tasks, T, and performance measure, P, if its performance at tasks in T, as measured by P, improves with experience E.

## Experience

Typically, the experience a machine learning algorithm encounters during learning is in the form of a dataset, or exposure to a dataset (or subset thereof). A dataset is a collection of examples, each example comprising a set of features that have been quantitatively measured from some object or event. We typically represent an example as a vector $x \in \mathbb{R}^n$, where each entry of the vector is a feature. Broadly speaking, experiences are often categorised as either **unsupervised** or **supervised**.

### Supervised

Supervised learning algorithms experience a dataset containing features, but each example is also associated with a **label** or **target**. Supervised learning involves observing several examples of a random vector $x$ and an associated value or vector $y$, then learning to predict $y$ from $x$, usually by estimating $p(y|x)$ [(Goodfellow, Bengio, & Courville, 2016)](#References).

Let's take a look at an example of a supervised experience.

In [None]:
# Load scikit-learn's bundled breast-cancer dataset.
breast_cancer_data = load_breast_cancer()

# Build one frame holding every feature column plus the label as a final
# 'target' column.
column_names = breast_cancer_data['feature_names'].tolist()
column_names.append('target')
breast_cancer_df = pd.DataFrame(
    data=np.column_stack((breast_cancer_data['data'],
                          breast_cancer_data['target'])),
    columns=column_names,
)
breast_cancer_df.head()

In [None]:
# Summary statistics for each column of the breast-cancer frame.
breast_cancer_df.describe()

In [None]:
# Left panel: how many samples carry each target label.
# Right panel: UMAP embedding of the features, coloured by target.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

# `sns.distplot` is deprecated, and a density estimate is not meaningful for
# a discrete label, so show per-class counts instead.
sns.countplot(x=breast_cancer_df['target'], ax=axes[0])

trans = umap.UMAP(n_neighbors=5, random_state=42).fit(breast_cancer_data['data'])

axes[1].scatter(trans.embedding_[:, 0], trans.embedding_[:, 1],
                s=50, c=breast_cancer_data['target'], cmap='Spectral')

fig.tight_layout()

In [None]:
# Hold out 20% of the samples for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data['data'],
    breast_cancer_data['target'],
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator, so construction and training are chained.
clf = BlackBoxClassifier(gamma="scale").fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy_pct = accuracy_score(y_pred, y_test) * 100
print("Classification accuracy: {:.2f}%".format(accuracy_pct))

In [None]:
# Plot the row-normalized confusion matrix of the test-set predictions,
# labelling the axes with the dataset's class names.
plot_confusion_matrix(y_test, y_pred, classes=breast_cancer_data['target_names'], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

### (Self?/) Unsupervised

Unsupervised learning algorithms experience a dataset containing many features, then learn useful properties of the structure of this dataset. Unsupervised learning involves observing several examples of a random vector, $x$, and attempting to implicitly or explicitly learn the probability distribution $p(x)$, or some interesting properties of that distribution [(Goodfellow, Bengio, & Courville, 2016)](#References).

Let's dive in!

In [None]:
# Fetch the 'higgs' dataset from OpenML and combine features and target into
# a single frame with the label in a final 'target' column.
# NOTE(review): the `== 1` filter below assumes the target column holds
# numeric 1s; if OpenML delivers string labels it would select nothing —
# TODO confirm the returned dtype.
higgs_data = fetch_openml(name='higgs')
higgs_df = pd.DataFrame(data=np.c_[higgs_data['data'], higgs_data['target']],
                               columns=higgs_data['feature_names'] + ['target'])
# Drop incomplete rows, keep only rows whose target equals 1, then take a
# reproducible 20% sample of what remains.
higgs_df.dropna(inplace=True)
higgs_df = higgs_df[higgs_df["target"]==1]
higgs_df = higgs_df.sample(frac=0.2, random_state=42)
higgs_df.head()

In [None]:
# Summary statistics of the sampled Higgs frame.
higgs_df.describe()

In [None]:
# Fit UMAP on the Higgs features (the 'target' column is excluded) and
# scatter the first two embedding coordinates.
higgs_features = higgs_df.drop(labels='target', axis=1).values
trans = umap.UMAP(n_neighbors=5, random_state=42).fit(higgs_features)

emb_x = trans.embedding_[:, 0]
emb_y = trans.embedding_[:, 1]

plt.figure(figsize=(20,10))
plt.scatter(emb_x, emb_y, s=50)

In [None]:
# Cluster the Higgs features (without the 'target' column); `fit` returns
# the estimator, so construction and fitting are chained.
cluster_input = higgs_df.drop(labels='target', axis=1).values
clst = BlackBoxClustering(eps=4, min_samples=3).fit(cluster_input)
y_pred = clst.labels_

# Show which distinct cluster labels were assigned.
print(np.unique(y_pred))

# Re-use the UMAP embedding computed earlier, colouring points by cluster.
plt.figure(figsize=(20,10))
plt.scatter(
    trans.embedding_[:, 0],
    trans.embedding_[:, 1],
    s=50,
    c=y_pred,
    cmap='Spectral',
)
plt.colorbar()

## Task

Many kinds of tasks can be solved with machine learning. Some of the most common machine learning tasks include the following:

- **Classification**: the computer program is asked to specify which of $k$ categories some input belongs to. To solve this task, the learning algorithm is usually asked to produce a function $f:\mathbb{R}^n \to \{1, \dots, k\}$. When $y=f(\mathbf{x})$, the model assigns an input described by vector $\mathbf{x}$ to a category identified by numeric code $y$. There are other variants of the classification task, for example, where $f$ outputs a probability distribution over classes. An example of a classification task is object recognition, where the input is an image (usually described as a set of pixel brightness values), and the output is a numeric code identifying the object in the image.
- **Regression**: the computer program is asked to predict a numerical value given some input. To solve this task, the learning algorithm is asked to output a function $f:\mathbb{R}^n\to \mathbb{R}$. This type of task is similar to classification, except that the format of output is different. An example of a regression task is the prediction of the expected claim amount that an insured person will make (used to set insurance premiums), or the prediction of future prices of securities. These kinds of predictions are also used for algorithmic trading.
- **Clustering**: the assignment of a set of observations into subsets (called clusters) so that observations in the same cluster are similar in some sense.

Okay so let's consider these tasks in the context of a new dataset.

In [None]:
!kaggle datasets download -d cityofLA/los-angeles-traffic-collision-data -p kaggle_data
with ZipFile('kaggle_data/los-angeles-traffic-collision-data.zip', 'r') as zipObj:
   # Extract all the contents of zip file in different directory
   zipObj.extractall('kaggle_data')
!ls kaggle_data

In [None]:
# Read the collision records from the extracted CSV.
la_df = pd.read_csv("kaggle_data/traffic-collision-data-from-2010-to-present.csv")

# Parse both date columns into datetimes.
for date_col in ('Date Reported', 'Date Occurred'):
    la_df[date_col] = pd.to_datetime(la_df[date_col])

# Remove the two columns that are not used further, then drop every row
# with a missing value.
la_df.drop(labels=["DR Number", "Crime Code Description"], axis=1, inplace=True)
la_df.dropna(inplace=True)
la_df.head()

In [None]:
# Summary statistics of the cleaned collision frame.
la_df.describe()

### Classification

In [None]:
# Rows whose 'MO Codes' entry consists of fewer than two
# whitespace-separated codes.
is_single_code = la_df['MO Codes'].str.split().str.len() < 2
single_mo_df = la_df[is_single_code]

# Keep only the codes that occur more than 100 times among those rows.
gb_df = single_mo_df.groupby("MO Codes")['Date Reported'].count()
mo_codes = gb_df[gb_df > 100].index
clf_df = la_df[la_df['MO Codes'].isin(mo_codes)].copy()

# The MO code is the classification target; every remaining column except
# 'Location' is a feature.
target = clf_df.pop('MO Codes')
features_df = clf_df.drop(labels="Location", axis=1)

In [None]:
# Inspect the column dtypes of the cleaned collision frame.
la_df.dtypes

In [None]:
# A single encoder is refitted per column; only the transformed values are
# kept, so no fitted state needs to survive between columns.
lb_make = LabelEncoder()

# Encode both date columns plus every object-typed column as integer codes.
obj_df = features_df.select_dtypes(include=['object']).copy()
columns_to_encode = ["Date Reported", "Date Occurred", *obj_df.columns]
for col in columns_to_encode:
    features_df[col] = lb_make.fit_transform(features_df[col])

# Encode the target last.
target = lb_make.fit_transform(target)
features_df.head()

In [None]:
# Inspect the column dtypes of the feature frame after label encoding.
features_df.dtypes

In [None]:
# 80/20 train/test split of the encoded collision features, fixed seed.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    features_df.values, target, **split_options
)

# Train the classifier and predict on the held-out portion.
clf = BlackBoxClassifier(gamma="scale")
y_pred = clf.fit(X_train, y_train).predict(X_test)

score = accuracy_score(y_pred, y_test)
print("Classification accuracy: {:.2f}%".format(score*100))

### Regression

In [None]:
# Count records per occurrence date. The result is a one-column frame
# indexed by 'Date Occurred' whose 'Date Reported' column holds the count.
daily_counts = la_df.groupby(['Date Occurred'])["Date Reported"].count()
collisions = daily_counts.to_frame()
collisions

In [None]:
# Resample the per-date counts to month-start ('MS') frequency, taking the
# mean of the counts within each month.
y = collisions['Date Reported'].resample('MS').mean()

In [None]:
# Line plot of the monthly series.
y.plot(figsize=(15, 6))

In [None]:
# Set the default figure size for the decomposition panels.
rcParams.update({'figure.figsize': (18, 8)})

# Additive seasonal decomposition of the monthly series, then plot it.
decomposition = sm.tsa.seasonal_decompose(y, model='additive')
fig = decomposition.plot()
plt.show()

In [None]:
# Model configuration: order (0, 1, 1) with seasonal order (0, 1, 1, 12),
# with the stationarity and invertibility constraints switched off.
sarimax_options = dict(
    order=(0, 1, 1),
    seasonal_order=(0, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
fcst = BlackBoxForecasting(y, **sarimax_options)
results = fcst.fit()

In [None]:
# Non-dynamic predictions from January 2019 onwards, with their
# confidence interval.
forecast_start = pd.to_datetime('2019-01-01')
pred = results.get_prediction(start=forecast_start, dynamic=False)
pred_ci = pred.conf_int()
lower_bound = pred_ci.iloc[:, 0]
upper_bound = pred_ci.iloc[:, 1]

# Observed series from 2017 on, overlaid with the forecast and its interval.
ax = y['2017':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(pred_ci.index, lower_bound, upper_bound, color='k', alpha=.2)
ax.set(xlabel='Date', ylabel='# Collisions')
plt.legend()
plt.show()

### Clustering

### References

[Goodfellow, I., Bengio, Y., & Courville, A. (2016). Deep Learning. MIT Press. Retrieved from https://www.deeplearningbook.org](https://www.deeplearningbook.org)