<a href="https://colab.research.google.com/github/gheenie/ml-applied-ml-with-NNs/blob/main/lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import csv

from google.colab import drive
import numpy
from numpy import loadtxt
# from urllib import urlopen
from pandas import read_csv
from pandas import set_option
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.tree import export_graphviz
from subprocess import check_call
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline


In [None]:
# Mount Google Drive at this location
drive.mount('/content/drive')

# Change working dir.
# An absolute path keeps this cell re-runnable: the previous relative
# 'drive/Colab Notebooks' only resolved while cwd was still the directory
# containing 'drive', so a second run raised FileNotFoundError.
# NOTE(review): assumes the notebook starts in /content, and that the folder
# sits directly under the mount point (Drive contents are often under
# '/content/drive/MyDrive') — confirm the path exists.
os.chdir('/content/drive/Colab Notebooks')


# Load data


In [None]:
# Load CSV Using Python Standard Library

filename = 'pima-indians-diabetes.data.csv'
# The with-block closes the file once parsing is done; the previous bare
# open() left the handle open. newline='' is what the csv module recommends
# for files handed to csv.reader.
with open(filename, 'rt', newline='') as raw_data:
    reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
    x = list(reader)
# Every field is parsed as a string, so convert the whole table to floats
data = numpy.array(x).astype('float')
print(data.shape)


In [None]:
# Load CSV using NumPy

filename = 'pima-indians-diabetes.data.csv'
# Close the file as soon as loadtxt has consumed it (the handle used to leak)
with open(filename, 'rt') as raw_data:
    data = loadtxt(raw_data, delimiter=",")
print(data.shape)


In [None]:
# Load CSV from URL using NumPy

# urlopen was never imported (the top-of-notebook import is commented out and
# uses the Python 2 location); on Python 3 it lives in urllib.request.
from urllib.request import urlopen

# Placeholder address: substitute the real location of the CSV
url = 'https://goo.gl/XXXXX'
# The response is a context manager, so the connection is released afterwards
with urlopen(url) as raw_data:
    dataset = loadtxt(raw_data, delimiter=",")
print(dataset.shape)


In [5]:
# Load CSV using Pandas

# Column labels are passed explicitly through names=
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
filename = 'pima-indians-diabetes.data.csv'
data = read_csv(filename, names=names)
print(data.shape)


(768, 9)


In [None]:
# Load CSV using Pandas from URL

# Column labels are passed explicitly through names=
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
# Placeholder address: substitute the real location of the CSV
url = 'https://goo.gl/XXXXXXX'
data = read_csv(url, names=names)
print(data.shape)


# Descriptive statistics


In [None]:
# Inspect data

# First 20 rows
print(data.head(20))
# Data type of each column
print(data.dtypes)

# Data dimensions: too many or too few rows or features?
print(data.shape)

# Descriptive stats, printed with a wider display and 3 digits of precision
set_option('display.width', 100)
set_option('display.precision', 3)
print(data.describe())

# Class distribution (for classification problems); ideally balanced
print(data.groupby('class').size())

# Pairwise Pearson correlations; highly correlated pairs are undesirable
print(data.corr(method='pearson'))

# Skew of the univariate distribution of each attribute
print(data.skew())


# Visualisation


In [None]:
# Univariate plots
# Understand each attribute of your dataset independently

# Univariate histograms
# Get an idea of distributions.
# From the shape of the bins you can quickly get a
# feeling for whether an attribute is Gaussian, skewed or even has an
# exponential distribution. It can also help you see possible outliers.

data.hist()
pyplot.show()

# Univariate density plots
# Get an idea of distributions

data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
pyplot.show()

# Box and whisker plots
# Get an idea of distributions.
# Candidate outliers are the points drawn beyond the whiskers, i.e. more than
# 1.5 times the interquartile range (the spread of the middle 50% of the
# data) away from the box

data.plot(kind='box', subplots=True, layout=(3, 3), sharex=False, sharey=False)
pyplot.show()

# Multivariate plots
# Show the interactions between multiple variables in your dataset

# Correlation matrix plot
# Some machine learning algorithms like linear and logistic regression can have
# poor performance if there are highly correlated input variables in your data

correlations = data.corr()
# Plotting
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# Label the axes from the correlation frame itself instead of a hard-coded
# count of 9 and the global `names` list: the ticks then always match the
# matrix, whatever `names` currently holds.
labels = list(correlations.columns)
ticks = numpy.arange(len(labels))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
pyplot.show()

# Scatter plot matrix
# Look at the pairwise relationships from different perspectives

scatter_matrix(data, figsize=[20, 20])
pyplot.show()

# Data preparation


A difficulty is that different algorithms make different assumptions about your data and may require different transforms. Sometimes algorithms can deliver better results without pre-processing.

Generally, I would recommend creating many different views and transforms of your data, then exercise a handful of algorithms on each view of your dataset. This will help you to flush out which data transforms might be better at exposing the structure of your problem in general.

Two standard idioms for transforming data: fit and multiple transform; combined fit-and-transform.


In [7]:
# Inputs are the first eight columns, the output is the ninth
array = data.values
X, Y = array[:, :8], array[:, 8]


Rescale data

Useful for optimisation algos (used in the core of ML algos) like gradient descent, algos that weight inputs like regression and neural networks, and algos that use distance measures like k-Nearest Neighbors.


In [None]:
# Map every attribute onto the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaledX = scaler.transform(X)
# Show the first five transformed rows with 3 digits of precision
set_printoptions(precision=3)
print(rescaledX[:5, :])


Standardise data

Suitable for techniques that assume a Gaussian distribution in the input variables and work better with rescaled data, such as linear regression, logistic regression, and linear discriminant analysis.


In [None]:
# Standardise each attribute with StandardScaler
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)
# Show the first five transformed rows with 3 digits of precision
set_printoptions(precision=3)
print(rescaledX[:5, :])


Normalise data

Useful for sparse datasets with attributes of varying scales when using algorithms that weight input values such as neural networks and use distance measures such as k-Nearest Neighbors.

In [None]:
# Normalise each observation (row) with Normalizer
scaler = Normalizer()
normalizedX = scaler.fit_transform(X)
# Show the first five transformed rows with 3 digits of precision
set_printoptions(precision=3)
print(normalizedX[:5, :])


Binarise data

Useful when you have probabilities that you want to make crisp values. It is also useful when feature engineering and you want to add new features that indicate something meaningful

In [None]:
# Threshold every value at 0.0 with Binarizer
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)
# Show the first five transformed rows with 3 digits of precision
set_printoptions(precision=3)
print(binaryX[:5, :])


In [None]:
# Plot each attribute and see the changes

import seaborn as sns

data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False, figsize=[10, 10])
pyplot.show()

# Overlay the distributions of the transformed attributes. rescaledX holds
# the result of whichever rescale/standardise cell ran last.
# NOTE(review): column 0 is not plotted, as in the original cell — confirm
# that is intended. distplot is deprecated in recent seaborn releases;
# histplot/kdeplot are the documented replacements.
for column in range(1, 8):
    sns.distplot(rescaledX[:, column])

# Decision tree classification on raw vs normalised data

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# shuffle must be a bool, and random_state only takes effect when shuffling;
# the previous shuffle=None is rejected by scikit-learn. shuffle=True matches
# every other KFold in this notebook.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = DecisionTreeClassifier()
results = cross_val_score(model, X, Y, cv=kfold)
print("Mean estimated accuracy \n", results.mean())
results2 = cross_val_score(model, normalizedX, Y, cv=kfold)
print("Mean estimated accuracy on normalised data \n", results2.mean())

# Feature selection


Having irrelevant features in your data can decrease the accuracy of many models, especially linear algorithms like linear and logistic regression. Benefits: reduces overfitting; improves accuracy; reduces training time.


In [None]:
# Inputs are the first eight columns, the output is the ninth
array = data.values
X, Y = array[:, :8], array[:, 8]


Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.


In [None]:
# Keep the 4 features that score highest on the chi-squared test
selector = SelectKBest(score_func=chi2, k=4).fit(X, Y)
# Per-feature scores
set_printoptions(precision=3)
print(selector.scores_)
# First five rows of the selected features
features = selector.transform(X)
print(features[:5, :])


Recursive feature elimination

The choice of algorithm does not matter too much as long as it is skillful and consistent.


In [None]:
# Recursively eliminate features down to 3, ranked by logistic regression
model = LogisticRegression(solver='liblinear')
rfe = RFE(estimator=model, n_features_to_select=3)
rfe.fit(X, Y)
print("Num Features: %d" % rfe.n_features_)
print("Selected Features: %s" % rfe.support_)
print("Feature Ranking: %s" % rfe.ranking_)

PCA


In [None]:
# Project the inputs onto 3 principal components
pca = PCA(n_components=3)
pca.fit(X)
# Share of variance explained by each component, then the components
print("Explained Variance: %s" % pca.explained_variance_ratio_)
print(pca.components_)

Feature importance

Bagged decision trees like Random Forest and Extra Trees can be used to estimate the importance of features

In [None]:
# Fit an extra-trees ensemble and print its per-feature importance scores
model = ExtraTreesClassifier().fit(X, Y)
print(model.feature_importances_)


# Resampling


You need to know how well your algorithms perform on unseen data.

The best way to evaluate the performance of an algorithm would be to make predictions for new data to which you already know the answers. The second-best way is to use clever techniques from statistics called resampling methods that allow you to make accurate estimates for how well your algorithm will perform on new data.

Generally k-fold cross-validation is the gold standard for evaluating the performance of a machine learning algorithm on unseen data with k set to 3, 5, or 10.

Using a train/test split is good for speed when using a slow algorithm and produces performance estimates with lower bias when using large datasets.

Techniques like leave-one-out cross-validation and repeated random splits can be useful intermediates when trying to balance variance in the estimated performance, model training speed, and dataset size.

Experiment and find a technique for your problem that is fast and produces reasonable estimates of performance that you can use to make decisions. If in doubt, use 10-fold cross-validation.


In [None]:
# Inputs are the first eight columns, the output is the ninth
array = data.values
X, Y = array[:, :8], array[:, 8]


Split into train and test sets

Hyperparameters: ratio of train:test set

Very fast, hence useful to use this approach when the algorithm you are investigating is slow to train. Ideal for large datasets (millions of records) where there is strong evidence that both splits of the data are representative of the underlying problem. Produces performance estimates with lower bias when using large datasets.

A downside of this technique is that it can have a high variance. This means that differences in the training and test dataset can result in meaningful differences in the estimate of accuracy.


In [None]:
# Hold out 33% of the rows for testing; the seed fixes the split
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
# Fit on the training part, score on the held-out part
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))


K-fold cross validation

Hyperparameters: Choice of k must allow the size of each test partition to be large enough to be a reasonable sample of the problem, whilst allowing enough repetitions of the train-test evaluation of the algorithm to provide a fair estimate of the algorithm's performance on unseen data. For modest sized datasets in the thousands or tens of thousands of records, k values of 3, 5, and 10 are common.

Estimate the performance of a machine learning algorithm with less variance than a single train-test set split. The result is a more reliable estimate of the performance of the algorithm on new data.


In [None]:
# 10-fold cross-validation with a seeded shuffle
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=kfold)
# Mean and standard deviation of the fold scores, as percentages
mean_pct = results.mean()*100.0
std_pct = results.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

Leave one out cross validation

The result is a large number of performance measures that can be summarised in an effort to give a more reasonable estimate of the accuracy of your model on unseen data.

A downside is that it can be a computationally more expensive procedure than k-fold cross-validation.


In [None]:
# Leave-one-out: one evaluation per observation
loocv = LeaveOneOut()
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=loocv)
# Mean and standard deviation of the per-split scores, as percentages
mean_pct = results.mean()*100.0
std_pct = results.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))


Repeated random test-train splits

This has the speed of using a train/test split and the reduction in variance in the estimated performance of k-fold cross-validation. You can also repeat the process many more times as needed to improve the accuracy of the estimate.

A downside is that repetitions may include much of the same data in the train or the test split from run to run, introducing redundancy into the evaluation.


In [None]:
# Ten independent random 67/33 train/test splits
n_splits = 10
test_size = 0.33
seed = 7
kfold = ShuffleSplit(n_splits=n_splits, random_state=seed, test_size=test_size)
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=kfold)
# Mean and standard deviation of the split scores, as percentages
mean_pct = results.mean()*100.0
std_pct = results.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))


# Evaluation metrics


In [None]:
# Inputs are the first eight columns, the output is the ninth
array = data.values
X, Y = array[:, :8], array[:, 8]


Classification metrics

Demonstrated with logistic regression and 10-fold cross-validation.


Classification accuracy

Most common but most misused. It is really only suitable when there is an equal number of observations in each class (which is rarely the case) and when all predictions and prediction errors are equally important, which is often not the case.


In [None]:
# Cross-validated classification accuracy
scoring = 'accuracy'
model = LogisticRegression(solver='liblinear')
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, scoring=scoring, cv=kfold)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))


Logarithmic loss


In [None]:
# Cross-validated logarithmic loss, scored with the 'neg_log_loss' scorer
scoring = 'neg_log_loss'
model = LogisticRegression(solver='liblinear')
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, scoring=scoring, cv=kfold)
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))


Area under ROC curve


In [None]:
# Cross-validated area under the ROC curve
scoring = 'roc_auc'
model = LogisticRegression(solver='liblinear')
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, scoring=scoring, cv=kfold)
print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))

# Single 90/10 split, used to draw an actual ROC curve
trainX, testX, trainy, testy = train_test_split(
    X, Y, test_size=0.1, random_state=2
)
model = LogisticRegression(solver='liblinear').fit(trainX, trainy)
# Predicted probability of the positive class only (second column)
probs = model.predict_proba(testX)[:, 1]
auc = roc_auc_score(testy, probs)
print('AUC: %.3f' % auc)
fpr, tpr, thresholds = roc_curve(testy, probs)
# Dashed diagonal is the no-skill reference; dotted line is the model
pyplot.plot([0, 1], [0, 1], linestyle='--')
pyplot.plot(fpr, tpr, marker='.')
pyplot.show()


Confusion matrix

A handy presentation of the accuracy of a model with two or more classes.


In [None]:
# Hold out 33% of the rows for testing; the seed fixes the split
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
predicted = model.predict(X_test)
# Confusion matrix of true labels against predictions
matrix = confusion_matrix(Y_test, predicted)
print(matrix)


Classification report


In [None]:
# Hold out 33% of the rows for testing; the seed fixes the split
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
predicted = model.predict(X_test)
# Text summary of the classification metrics on the held-out rows
report = classification_report(Y_test, predicted)
print(report)


# Algo shortlisting


You cannot know which algorithm will work best on your dataset beforehand. You must use trial and error to discover a shortlist of algorithms that do well on your problem that you can then double down on and tune further. The question is not: What algorithm should I use on my dataset? Instead it is: What algorithms should I spot-check on my dataset? You can guess at what algorithms might do well on your dataset, and this can be a good starting point. I recommend trying a mixture of algorithms to see which are good at picking out the structure in your data. Some suggestions when spot-checking algorithms on your dataset:

- Try a mixture of algorithm representations (e.g. instances and trees).
- Try a mixture of learning algorithms (e.g. different algorithms for learning the same type of representation).
- Try a mixture of modelling types (e.g. linear and nonlinear functions or parametric and nonparametric).


In [None]:
# Inputs are the first eight columns, the output is the ninth
array = data.values
X, Y = array[:, :8], array[:, 8]


Logistic regression

Assumes a Gaussian distribution for the numeric input variables and can model binary classification problems.


In [None]:
# Spot-check logistic regression with seeded, shuffled 10-fold CV.
# num_folds now drives KFold; it was assigned but ignored in favour of a
# duplicated literal.
num_folds = 10
kfold = KFold(n_splits=num_folds, random_state=7, shuffle=True)
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


K-nearest neighbours

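Uses a distance metric to find the k most similar instances in the training data for a new instance and combines the outcomes of those neighbors into the prediction: the mean outcome for regression, the most common class for classification (as with the classifier used here).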


In [None]:
# Spot-check k-nearest neighbours with seeded, shuffled 10-fold CV.
# num_folds now drives KFold; it was assigned but ignored in favour of a
# duplicated literal.
num_folds = 10
kfold = KFold(n_splits=num_folds, random_state=7, shuffle=True)
model = KNeighborsClassifier()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


Support vector machines

Of particular importance is the use of different kernel functions via the kernel parameter. A powerful Radial Basis Function is used by default.


In [None]:
# Spot-check a support vector classifier with seeded, shuffled 10-fold CV
model = SVC()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


Classification and regression trees ie decision trees

Decision-tree learners can create over-complex trees that do not generalise the data well, i.e. overfitting. Mechanisms such as pruning (available in recent scikit-learn releases as cost-complexity pruning through the `ccp_alpha` parameter; not supported in older releases), setting the minimum number of samples required at a leaf node, or setting the maximum depth of the tree are necessary to avoid this problem.


In [None]:
def run_decision_tree(model, dpi=None):
    """Fit a tree on a 90/10 split, print its report and render the tree.

    Reads the notebook globals ``X``, ``Y`` and ``data``. Writes
    ``pima_tree.dot`` and ``pima_tree.png`` to the working directory and
    displays the PNG with matplotlib.

    Args:
        model: Unfitted tree estimator; it is fitted in place and passed to
            ``export_graphviz``.
        dpi: Optional resolution forwarded to Graphviz as ``-Gdpi=<dpi>``.
            ``None`` (the default) leaves the Graphviz default untouched.

    Raises:
        subprocess.CalledProcessError: If the ``dot`` command exits non-zero.
    """
    test_size = 0.1
    seed = 7
    # split into train/test sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )
    # fit a model and report on the held-out rows
    model.fit(X_train, Y_train)
    predicted = model.predict(X_test)
    print(classification_report(Y_test, predicted))

    export_graphviz(
        model,
        out_file='pima_tree.dot',
        # Take the labels from the frame rather than the mutable global
        # `names` list, which other cells are free to re-bind.
        feature_names=list(data.columns[:8]),
        rounded=True,
        filled=True,
    )

    # Convert .dot to .png with one Graphviz call. The old extra `!dot` shell
    # line repeated this conversion, only worked under IPython, and carried a
    # malformed '-Gdpi-600' flag (the attribute syntax is -Gname=value).
    dot_command = ['dot', '-Tpng', 'pima_tree.dot', '-o', 'pima_tree.png']
    if dpi is not None:
        dot_command.append('-Gdpi=%d' % dpi)
    check_call(dot_command)

    # display in python
    pyplot.figure(figsize=(14, 18))
    pyplot.imshow(pyplot.imread('pima_tree.png'))
    pyplot.axis('off')
    pyplot.show()


run_decision_tree(DecisionTreeClassifier())
# With pre-pruning: cap the depth and require 10 samples to split a node
run_decision_tree(DecisionTreeClassifier(max_depth=3, min_samples_split=10))


# Algo comparing


You often end up with multiple good models to choose from. Each model will have different performance characteristics.

Time to bring in previous sections. Use resampling to estimate how accurate each model may be on unseen data. Also use a number of different ways of looking at the estimated accuracy (You visualised new data using different techniques in order to look at the data from different perspectives. Now the same idea applies to model selection.). Then choose one or two best models from the suite of models that you have created.

The key to a fair comparison of machine learning algorithms is ensuring that each algorithm is evaluated in the same way on the same data. You can achieve this by forcing each algorithm to be evaluated on a consistent test harness and configuring with the same random seed.

A way to do this is to use visualisation methods to show the average accuracy, variance, and other properties of the distribution of model accuracies.


In [None]:
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# prepare models as (label, estimator) pairs
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]
# evaluate each model in turn
results = []
# Kept separate from the global `names` (the data column labels): re-binding
# `names` here used to break every cell that labels features with it.
model_names = []
scoring = 'accuracy'
# One seeded splitter yields the same folds for every model, so it is built
# once rather than on each iteration.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    model_names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(model_names)
pyplot.show()


# Pipelining


In [6]:
# Inputs are the first eight columns, the output is the ninth
array = data.values
X, Y = array[:, :8], array[:, 8]


In [7]:
# Each step is a (label, instance) tuple: standardise the inputs first,
# then fit linear discriminant analysis on them.
estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(estimators)

# Evaluate the whole pipeline with seeded, shuffled 10-fold CV.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7669685577580315


# Teardown


In [None]:
# Flush and unmount the Google Drive mounted at the start of the notebook
drive.flush_and_unmount()