<a href="https://colab.research.google.com/github/gheenie/msc-applied-ml/blob/main/lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import csv

from google.colab import drive
import numpy
from numpy import loadtxt
# from urllib import urlopen
from pandas import read_csv
from pandas import set_option
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer


In [None]:
# Mount Google Drive into the Colab VM
drive.mount('/content/drive')

# Change working dir so the later cells can open the CSV by bare filename.
# The path is absolute so that re-running this cell is harmless (a relative
# path only resolves while the cwd is still /content), and it goes through
# MyDrive/, which is where the mount exposes the user's own Drive files.
os.chdir('/content/drive/MyDrive/Colab Notebooks')


# Load data

In [None]:
# Load CSV Using Python Standard Library

filename = 'pima-indians-diabetes.data.csv'
# The context manager guarantees the handle is closed even if parsing fails;
# newline='' is the mode the csv module asks for when it is given a file object.
with open(filename, 'rt', newline='') as raw_data:
    reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
    # Materialise every row while the file is still open
    x = list(reader)
# Rows arrive as lists of strings; convert the whole table to floats
data = numpy.array(x).astype('float')
print(data.shape)


In [None]:
# Load CSV using NumPy

filename = 'pima-indians-diabetes.data.csv'
# The context manager closes the handle once loadtxt has consumed the file
with open(filename, 'rt') as raw_data:
    data = loadtxt(raw_data, delimiter=",")
print(data.shape)


In [None]:
# Load CSV from URL using NumPy

# urlopen lives in urllib.request on Python 3. It is imported here because
# the notebook's import cell only has a commented-out Python 2 form, which
# left the name undefined when this cell ran.
from urllib.request import urlopen

# NOTE(review): the URL looks like a placeholder - substitute the real
# dataset location before running.
url = 'https://goo.gl/XXXXX'
# The context manager closes the HTTP response once loadtxt has read it
with urlopen(url) as raw_data:
    dataset = loadtxt(raw_data, delimiter=",")
print(dataset.shape)


In [4]:
# Load CSV using Pandas
# Column labels are supplied explicitly through names= and the resulting
# DataFrame's (rows, columns) shape is printed as a quick sanity check.

filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names)
print(data.shape)


(768, 9)


In [None]:
# Load CSV using Pandas from URL
# Same as the local-file load, except the URL is handed to read_csv in place
# of a path.

url = 'https://goo.gl/XXXXXXX'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
data = read_csv(url, names=names)
print(data.shape)


# Descriptive statistics

In [None]:
# Inspect data

# Preview the first 20 rows
peek = data.head(n=20)
print(peek)
# dtype of every column
types = data.dtypes
print(types)

# Dimensions as (rows, columns)
# Sanity check: are there too many or too few rows or features?

shape = data.shape
print(shape)

# Summary statistics per column
# The display options are set here, after the preview above, so they only
# affect what is printed from this point on.

set_option('display.width', 100)
set_option('display.precision', 3)
description = data.describe()
print(description)

# Number of rows per class label (for classification problems)
# The classes should be reasonably balanced

class_counts = data.groupby(by='class').size()
print(class_counts)

# Pairwise Pearson correlation between all columns
# Highly correlated pairs are undesirable

correlations = data.corr('pearson')
print(correlations)

# Skew of each attribute's univariate distribution

skew = data.skew()
print(skew)


# Visualisation

In [None]:
# Univariate plots
# Look at every attribute of the dataset on its own

# Histograms
# The shape of the bins gives a quick feel for whether an attribute looks
# Gaussian, skewed or exponential, and can reveal possible outliers.

data.hist()
pyplot.show()

# Density plots
# A smoothed view of the same distributions, one subplot per column

data.plot.density(subplots=True, layout=(3, 3), sharex=False)
pyplot.show()

# Box-and-whisker plots
# Another view of the distributions. Candidate outliers are the points lying
# more than 1.5 times the spread of the middle 50% of the data beyond the box.

data.plot.box(subplots=True, layout=(3, 3), sharex=False, sharey=False)
pyplot.show()

# Multivariate plots
# Show the interactions between multiple variables in your dataset

# Correlation matrix plot
# Some machine learning algorithms like linear and logistic regression can have
# poor performance if there are highly correlated input variables in your data

correlations = data.corr()
# Plotting
fig = pyplot.figure()
ax = fig.add_subplot(111)
# Pin the colour scale to the full correlation range [-1, 1]
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per attribute. The count and the labels are read from the matrix
# being plotted instead of a hard-coded 9 and a list defined in another cell,
# so the axes cannot drift out of step with the data.
labels = list(correlations.columns)
ticks = numpy.arange(len(labels))
# Naming the attributes makes the plot less generic: leave the labels off for
# a first look, then add them to investigate more closely
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
pyplot.show()

# Scatter plot matrix
# Look at the pairwise relationships from different perspectives

scatter_matrix(data, figsize=[20, 20])
pyplot.show()

# Data preparation

A difficulty is that different algorithms make different assumptions about your data and may require different transforms. Sometimes algorithms can deliver better results without pre-processing.

Generally, I would recommend creating many different views and transforms of your data, then exercise a handful of algorithms on each view of your dataset. This will help you to flush out which data transforms might be better at exposing the structure of your problem in general.

There are two standard idioms for transforming data: fit the transform once and then apply it (possibly several times), or fit and transform in a single combined call.

In [None]:
array = data.values
# Split the columns: the first eight are the inputs, the ninth is the output
X = array[:, :8]
Y = array[:, 8]

# Rescale data into the range [0, 1]
# Helps optimisation algorithms at the core of ML methods (e.g. gradient
# descent), methods that weight their inputs (regression, neural networks)
# and methods built on distance measures (k-Nearest Neighbors)

scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
rescaledX = scaler.transform(X)
# Show the first five transformed rows
set_printoptions(precision=3)
print(rescaledX[:5, :])

# Standardise data
# Suits techniques that assume Gaussian-distributed inputs and work better
# with rescaled data, such as linear regression, logistic regression and
# linear discriminant analysis
# Note: this reuses the names scaler and rescaledX, replacing the min-max
# results from above.

scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)
# Show the first five transformed rows
set_printoptions(precision=3)
print(rescaledX[:5, :])

# Normalise data
# Useful for sparse datasets whose attributes vary in scale, when using
# algorithms that weight input values (neural networks) or rely on distance
# measures (k-Nearest Neighbors)

scaler = Normalizer()
normalizedX = scaler.fit_transform(X)
# Show the first five transformed rows
set_printoptions(precision=3)
print(normalizedX[:5, :])

# Binarise data

# Turns values into crisp 0/1 flags around a threshold (0.0 here), e.g. for
# probabilities. Also handy in feature engineering for adding new features
# that indicate something meaningful

binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)
# Show the first five transformed rows
set_printoptions(precision=3)
print(binaryX[:5, :])


In [None]:
# Plot each attribute and see the changes

import seaborn as sns

# Densities of the untransformed columns, one subplot each
data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False, figsize=[10, 10])
pyplot.show()

# Overlay the distributions of the transformed columns.
# NOTE(review): the range starts at column 1, so column 0 is never drawn -
# confirm whether that omission is intentional.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# migrating to histplot/kdeplot.
for column_index in range(1, 8):
    sns.distplot(rescaledX[:, column_index])

# Decision tree classification on raw vs normalised data
# Scores the same classifier with 10-fold cross-validation on the raw inputs
# and on the normalised inputs, then prints the mean accuracy of each.

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# random_state only has an effect when the rows are shuffled, and current
# scikit-learn raises ValueError if it is set while shuffling is off, so
# shuffling is enabled explicitly. The seed keeps the folds reproducible.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
# NOTE(review): the classifier itself is unseeded, so scores can still vary
# slightly between runs - consider passing random_state here as well.
model = DecisionTreeClassifier()
results = cross_val_score(model, X, Y, cv=kfold)
print("Mean estimated accuracy \n",results.mean())
# Same folds and model settings, evaluated on the normalised inputs
results2 = cross_val_score(model, normalizedX, Y, cv=kfold)
print("Mean estimated accuracy on normalised data \n",results2.mean())

# Feature selection and resampling

In [None]:
# Detach the Drive mount at the end of the session; going by the call's name
# it also flushes pending writes first.
drive.flush_and_unmount()