In [1]:
# Exercise 4

In [2]:
import math
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd

import classifier as cl
import dtransform as dt
import stats
import subgraphs as sg

In [3]:
# Load the red wine quality dataset; the path is relative to the notebook's working directory.
red_data = pd.read_csv("datasets/winequality-red.csv")
# Preview the first rows as a quick sanity check of the columns and values.
red_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Feature labels: every column name except the 'quality' target.
# Filtering (instead of list.remove) keeps this cell re-runnable once 'quality'
# has been split off red_data, where remove() would raise ValueError.
feature_labels = [str(label) for label in red_data.columns if str(label) != 'quality']
print(feature_labels)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [5]:
# Separate the target column ('quality') from the feature columns, selecting it
# by name rather than by the positional index 11.
# The guard keeps the cell re-runnable: once 'quality' has been removed from
# red_data there is nothing left to split and the earlier raw_classes is kept.
if 'quality' in red_data.columns:
    raw_classes = red_data['quality']
    red_data = red_data.drop(columns='quality')

In [6]:
def eucl_dist(x, y):
    """Return the Euclidean distance between two equal-length numeric sequences.

    Args:
        x: Sequence of numbers (list, tuple, NumPy array, ...).
        y: Sequence of numbers with the same length as ``x``.

    Returns:
        float: Square root of the summed squared element-wise differences;
        ``0.0`` when both sequences are empty.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
    """
    # Reject mismatched inputs explicitly instead of silently ignoring the
    # tail of the longer sequence.
    if len(x) != len(y):
        raise ValueError(
            f"eucl_dist expects equal-length inputs, got {len(x)} and {len(y)}"
        )

    # Generator + built-in sum: no local variable shadows ``sum`` any more.
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))

In [7]:
def normalise(data):
    """Min-max normalise ``data`` in place so that its values span [0, 1].

    Args:
        data: Mutable, integer-indexable sequence of numbers (e.g. a list or a
            NumPy array). Its elements are overwritten with the scaled values.

    Returns:
        None. The result is written back into ``data``.

    Notes:
        * Empty input is left untouched.
        * If every value is identical there is no range to scale by, so all
          entries are set to ``0.0`` instead of dividing by zero.
    """
    if len(data) == 0:
        return

    # Find minimum and maximum values of data
    mini = min(data)
    maxi = max(data)
    span = maxi - mini

    # Normalise data
    if span == 0:
        values = [0.0] * len(data)
    else:
        values = [(val - mini) / span for val in data]

    # Write over input data only after every scaled value has been computed
    for i, val in enumerate(values):
        data[i] = val

In [8]:
# Min-max normalise every attribute column.
# Each column is copied out, normalised in place and explicitly written back:
# mutating the Series yielded by DataFrame.items() does not reliably update the
# frame itself (under pandas Copy-on-Write it never does).
normalised_columns = {}
for name, attr in red_data.items():
    column_values = attr.to_numpy(dtype=float, copy=True)
    normalise(column_values)
    normalised_columns[name] = column_values

# Rebuild the frame with the original row index and column order.
red_data = pd.DataFrame(normalised_columns, index=red_data.index)

red_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846
1,0.283186,0.520548,0.0,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385
2,0.283186,0.438356,0.04,0.09589,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385
4,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846


In [9]:
# Draw a random subset of 40 instances. The fixed seed makes the sample, and
# every distance matrix / graph derived from it, reproducible across kernel
# restarts.
red_data_sample = red_data.sample(40, axis=0, random_state=42)

# Instance labels: the sampled rows' original index values, as strings
inst_labels = [str(label) for label in red_data_sample.index]

In [10]:
# Instances are rows, attributes are columns
nattr = red_data.shape[1]
ninst = red_data_sample.shape[0]

# Pull the sampled rows out once instead of re-slicing the frame in the loop.
sample_rows = red_data_sample.values
D = np.zeros((ninst, ninst))

for row in range(ninst):
    for col in range(row, ninst):
        # The distance matrix is symmetric: compute each pair once and mirror it.
        D[row, col] = D[col, row] = eucl_dist(sample_rows[row], sample_rows[col])

# Make sure the output directory is there before writing the matrix.
if not os.path.exists("Output"):
    print("Path not found, creating directory")
    os.mkdir("Output")
else:
    print("Path exists")

np.savetxt("Output/RedWineInstanceDistMatrix.csv", D, fmt="%.2f", delimiter=",")

Path exists


In [11]:
# Pairwise Euclidean distances between attributes, each attribute being the
# full column of values over all instances.
attr_columns = red_data.values.T
D_attr = np.zeros((nattr, nattr))

for first in range(nattr):
    for second in range(first, nattr):
        # Symmetric matrix: fill both triangles from a single computation.
        D_attr[first, second] = D_attr[second, first] = eucl_dist(
            attr_columns[first], attr_columns[second]
        )

# Make sure the output directory is there before writing the matrix.
if not os.path.exists("Output"):
    print("Path not found, creating directory")
    os.mkdir("Output")
else:
    print("Path exists")

np.savetxt("Output/RedWineAttributeDistMatrix.csv", D_attr, fmt="%.2f", delimiter=",")

Path exists


In [12]:
# Generate sets of vertices: one integer id per row of each distance matrix
V = set(range(len(D)))
V_attr = set(range(len(D_attr)))

In [13]:
# Build two graphs over the sampled instances from the distance matrix D; each
# call is given a .gml output path and the instance labels.
# NOTE(review): presumably rngraph = relative neighbourhood graph and
# gg_graph = Gabriel graph — confirm in subgraphs.py.
# The return values are not needed here, so they are discarded into `_`.
_ = sg.rngraph(V, D, "Output/wine-inst-rng.gml", inst_labels)
_ = sg.gg_graph(V, D, "Output/wine-inst-gg.gml", inst_labels)

In [14]:
# Build the same two graphs over the attributes, using the attribute distance
# matrix and the feature names as labels.
# The second output path previously read "win-attr-gg.gml"; it now follows the
# "wine-<kind>-<graph>.gml" naming used by the other three graph files.
# The return values are not needed here, so they are discarded into `_`.
_ = sg.rngraph(V_attr, D_attr, "Output/wine-attr-rng.gml", feature_labels)
_ = sg.gg_graph(V_attr, D_attr, "Output/wine-attr-gg.gml", feature_labels)

In [15]:
# Training and testing a classifier

In [16]:
# Divide classes into two categories:
#   1 -> high scoring wines (quality >= 6)
#   0 -> low scoring wines
classes = [1 if score >= 6 else 0 for score in raw_classes]

In [17]:
# Reload the dtransform module so the already-imported `dt` is re-executed
# without restarting the kernel.
# NOTE(review): this looks like a development aid; the import belongs in the
# top import cell, and `%autoreload 2` would make the manual reload unnecessary.
import importlib
importlib.reload(dt)

<module 'dtransform' from '/home/conor/Documents/COMP3340/Assignments/P2/dtransform.py'>

In [18]:
# Discretise dataset
# Work on a copy so the normalised red_data stays available unchanged.
red_data_disc = red_data.copy()
# NOTE(review): discretise_dataset appears to modify red_data_disc in place
# (its return value is stored separately as red_splits) — confirm in dtransform.py.
red_splits = dt.discretise_dataset(red_data_disc, classes, attr_axis=1)
# Persist the discretised table; the "Output" directory must already exist.
red_data_disc.to_csv("Output/RedWineDiscrete.csv")

Total features =  11
Discretising feature  11

In [19]:
# Reduce the discretised dataset to k features using per-feature chi-squared
# scores against the binary class labels.
# NOTE(review): presumably k_chi_reduce keeps the k highest-scoring features —
# confirm in dtransform.py.
k = 4
chi_values = dt.chi_sq(red_data_disc, classes, attr_axis=1)
# No empty placeholder frame is created first: the result below is the only
# value red_data_reduced ever needs to hold.
red_data_reduced = dt.k_chi_reduce(red_data_disc, k, chi_values, feature_labels, attr_axis=1)
red_data_reduced.head()

Unnamed: 0,alcohol,sulphates,volatile acidity,free sulfur dioxide
0,0.0,1.0,2.0,0.0
1,0.0,2.0,3.0,0.0
2,0.0,2.0,2.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,2.0,0.0


In [20]:
# Split into training and test datasets
# Dataset appears to be randomly ordered, so splitting it 70/30 should suffice
split_point = (red_data_reduced.shape[0] * 7) // 10

# First 70% of the rows train the classifier, the remaining 30% are held out.
train_data, test_data = (
    red_data_reduced.iloc[:split_point, :],
    red_data_reduced.iloc[split_point:, :],
)
train_classes, test_classes = classes[:split_point], classes[split_point:]

In [21]:
# Fit the naive Bayes classifier on the training split only; the fitted result
# (freq_tables) is reused for every later classification call.
freq_tables = cl.naive_bayes(train_data, train_classes, attr_axis=1)

# Test on train dataset
classifications = cl.nb_classify(freq_tables, train_data, train_classes, inst_axis=0)

# Evaluate classifier

# Produce confusion matrix
# TP FP
# FN TN
cm = cl.cmatrix(classifications, train_classes)

# Report the confusion matrix followed by summary metrics derived from it.
print(cm)
print(f'Sensitivity: {stats.sensitivity(cm):.2f}')
print(f'Specificity: {stats.specificity(cm): .2f}')
print(f'Accuracy: {stats.accuracy(cm): .2f}')
print(f'f1-score: {stats.f1(cm): .2f}')
print(f"Matthew's Correlation Coefficient: {stats.mcc(cm): .2f}")
print(f"Youden's J: {stats.youdenJ(cm): .2f}")

[[430. 198.]
 [100. 391.]]
Sensitivity: 0.81
Specificity:  0.66
Accuracy:  0.73
f1-score:  0.74
Matthew's Correlation Coefficient:  0.48
Youden's J:  0.48


In [22]:
# Test on the held-out test dataset, reusing the classifier fitted on the training split
classifications = cl.nb_classify(freq_tables, test_data, test_classes, inst_axis=0)

# Evaluate classifier

# Produce confusion matrix
# TP FP
# FN TN
cm = cl.cmatrix(classifications, test_classes)

# Report the confusion matrix followed by summary metrics derived from it.
print(cm)
print(f'Sensitivity: {stats.sensitivity(cm):.2f}')
print(f'Specificity: {stats.specificity(cm): .2f}')
print(f'Accuracy: {stats.accuracy(cm): .2f}')
print(f'f1-score: {stats.f1(cm): .2f}')
print(f"Matthew's Correlation Coefficient: {stats.mcc(cm): .2f}")
print(f"Youden's J: {stats.youdenJ(cm): .2f}")

[[161.  68.]
 [ 53. 198.]]
Sensitivity: 0.75
Specificity:  0.74
Accuracy:  0.75
f1-score:  0.73
Matthew's Correlation Coefficient:  0.49
Youden's J:  0.50
