In [None]:
# Confusion Matrix
'''
Code developer: Dr. Francisco Tognoli
Associate Professor of the Graduate Program in Geology at UNISINOS, Brazil.
Created in: Dec-2020; Last update: 2021-04-03.
# For any questions or contributions, please contact: ftognoli.geo@gmail.com or ftognoli@unisinos.br

'''
# Below:
# y_real, y_pred are the attributes
# a, b, c, d, e...etc are numbers

'''
To understand how a confusion matrix compares real and predicted labels, run the following six steps on randomly generated samples.
We can generate random datasets using sklearn's automatic sample generator (make_blobs).
'''
# Step 1: Import libraries and functions
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from IPython.display import Image
from urllib.request import urlopen
from PIL import Image

# Step 2: Create a fictitious set of values named as y_real (ex: measurements)
# Example values for the template placeholders -- edit them freely.
n_samples = 100  # placeholder "a": number of points in each dataset
n_features = 2   # placeholder "c": number of feature columns in X
seed_real = 0    # placeholder "d": random state used for the "real" labels
seed_pred = 1    # placeholder "e": random state used for the "predicted" labels

# make_blobs is imported in Step 1; centers=3 yields the labels 0, 1 and 2.
X, y_real = make_blobs(n_samples=n_samples, centers=3, n_features=n_features, random_state=seed_real)
y_real

# Step 3: Create a fictitious set of values named as y_pred (ex: predicted values)
# use a different random state to generate different datasets
# The features go to their own name so that X from Step 2 is not overwritten.
X_pred, y_pred = make_blobs(n_samples=n_samples, centers=3, n_features=n_features, random_state=seed_pred)
y_pred

# Step 4: Check differences between measured and predicted values using a confusion matrix
# Rows are the real labels, columns the predicted ones.
confusion_table = pd.crosstab(
    y_real,
    y_pred,
    rownames=['Real'],
    colnames=['Predicted'],
    margins=True,  # adds the "All" row and column holding the totals
)
print(confusion_table)

# Step 5: If you want a normalized confusion matrix
# Raw counts, built without margins so the "All" totals are not normalized as
# if they were a class.
cf_matrix = pd.crosstab(y_real, y_pred, rownames=['Real'], colnames=['Predicted'])
# Divide every row by its own total, so each row sums to 1.
# .div(..., axis=0) matches the row sums to the rows; a plain `/` would align
# them with the column labels and divide column j by the total of row j.
cf_matrix_norm = cf_matrix.div(cf_matrix.sum(axis=1), axis=0)
cf_matrix_norm

# Step 6: Get a classification report with metrics
# One display name per class: centers = 3 in make_blobs, hence 3 classes.
target_names = ['class A', 'class B', 'class C']
report_text = classification_report(y_real, y_pred, target_names=target_names)
print(report_text)

##################################################################
'''
To use KMeans and a confusion matrix together, run the following instructions using your dataset.
We can generate random datasets using pandas + numpy.
'''
# Below:
# A, B... are the attributes
# a, b, c...etc are numbers
# k = number of clusters

# Generate a random dataset
# A seeded generator keeps the demo data identical on every "Restart & Run All".
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
# 100 rows x 2 columns of uniform floats in [0, 1).
df = pd.DataFrame(rng.random((100, 2)), columns=['y_real', 'y_pred'])
df.head()

# But if you have a spreadsheet like this, test your dataset:
# urlopen and PIL's Image are imported in Step 1.
SPREADSHEET_EXAMPLE_URL = 'https://d33v4339jhl8k0.cloudfront.net/docs/assets/55e71aa290336027d7707900/images/5d15ffcf2c7d3a6ebd22b1a5/file-wHNGcOstYW.png'

# The context manager closes the HTTP response deterministically. Image.open is
# lazy, so load() reads the pixel data while the stream is still open.
with urlopen(SPREADSHEET_EXAMPLE_URL) as response:
    image = Image.open(response)
    image.load()
image

# Export your spreadsheet as .csv or .txt
# Replace the placeholder below with the path of the exported file.
csv_path = 'insert file.csv'
df = pd.read_csv(csv_path)
df.head()

# Normalize your data
# Despite its name, MMscale holds a StandardScaler; assign MinMaxScaler()
# instead if you prefer that scaling.
MMscale = StandardScaler()
# Learn the scaling parameters from df and apply them in a single call.
norm_data = MMscale.fit_transform(df)

# See the best k using the Elbow method
K = range(1, 10)  # candidate numbers of clusters: 1 to 9
Sum_of_squared_distances = []
for k in K:
    # A fixed random_state and an explicit n_init make the curve reproducible
    # and independent of the scikit-learn version (the n_init default changed).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(norm_data)
    # inertia_ is the sum of squared distances of the samples to their closest centre.
    Sum_of_squared_distances.append(kmeans.inertia_)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(K, Sum_of_squared_distances, 'o-')
ax.set_xlabel('Number of Classes')
ax.set_ylabel('Sum of squared distances')
ax.set_title('Elbow method for the best k')
plt.show()

# Apply KMeans
# Set k explicitly (read it off the elbow plot). Without this assignment k would
# silently keep the last value of the elbow loop. 3 is an example value that
# matches the three example class names used for the classification report.
k = 3
# Fixed random_state and explicit n_init keep the cluster labels reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(norm_data)

# Create and visualize the new column (clusters)
cluster_labels = kmeans.labels_  # labels assigned by the fitted KMeans model
df['clusters'] = cluster_labels
df.head()

# Visualize the cluster as a scatter plot
# No backend magic here: the backend is selected once in Step 1, and switching
# it in the middle of a session is unreliable.
x_col = 'insert A'  # placeholder: name of the column plotted on the x axis
y_col = 'insert B'  # placeholder: name of the column plotted on the y axis
marker_size = 30    # placeholder "a": marker size passed to scatter(s=...)

fig, ax = plt.subplots(figsize=(14, 8))
points = ax.scatter(df[x_col], df[y_col], s=marker_size, c=kmeans.labels_)
ax.set_title('Cluster Visualization')
# The axis labels reuse the column names so they cannot drift from the data.
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
fig.colorbar(points, ax=ax, label='clusters')
plt.show()

# Check differences between measured and predicted values using a confusion matrix
# 'insert A' / 'insert B' are placeholders: replace them with the names of the
# df columns holding the real and the predicted labels.
print(pd.crosstab(df['insert A'], df['insert B'], rownames=['Real'], colnames=['Predicted'], margins=True))

# Get a classification report with metrics
target_names = ['class A', 'class B', 'class C'] # define target names in accordance with number of clusters
# 'insert A' / 'insert B' are placeholders: replace them with the names of the
# df columns holding the real and the predicted labels.
print(metrics.classification_report(df['insert A'], df['insert B'], target_names=target_names))