# Clustering data with Unsupervised Machine Learning

**Aim**: The aim of this notebook is to group unlabeled data into distinct clusters using unsupervised machine learning techniques.

## Table of contents 

1. Implementing the K-Means algorithm in scikit-learn
2. Feature Engineering for optimization
3. Cluster Visualization
4. Unsupervised to Supervised Learning

## Package Requirements

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
from sklearn import tree
from sklearn.manifold import TSNE
import warnings
warnings.simplefilter(action='ignore', category=Warning)

**Data Cleaning**

In [None]:
# Load the dataset and, in the same step, discard the saved index column
# ('Unnamed: 0') and the target column ('isFraud') so that only the
# features remain for clustering

df = pd.read_csv('fraud_prediction.csv').drop(columns=['Unnamed: 0', 'isFraud'])

## Implementing the K-Means algorithm in scikit-learn

In [None]:
# Initialize K-means with 2 clusters.
# K-means starts from randomly chosen centroids, so the seed is pinned to keep
# the cluster labels and inertia identical on every Restart & Run All.

k_means = KMeans(n_clusters=2, random_state=42)

# Fit the model on the (unscaled) feature frame. The fitted estimator is the
# last expression of the cell, so its repr is displayed.

k_means.fit(df)

In [None]:
# Ask the fitted K-means model for a cluster label for every row of df

target_labels = k_means.predict(df)

# Display the labels (bare last expression, so the notebook renders it)

target_labels

**Evaluating the inertia of the model**

In [None]:
# Display the inertia_ attribute of the K-means model currently bound to
# `k_means` (here: the 2-cluster model fitted on the unscaled data)

k_means.inertia_

In [None]:
# Candidate numbers of clusters for the elbow plot: 1 through 10

clusters = list(range(1, 11))

# Inertia of the fitted model for each candidate, in the same order as `clusters`

inertia_values = []

for cluster in clusters:

    # Build a k-means model for this cluster count. The seed is pinned so the
    # curve is identical on every run instead of varying with the random
    # centroid initialization.

    k_means = KMeans(n_clusters=cluster, random_state=42)

    # Fit the model to the unscaled data

    k_means.fit(df)

    # Store the inertia value of this model

    inertia_values.append(k_means.inertia_)

# NOTE: after the loop `k_means` is the last model built (10 clusters),
# not the 2-cluster model from the earlier cell.

# Plot inertia against the number of clusters

sns.lineplot(x=clusters, y=inertia_values)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia Value')
plt.title('Number of Clusters Vs. Inertia Values')
plt.show()

## Feature Engineering for optimization

**Scaling**

In [None]:
# Create the standard scaler

scale_data = StandardScaler()

# Fit the scaler on the data, then transform that same data with it.
# fit() hands back the scaler itself, so the two steps can be chained.

df_scaled = scale_data.fit(df).transform(df)

In [None]:
# Apply the K-Means algorithm to the scaled data

# Initialize K-means with 2 clusters; the seed is pinned so the inertia shown
# in the next cell is reproducible and comparable between runs.

k_means = KMeans(n_clusters=2, random_state=42)

# Fit the model on the scaled data (fitted estimator repr is displayed)

k_means.fit(df_scaled)

In [None]:
# Display the inertia_ attribute of the K-means model currently bound to
# `k_means` (here: the 2-cluster model fitted on the scaled data)

k_means.inertia_

**Finding the optimal number of clusters post scaling**

In [None]:
# Candidate numbers of clusters for the elbow plot: 1 through 10

clusters = list(range(1, 11))

# Inertia of the fitted model for each candidate, in the same order as `clusters`

inertia_values = []

for cluster in clusters:

    # Build a k-means model for this cluster count. The seed is pinned so the
    # curve is identical on every run instead of varying with the random
    # centroid initialization.

    k_means = KMeans(n_clusters=cluster, random_state=42)

    # Fit the model to the scaled data

    k_means.fit(df_scaled)

    # Store the inertia value of this model

    inertia_values.append(k_means.inertia_)

# NOTE: after the loop `k_means` is the last model built (10 clusters).

# Plot inertia against the number of clusters

sns.lineplot(x=clusters, y=inertia_values)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia Value')
plt.title('Number of Clusters Vs. Inertia Values')
plt.show()

**Principal component analysis**

In [None]:
# Create a PCA model that keeps 5 principal components

pca_model = PCA(n_components=5)

In [None]:
# Fit the PCA model to the scaled data (df_scaled, not the raw dataframe).
# The fitted estimator is the last expression, so its repr is displayed.

pca_model.fit(df_scaled)

In [None]:
# Project the scaled features onto the fitted principal components so that
# the resulting columns are de-correlated

pca_transform = pca_model.transform(df_scaled)

In [None]:
# Display the shape of the transformed data to confirm it has exactly
# 5 columns, one per principal component requested above

pca_transform.shape

**Fitting and evaluating a new K-Means model**

In [None]:
# Apply the K-Means algorithm to the PCA-transformed data

# Initialize K-means with 2 clusters; the seed is pinned so the inertia shown
# in the next cell is reproducible and comparable between runs.

k_means = KMeans(n_clusters=2, random_state=42)

# Fit the model on the principal components (fitted estimator repr is displayed)

k_means.fit(pca_transform)

In [None]:
# Display the inertia_ attribute of the K-means model currently bound to
# `k_means` (here: the 2-cluster model fitted on the PCA-transformed data)

k_means.inertia_

In [None]:
# Candidate numbers of principal components: 1 through 10, capped at the
# smaller dimension of the scaled data, because PCA cannot extract more
# components than min(n_samples, n_features) and would raise otherwise.

max_components = min(10, *df_scaled.shape)

components = list(range(1, max_components + 1))

# Inertia of the 2-cluster K-means model for each candidate, in the same
# order as `components`

inertia_values = []

for comp in components:

    # Initialize a PCA model keeping `comp` components

    pca_model = PCA(n_components=comp)

    # Fit the model to the scaled data

    pca_model.fit(df_scaled)

    # Project the scaled features onto the fitted components

    pca_transform = pca_model.transform(df_scaled)

    # Build a 2-cluster k-means model; the seed is pinned so that differences
    # between points on the curve come from the number of components, not
    # from the random centroid initialization.

    k_means = KMeans(n_clusters=2, random_state=42)

    # Fit the model to the projected data

    k_means.fit(pca_transform)

    # Store the inertia value of this model

    inertia_values.append(k_means.inertia_)

# NOTE: after the loop `pca_model`, `pca_transform` and `k_means` refer to
# the last iteration (the largest number of components).

# Plot inertia against the number of principal components

sns.lineplot(x=components, y=inertia_values)
plt.xlabel('Number of Principal Components')
plt.ylabel('Inertia Value')
plt.title('Number of Components Vs. Inertia Values')
plt.show()

## Cluster Visualization

**t-SNE**

In [None]:
# Reload the dataset from disk and, in the same step, discard the saved index
# column ('Unnamed: 0') and the target column ('isFraud')

df = pd.read_csv('fraud_prediction.csv').drop(columns=['Unnamed: 0', 'isFraud'])

In [None]:
# Initialize K-means with 2 clusters.
# The seed is pinned so the cluster labels used to colour the t-SNE plot
# are the same on every run.

k_means = KMeans(n_clusters=2, random_state=42)

# Fit the model on the data (fitted estimator repr is displayed)

k_means.fit(df)

In [None]:
# Ask the fitted K-means model for a cluster label for every row of df

target_labels = k_means.predict(df)

In [None]:
# Wrap the predicted labels in a pandas Series so they can be attached to
# the dataframe in the next step

target_labels = pd.Series(target_labels)

In [None]:
# Attach the cluster labels to the dataset by joining on the row index.
# The labels arrive as a new column named 0 (the default name of an
# unnamed Series turned into a frame).

df = pd.merge(
    left=df,
    right=pd.DataFrame(target_labels),
    left_index=True,
    right_index=True,
)

In [None]:
# Give the label column a meaningful name: copy column 0 into 'fraud',
# then remove the original column 0

df['fraud'] = df.loc[:, 0]
df = df.drop(columns=[0])

In [None]:
# Split the frame into a feature matrix (every column except 'fraud')
# and a target vector (the 'fraud' column), both as NumPy arrays

features = df.drop(columns=['fraud']).values

target = df.loc[:, 'fraud'].values

In [None]:
# Initialize a TSNE object with default settings.
# t-SNE is stochastic, so the seed is pinned to make the embedding (and
# therefore the plot below) reproducible across runs.

tsne_object = TSNE(random_state=42)

# Fit and transform the features using the TSNE object

transformed = tsne_object.fit_transform(features)

In [None]:
# Create the t-SNE visualization: the first two columns of the embedding
# become the plot coordinates

x_axis = transformed[:, 0]

y_axis = transformed[:, 1]

# Colour each point by its K-means cluster label and label the figure so it
# can be read on its own, like the other plots in this notebook

plt.scatter(x_axis, y_axis, c=target)
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.title('t-SNE Projection Colored by K-Means Cluster')
plt.show()

**Hierarchical Clustering**

In [None]:
# Toy dataset: a 4 x 4 array, one row per observation and one column per feature

array = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [2, 3, 4, 5],
    [5, 6, 4, 3],
])

In [None]:
# Four single-letter labels, later passed to dendrogram() as leaf labels
feature_names = list('abcd')

In [None]:
# Display the list of labels (bare expression, so the notebook renders it)
feature_names

In [None]:
# Build the hierarchical clustering of the rows of `array` using the
# 'complete' linkage method. NOTE: this rebinds `clusters`, which earlier
# cells used for the list of candidate cluster counts.

clusters = linkage(array, 'complete')

In [None]:
# Draw the dendrogram for the linkage computed above. `labels` names the
# leaves, i.e. the observations (rows of `array`), and the leaf labels are
# rotated 90 degrees. The trailing semicolon suppresses the large dict of
# plotting coordinates that dendrogram() returns, so only the figure is shown.

dendrogram(clusters, labels=feature_names, leaf_rotation=90);

In [None]:
# Show any open matplotlib figure (intended for the dendrogram drawn by the
# previous cell)
plt.show()

## Unsupervised to Supervised Learning

In [None]:
# Reload the dataset from disk and, in the same step, discard the saved index
# column ('Unnamed: 0') and the target column ('isFraud')

df = pd.read_csv('fraud_prediction.csv').drop(columns=['Unnamed: 0', 'isFraud'])

In [None]:
# Initialize K-means with 2 clusters.
# The seed is pinned so the cluster labels, which become the training target
# of the decision tree below, are the same on every run.

k_means = KMeans(n_clusters=2, random_state=42)

# Fit the model on the data (fitted estimator repr is displayed)

k_means.fit(df)

In [None]:
# Ask the fitted K-means model for a cluster label for every row of df

target_labels = k_means.predict(df)

In [None]:
# Wrap the predicted labels in a pandas Series so they can be attached to
# the dataframe in the next step

target_labels = pd.Series(target_labels)

In [None]:
# Attach the cluster labels to the dataset by joining on the row index.
# The labels arrive as a new column named 0 (the default name of an
# unnamed Series turned into a frame).

df = pd.merge(
    left=df,
    right=pd.DataFrame(target_labels),
    left_index=True,
    right_index=True,
)

**Building the decision tree**

In [None]:
# Give the label column a meaningful name: copy column 0 into 'fraud',
# then remove the original column 0

df['fraud'] = df.loc[:, 0]
df = df.drop(columns=[0])

In [None]:
# Create an unfitted decision tree classifier that splits on Gini impurity,
# with the random state fixed at 42

dt_classifier = DecisionTreeClassifier(
    criterion='gini',
    random_state=42,
)

In [None]:
# Split the frame into a feature matrix (every column except 'fraud')
# and a target vector (the 'fraud' column), both as NumPy arrays

features = df.drop(columns=['fraud']).values

target = df.loc[:, 'fraud'].values

In [None]:
# Fit the classifier on the full feature matrix, using the K-means cluster
# labels stored in `target` as the classes to learn (no train/test split is
# made here). The fitted estimator is the last expression, so its repr is shown.

dt_classifier.fit(features, target)

In [None]:
# Rebind `features` to a DataFrame (previously a NumPy array) holding every
# column except 'fraud', so that the column names are available for the
# tree export below

features = df.drop(columns='fraud')

In [None]:
# Export the fitted tree as Graphviz DOT source; with out_file=None the DOT
# text is returned as a string instead of being written to a file.
# The column Index is converted to a plain list because some scikit-learn
# releases validate `feature_names` strictly and reject a pandas Index.
dot_data = tree.export_graphviz(
    dt_classifier,
    out_file=None,
    feature_names=list(features.columns),
)

In [None]:
# Parse the DOT source held in `dot_data` into a pydotplus graph object

graph = pydotplus.graph_from_dot_data(dot_data)

In [None]:
# Render the graph to PNG bytes and display them inline with IPython's Image.
# NOTE(review): create_png() presumably needs the Graphviz executables to be
# installed on the system — confirm for the target environment.

Image(graph.create_png())