<a href="https://colab.research.google.com/github/d-tomas/text-mining/blob/main/notebooks/lecture_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Day 4**: Unsupervised Learning

## Initial setup

In [None]:
# Import the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

# Download the dataset
!wget https://raw.githubusercontent.com/d-tomas/text-mining/main/datasets/iris.csv

In [None]:
# Load the data from the CSV file into a pandas DataFrame

data = pd.read_csv('iris.csv')
data  # Last expression of the cell: rendered as the cell output

In [None]:
# Show information about the DataFrame

data.info()

In [None]:
# Frequency distribution of species (number of rows per distinct value of the column)

data['species'].value_counts()

In [None]:
# Distribution of each class depending on the feature (one density plot per feature)

for feature in ('sepal_length', 'sepal_width', 'petal_length', 'petal_width'):
    sns.kdeplot(data=data, x=feature, hue='species')
    plt.show()  # Show the current plot before drawing the next feature

## Example 1: k-means

In [None]:
# Finding the optimum number of clusters for k-means clustering

wcss = []  # One inertia value per number of clusters tried
X = data.drop('species', axis=1).values  # Keep the values for all the features but the class

for i in range(1, 11):
    # n_init is pinned explicitly: its default differs between scikit-learn releases,
    # so relying on it gives version-dependent results (and a FutureWarning on some versions)
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # inertia_ -> Sum of squared distances of samples to their closest cluster center

In [None]:
# Using the elbow method to determine the optimal number of clusters for k-means clustering
# The optimum number of clusters is where the elbow occurs
# This is when the within cluster sum of squares (WCSS) does not decrease significantly with every iteration

# The x values are derived from the length of wcss (first entry = 1 cluster) instead of
# repeating the loop bounds, so the plot stays aligned if the range of tested values changes
sns.lineplot(x=range(1, len(wcss) + 1), y=wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') # Within cluster sum of squares
plt.show()

In [None]:
# Implementing the k-means clustering

# n_init is pinned explicitly: its default differs between scikit-learn releases,
# so relying on it gives version-dependent results (and a FutureWarning on some versions)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
y = kmeans.fit_predict(X)  # Assign a cluster to each sample

In [None]:
# Cluster labels assigned to each sample (the array returned by fit_predict)

y

In [None]:
# Visualisation of the clusters, using the features stored in columns 2 and 3 of X

x_col, y_col = 2, 3
feature_names = data.drop('species', axis=1).columns  # Same column order as X

# One scatter series per cluster, labelled so that every cluster appears in the legend
for cluster in range(kmeans.n_clusters):
    members = y == cluster  # Boolean mask selecting the samples assigned to this cluster
    sns.scatterplot(x=X[members, x_col], y=X[members, y_col], label='Cluster {}'.format(cluster))

# Plotting the centroids of the clusters
sns.scatterplot(x=kmeans.cluster_centers_[:, x_col], y=kmeans.cluster_centers_[:, y_col], s=100, label='Centroids')

# Name the axes after the plotted features so the figure can be read on its own
plt.xlabel(feature_names[x_col])
plt.ylabel(feature_names[y_col])
plt.legend(bbox_to_anchor=(1.01, 1.01), loc=2)

plt.show()

# References

* [Iris dataset](https://archive.ics.uci.edu/ml/datasets/iris)