In [1]:
"""
Importing all the relevant modules
"""

# Pandas to load CSV file and perform some basic data pre-processing operations
# e.g dropping columns, checking for null values
import pandas as pd

# Numpy for vector and matrix operations
import numpy as np

# DecisionTreeRegressor to train and predict the missing values from the data
# Which we will be intentionally creating in our IRIS dataset
from sklearn.tree import DecisionTreeRegressor

# KMeans will help us cluster/group the similar/alike data 
from sklearn.cluster import KMeans

# For Normalization
from sklearn.preprocessing import MinMaxScaler

# mean_squared_error utility will help us quantify our DecisionTree training results 
# and how close to accurate they are
from sklearn.metrics import mean_squared_error

# Input encoder, to transform categorical values into numerical data
from sklearn.preprocessing import LabelEncoder

# To generate a frequency distribution of data
# e.g total count of elements in different clusters
import collections

### Loading data into pandas dataframe 

In [2]:
# Load the raw IRIS file into a dataframe. header=None means the file has no
# header row, so the columns are labelled with integers (0..4 in the output below).
df_iris = pd.read_csv("./Data/iris.data", header=None)
print(f"Shape {df_iris.shape}")
# Preview the first rows to confirm the layout: four numeric columns and a species name
df_iris.head(11)

Shape (150, 5)


Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [3]:
# Display basic summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the data
df_iris.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Checking whether the dataset contains any null values, because they would affect both our DecisionTreeRegressor and our KMeans algorithm


In [4]:
# Report whether any cell in the frame is NaN/None
print(
    "No missing values"
    if not df_iris.isna().to_numpy().any()
    else "Missing values in data detected"
)

No missing values


### Transforming the categorical labels into numerical data; this will be helpful for accuracy checks

In [5]:
# Keep only the species column as its own single-column frame.
# The column is still addressed as 4 because pandas preserves the original column label.
labels = df_iris[[4]]

# Learn the species -> integer mapping and apply it in one step
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels[4].values)
print(encoded_labels)


# Tally how many samples carry each encoded label
counts = collections.Counter(encoded_labels)
print("\nCluster Member Counts\n")
for label_id in (0, 1, 2):
    print(f"Cluster-{label_id} member count = {counts[label_id]}")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

Cluster Member Counts

Cluster-0 member count = 50
Cluster-1 member count = 50
Cluster-2 member count = 50


### Drop the label column from the primary data, since it is not directly useful in KMeans
### (the labels can still be used for an accuracy check later on)

In [6]:

# Drop the categorical species column (label 4); KMeans only uses the numeric features.
# The name is rebound instead of using inplace=True, and errors="ignore" is passed,
# so re-running this cell is harmless (an in-place drop raises KeyError the second time).
df_iris = df_iris.drop(columns=[4], errors="ignore")

print("Printing IRIS data after `Label` column drop")
df_iris.head(11)

Printing IRIS data after `Label` column drop


Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


#  Assignment Section 1, DecisionTreeRegressor

###  Blanking out column 0 in every 5th row of the dataset (skipping row 0), as required by the assignment

In [7]:
# Select every row whose index label is a non-zero multiple of 5 (5, 10, 15, ...);
# row 0 is deliberately left untouched.
smudge_mask = (df_iris.index % 5 == 0) & (df_iris.index != 0)

# Keep the original, non-smudged column-0 values of those rows.
# They are used later to compute the regressor's mean_squared_error.
series_original_Y_testing_data = df_iris.loc[smudge_mask, 0].tolist()

# Blank the selected cells in one vectorized assignment.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_iris.loc[smudge_mask, 0] = np.nan

print(f" (Missing Values) Rows with NaN  in first Column  = {df_iris[0].isna().sum()}")
print(f" (Regular Values) Rows with data in first Column = {len(df_iris)- df_iris[0].isna().sum()}")
print("\n Printing dataframe with missing data in column # 0")
df_iris.head(21)

 (Missing Values) Rows with NaN  in first Column  = 29
 (Regular Values) Rows with data in first Column = 121

 Printing dataframe with missing data in column # 0


Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


### Splitting data into training and testing for regressor algorithm

In [8]:

# Rows with a known column-0 value train the regressor; rows where it is NaN
# are the ones to predict.
missing_first_column = df_iris[0].isna()

# Explicit copies make both splits independent frames, so later writes to them
# neither touch df_iris nor trigger SettingWithCopyWarning.
df_training = df_iris[~missing_first_column].copy()
df_testing  = df_iris[missing_first_column].copy()

print(f" Original IRIS dataset value count = {len(df_iris)}")
print(f" Value count for training data = {df_training.shape}")
print(f" Value count for testing data = {df_testing.shape}")

 Original IRIS dataset value count = 150
 Value count for training data = (121, 4)
 Value count for testing data = (29, 4)


In [9]:
# Preview the first rows of the training split
print("Printing training dataset")
df_training.head()

Printing training dataset


Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [10]:
# Preview the first rows of the testing split (column 0 is NaN in these rows)
print("Printing testing dataset")
df_testing.head()

Printing testing dataset


Unnamed: 0,0,1,2,3
5,,3.9,1.7,0.4
10,,3.7,1.5,0.2
15,,4.4,1.5,0.4
20,,3.4,1.7,0.2
25,,3.0,1.6,0.2


### Now further segregating data into Predictors and Ground truth for both training and testing data

In [11]:
# Columns 1-3 are the regressor inputs; column 0 is the value it learns to predict
feature_columns = [1, 2, 3]
target_column = 0

# Training split: inputs and known targets used to fit the DecisionTreeRegressor
X_training_predictors   = df_training[feature_columns]
Y_training_ground_truth = df_training[target_column]

# Testing split: inputs for the rows whose column-0 value must be predicted
x_testing_predictors    = df_testing[feature_columns]
y_testing_ground_truth  = df_testing[target_column]


## Training DecisionTreeRegressor

In [12]:
# Train a DecisionTreeRegressor on the training data.
# random_state is fixed (same seed as the KMeans cells) so that the fitted tree,
# and therefore the error reported later, is reproducible across runs.
regressor = DecisionTreeRegressor(random_state=11)
regressor = regressor.fit(X_training_predictors, Y_training_ground_truth)

###  Predicting missing values of testing data using the trained regressor


In [13]:
# Predict every missing column-0 value with a single vectorized call.
# The result is stored in a fresh Series that keeps the original row labels, so the
# predictions can be written back by index later. This avoids assigning 1-element
# arrays into scalar slots row by row and avoids writing into a slice of df_testing.
y_testing_ground_truth = pd.Series(
    regressor.predict(x_testing_predictors),
    index=x_testing_predictors.index,
    name=0,
)
print("Predicted Missing values")
y_testing_ground_truth.head()

Predicted Missing values


5     5.1
10    5.3
15    5.1
20    4.8
25    4.7
Name: 0, dtype: float64

### Writing the predicted values back into the missing cells of the original data

In [14]:
# Write the predictions back into the main IRIS dataframe.
#
# y_testing_ground_truth still carries the row labels of the smudged rows, so a
# label-aligned assignment drops each prediction into the matching row of column 0.
df_iris.loc[y_testing_ground_truth.index, 0] = y_testing_ground_truth

# Show the original IRIS dataset with the missing values filled in
df_iris.head(21)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.1,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


### Calculating the error (mean squared error) of our DecisionTreeRegressor predictions

In [15]:
# Mean squared error between the original (pre-smudge) values and the regressor's predictions
err = mean_squared_error(
    y_true=series_original_Y_testing_data,
    y_pred=y_testing_ground_truth,
)
err

0.2690517241379311

# Assignment Section 2, Clustering 

### Normalizing data, and also making a copy of non-normalized dataframe
### So we can train our clustering algorithm on both datasets

In [16]:
# Snapshot the un-scaled frame first; this deep copy is what KMeans is later run on
# for the non-normalized comparison.
df_iris_non_normalized = df_iris.copy(deep=True)

print("Standard Final IRIS dataframe without any normalization")
print(df_iris_non_normalized.head(6))

# Min-max scale the working frame. fit_transform returns a numpy array,
# so it is wrapped back into a DataFrame.
scalar = MinMaxScaler()
df_iris = pd.DataFrame(scalar.fit_transform(df_iris))

print("\nNormalized IRIS dataframe")
df_iris.head(6)

Standard Final IRIS dataframe without any normalization
     0    1    2    3
0  5.1  3.5  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  1.3  0.2
3  4.6  3.1  1.5  0.2
4  5.0  3.6  1.4  0.2
5  5.1  3.9  1.7  0.4

Normalized IRIS dataframe


Unnamed: 0,0,1,2,3
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
5,0.222222,0.791667,0.118644,0.125


In [17]:
# K is 3 because the dataset contains exactly 3 IRIS species, which makes the
# choice straightforward for this toy dataset.
K = 3
iterations = 10000

# n_init is pinned explicitly: its default changed between scikit-learn releases
# (10 in older versions, 'auto' in newer ones), which would otherwise change the
# resulting clusters and emit a FutureWarning on intermediate versions.
k_cluster = KMeans(n_clusters=K, max_iter=iterations, n_init=10, random_state=11)
k_cluster = k_cluster.fit(df_iris)

In [18]:
# Cluster index assigned to each row by the fitted KMeans model
k_cluster.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0], dtype=int32)

In [19]:
# Frequency of each cluster index among the fitted labels
counts = collections.Counter(k_cluster.labels_)

print("Cluster Member Counts\n")
for cluster_id in (0, 1, 2):
    print(f"Cluster-{cluster_id} member count = {counts[cluster_id]}")

Cluster Member Counts

Cluster-0 member count = 61
Cluster-1 member count = 50
Cluster-2 member count = 39


### Now clustering without normalization

In [20]:
# Cluster the non-normalized copy of the data with the same K, iteration cap and seed
# as the normalized run, so the two results are directly comparable.
# n_init is pinned explicitly because its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones).
k_cluster_not_normalized = KMeans(n_clusters=K, max_iter=iterations, n_init=10, random_state=11)
k_cluster_not_normalized = k_cluster_not_normalized.fit(df_iris_non_normalized)


print("\nClusters generated by KMeans\n")
print(k_cluster_not_normalized.labels_)


# Frequency of each cluster index among the fitted labels
counts = collections.Counter(k_cluster_not_normalized.labels_)

print("Cluster Member Counts\n")
print(f"Cluster-0 member count = {counts[0]}")
print(f"Cluster-1 member count = {counts[1]}")
print(f"Cluster-2 member count = {counts[2]}")



Clusters generated by KMeans

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]
Cluster Member Counts

Cluster-0 member count = 62
Cluster-1 member count = 50
Cluster-2 member count = 38


In [21]:
# Summary of the two clustering runs (with and without normalization).
# The counts below are written out by hand and match the outputs printed by the
# clustering cells above; they must be updated manually if those results change.
# NOTE(review): this is a bare string expression, so the notebook echoes its repr
# as the cell output -- a markdown cell would probably be a better home for it.
"""
 With Normalization enabled, before sending dataset for clustering we are getting following clusters
 
 Cluster Member Counts

 Cluster-0 member count = 61
 Cluster-1 member count = 50
 Cluster-2 member count = 39
 
 
 And without normalization, the clusters look something like following
 
 Cluster-0 member count = 62
 Cluster-1 member count = 50
 Cluster-2 member count = 38
 
"""

'\n With Normalization enabled, before sending dataset for clustering we are getting following clusters\n \n Cluster Member Counts\n\n Cluster-0 member count = 61\n Cluster-1 member count = 50\n Cluster-2 member count = 39\n \n \n And without normalization, the clusters look something like following\n \n Cluster-0 member count = 62\n Cluster-1 member count = 50\n Cluster-2 member count = 38\n \n'