# Exercise: K-Means Clustering

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the exercise dataset from disk, then list its columns, dtypes and non-null counts.
animal_csv_path = 'Data/2021-11-ml-09-k-means-animals-dataset.csv'
animalData = pd.read_csv(animal_csv_path)
animalData.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Region               344 non-null    object 
 1   Stage                344 non-null    object 
 2   Individual ID        344 non-null    object 
 3   Culmen Length (mm)   342 non-null    float64
 4   Culmen Depth (mm)    342 non-null    float64
 5   Flipper Length (mm)  342 non-null    float64
 6   Body Mass (g)        342 non-null    float64
 7   Delta 15 N (o/oo)    330 non-null    float64
 8   Delta 13 C (o/oo)    331 non-null    float64
 9   Comments             26 non-null     object 
dtypes: float64(6), object(4)
memory usage: 27.0+ KB


In [8]:
# It looks like there may be a few missing values. Let's check whether any rows are almost entirely empty.

In [9]:
# Summary statistics of the numeric columns, as a quick scan for odd-looking values
summary_stats = animalData.describe()
summary_stats

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo)
count,342.0,342.0,342.0,342.0,330.0,331.0
mean,43.92193,17.15117,200.915205,4201.754386,8.733382,-25.686292
std,5.459584,1.974793,14.061714,801.954536,0.55177,0.793961
min,32.1,13.1,172.0,2700.0,7.6322,-27.01854
25%,39.225,15.6,190.0,3550.0,8.29989,-26.320305
50%,44.45,17.3,197.0,4050.0,8.652405,-25.83352
75%,48.5,18.7,213.0,4750.0,9.172123,-25.06205
max,59.6,21.5,231.0,6300.0,10.02544,-23.78767


There are missing values in the `Delta 15 N (o/oo)` and `Delta 13 C (o/oo)` columns. Let's impute values for them. We can do this on the full dataset here because we aren't splitting it into training and testing sets.

In [6]:
from sklearn.impute import SimpleImputer
import numpy as np

cols = ('Delta 15 N (o/oo)', 'Delta 13 C (o/oo)')
# A list selects several columns at once; a tuple would be treated as a single column key.
delta_cols = list(cols)

# Fill the missing Delta readings with the mean of their own column.
# SimpleImputer computes its statistic column by column, so a single fit_transform over
# both columns gives the same values as fitting each column separately.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
animalData[delta_cols] = imp_mean.fit_transform(animalData[delta_cols].to_numpy())

print(animalData.describe())


       Culmen Length (mm)  Culmen Depth (mm)  Flipper Length (mm)  \
count          342.000000         342.000000           342.000000   
mean            43.921930          17.151170           200.915205   
std              5.459584           1.974793            14.061714   
min             32.100000          13.100000           172.000000   
25%             39.225000          15.600000           190.000000   
50%             44.450000          17.300000           197.000000   
75%             48.500000          18.700000           213.000000   
max             59.600000          21.500000           231.000000   

       Body Mass (g)  Delta 15 N (o/oo)  Delta 13 C (o/oo)  
count     342.000000         344.000000         344.000000  
mean     4201.754386           8.733382         -25.686292  
std       801.954536           0.540392           0.778770  
min      2700.000000           7.632200         -27.018540  
25%      3550.000000           8.307415         -26.285460  
50%      405

## Setup and Tuning the Model
1. Remove all of the categorical columns and any others that won't help with clustering.
2. Standardize the remaining columns

In [2]:
# Drop the non-numeric (object-dtype) columns so only the measurement columns remain.
# The result is reassigned instead of using inplace=True, and errors='ignore' is passed,
# so the cell is safe to re-run: a second run finds the columns already gone and is a
# no-op rather than raising KeyError.
animalData = animalData.drop(
    columns=['Region', 'Stage', 'Individual ID', 'Comments'],
    errors='ignore',
)
animalData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Culmen Length (mm)   342 non-null    float64
 1   Culmen Depth (mm)    342 non-null    float64
 2   Flipper Length (mm)  342 non-null    float64
 3   Body Mass (g)        342 non-null    float64
 4   Delta 15 N (o/oo)    330 non-null    float64
 5   Delta 13 C (o/oo)    331 non-null    float64
dtypes: float64(6)
memory usage: 16.2 KB


Check the pairwise correlations between the remaining columns to see which pairs might make sense to graph later.

In [3]:
# Pairwise correlations between the remaining columns, rounded to 3 decimals for readability
feature_corr = animalData.corr()
feature_corr.round(3)

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo)
Culmen Length (mm),1.0,-0.235,0.656,0.595,-0.06,0.189
Culmen Depth (mm),-0.235,1.0,-0.584,-0.472,0.606,0.43
Flipper Length (mm),0.656,-0.584,1.0,0.871,-0.508,-0.376
Body Mass (g),0.595,-0.472,0.871,1.0,-0.538,-0.375
Delta 15 N (o/oo),-0.06,0.606,-0.508,-0.538,1.0,0.571
Delta 13 C (o/oo),0.189,0.43,-0.376,-0.375,0.571,1.0
