## Unsupervised Learning techniques in Python

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering # hierarchical clustering

In [4]:
# Build the CSV location from its parts so the path separator matches the
# host OS, then load the USArrests table and preview its first rows.
arrests_path = os.path.join('data', 'USArrests.csv')
arrests_data = pd.read_csv(arrests_path)
arrests_data.head()

Unnamed: 0,state_name,murder,assault,urban_pop,rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


In [15]:
# Hierarchical (agglomerative) clustering of the states into three clusters,
# using complete linkage.
# The distance measure is left at its default, which is Euclidean. The
# `affinity` keyword that used to spell this out was deprecated in
# scikit-learn 1.2 and removed in 1.4 (it was renamed to `metric`), so
# omitting it keeps this cell working on both old and new releases.
arrests_hierarchical = AgglomerativeClustering(n_clusters=3,
                                               linkage='complete')

In [16]:
# Leave the state name out of the feature table so that the clustering is
# computed on the numeric measurements only.
arrests = arrests_data.drop(columns=['state_name'])
arrests.head()

Unnamed: 0,murder,assault,urban_pop,rape
0,13.2,236,58,21.2
1,10.0,263,48,44.5
2,8.1,294,80,31.0
3,8.8,190,50,19.5
4,9.0,276,91,40.6


In [17]:
# Keep an independent copy of the state-name column so it can be attached
# back to the results later for analysis.
state_name = arrests_data.loc[:, 'state_name'].copy()
state_name.head()

0       Alabama
1        Alaska
2       Arizona
3      Arkansas
4    California
Name: state_name, dtype: object

In [18]:
# Fit the hierarchical clustering on the raw measurements, i.e. without
# scaling the variables first.
arrests_hierarchical.fit(X=arrests)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='complete', memory=None, n_clusters=3,
                        pooling_func='deprecated')

In [22]:
# Inspect the cluster labels produced by the fitted clustering
# (one integer label per row of the data passed to fit).
arrests_hierarchical.labels_

array([0, 0, 0, 2, 0, 2, 1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 2, 0,
       1, 0, 2, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 2, 2, 1, 2, 0, 1, 2, 2, 1,
       1, 2, 2, 1, 1, 2])

In [24]:
# Put the cluster labels into a one-column frame named 'clusters'.
# The frame reuses the index of `arrests` (the data the model was fitted on):
# a column-wise pd.concat aligns rows by index label, so sharing the index
# guarantees each label stays paired with the row it was computed for even
# if the data does not carry a default 0..n-1 index.
clusters = pd.DataFrame({'clusters': arrests_hierarchical.labels_},
                        index=arrests.index)
clusters.head()

Unnamed: 0,clusters
0,0
1,0
2,0
3,2
4,0


In [27]:
# Attach the cluster labels to the full table as an extra column so that
# membership can be read next to each state name. The axis has to be given
# explicitly: the default (axis=0) would stack the frames as rows instead.
arrests_clusters = pd.concat([arrests_data, clusters], axis='columns')
arrests_clusters.head()

Unnamed: 0,state_name,murder,assault,urban_pop,rape,clusters
0,Alabama,13.2,236,58,21.2,0
1,Alaska,10.0,263,48,44.5,0
2,Arizona,8.1,294,80,31.0,0
3,Arkansas,8.8,190,50,19.5,2
4,California,9.0,276,91,40.6,0


In [30]:
from sklearn.preprocessing import StandardScaler