## Unsupervised Learning techniques in Python

In [68]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering # hierarchical clustering

### Arrests data

In [2]:
# build the path to the USArrests CSV (relative to the working directory)
# and read it into a DataFrame
arrests_path = os.path.join('data', 'USArrests.csv')
arrests_data = pd.read_csv(filepath_or_buffer=arrests_path)
arrests_data.head()

Unnamed: 0,state_name,murder,assault,urban_pop,rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


In [3]:
# performing hierarchical clustering on the states resulting in three clusters.
# Euclidean distance is AgglomerativeClustering's default, so it is not passed
# explicitly: the `affinity` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed to `metric`), so passing it breaks on current releases.
arrests_hierarchical = AgglomerativeClustering(n_clusters=3,
                                               linkage='complete')

In [4]:
# keep only the numeric columns: the state name is a label, not a feature,
# so it is dropped before clustering
arrests = arrests_data.drop(columns=['state_name'])
arrests.head()

Unnamed: 0,murder,assault,urban_pop,rape
0,13.2,236,58,21.2
1,10.0,263,48,44.5
2,8.1,294,80,31.0
3,8.8,190,50,19.5
4,9.0,276,91,40.6


In [5]:
# keep an independent copy of the state names so they can be joined back
# onto the clustering results later for analysis
state_name = arrests_data.loc[:, 'state_name'].copy()
state_name.head()

0       Alabama
1        Alaska
2       Arizona
3      Arkansas
4    California
Name: state_name, dtype: object

In [6]:
# running hierarchical clustering on the raw variables, i.e. without scaling them first
arrests_hierarchical.fit(arrests)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='complete', memory=None, n_clusters=3,
                        pooling_func='deprecated')

In [7]:
# looking at the cluster label assigned to each row by the clustering method
arrests_hierarchical.labels_

array([0, 0, 0, 2, 0, 2, 1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 2, 0,
       1, 0, 2, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 2, 2, 1, 2, 0, 1, 2, 2, 1,
       1, 2, 2, 1, 1, 2])

In [8]:
# wrap the cluster labels in a one-column DataFrame named 'clusters' so they
# can be joined to the state names and data to inspect cluster membership
clusters = pd.DataFrame({'clusters': arrests_hierarchical.labels_})
clusters.head()

Unnamed: 0,clusters
0,0
1,0
2,0
3,2
4,0


In [22]:
# concatenate along the column axis so the labels become a new column;
# the default (axis=0) would stack the frames as extra rows instead
arrests_clusters = pd.concat([arrests_data, clusters], axis='columns')
arrests_clusters.head()

Unnamed: 0,state_name,murder,assault,urban_pop,rape,clusters
0,Alabama,13.2,236,58,21.2,0
1,Alaska,10.0,263,48,44.5,0
2,Arizona,8.1,294,80,31.0,0
3,Arkansas,8.8,190,50,19.5,2
4,California,9.0,276,91,40.6,0


Now I will take a look at the clusters to see if they make sense with the data that was not scaled beforehand. Afterwards I will check if the clusters make more sense when the data is scaled.

Let's look at the first cluster:

In [23]:
# states whose label from the unscaled clustering is 0
arrests_clusters.loc[arrests_clusters['clusters'].eq(0)]

Unnamed: 0,state_name,murder,assault,urban_pop,rape,clusters
0,Alabama,13.2,236,58,21.2,0
1,Alaska,10.0,263,48,44.5,0
2,Arizona,8.1,294,80,31.0,0
4,California,9.0,276,91,40.6,0
7,Delaware,5.9,238,72,15.8,0
8,Florida,15.4,335,80,31.9,0
12,Illinois,10.4,249,83,24.0,0
17,Louisiana,15.4,249,66,22.2,0
19,Maryland,11.3,300,67,27.8,0
21,Michigan,12.1,255,74,35.1,0


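As can be seen here, the first cluster does not make sense: California and Alabama should not be in the same cluster. Because the variables were not scaled, `assault` (whose values run into the hundreds) dominates the Euclidean distance, so the states are grouped almost entirely by their assault rate. Some states have a higher development index than others, and states with a similar index are the ones that should be in the same cluster. Looking at the second cluster will probably show similar results.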

In [24]:
# states whose label from the unscaled clustering is 1
arrests_clusters.loc[arrests_clusters['clusters'].eq(1)]

Unnamed: 0,state_name,murder,assault,urban_pop,rape,clusters
6,Connecticut,3.3,110,77,11.1,1
10,Hawaii,5.3,46,83,20.2,1
11,Idaho,2.6,120,54,14.2,1
13,Indiana,7.2,113,65,21.0,1
14,Iowa,2.2,56,57,11.3,1
15,Kansas,6.0,115,66,18.0,1
16,Kentucky,9.7,109,52,16.3,1
18,Maine,2.1,83,51,7.8,1
22,Minnesota,2.7,72,66,14.9,1
25,Montana,6.0,109,53,16.4,1


Similar to the first cluster, there are states that do not belong together.

Now we will re-run the clustering method using a scaled data set to see if the cluster membership is different.

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [26]:
# reminder of the unscaled numeric columns that are about to be standardized
arrests.head()

Unnamed: 0,murder,assault,urban_pop,rape
0,13.2,236,58,21.2
1,10.0,263,48,44.5
2,8.1,294,80,31.0
3,8.8,190,50,19.5
4,9.0,276,91,40.6


In [36]:
# single-step pipeline that standardizes every column with StandardScaler
standardize_data = Pipeline(steps=[('scaler', StandardScaler())])

In [37]:
# remember the original column names; the scaled output below comes back
# with positional labels and needs these re-attached
col_names = arrests.columns.tolist()
col_names

['murder', 'assault', 'urban_pop', 'rape']

In [41]:
# fit the scaler on the numeric columns and wrap the transformed values in a
# DataFrame (the columns are labelled 0..3 at this point, as shown below)
arrests_standard = pd.DataFrame(
    data=standardize_data.fit_transform(arrests)
)
arrests_standard.head()

Unnamed: 0,0,1,2,3
0,1.255179,0.790787,-0.526195,-0.003451
1,0.513019,1.11806,-1.224067,2.509424
2,0.072361,1.493817,1.009122,1.053466
3,0.234708,0.233212,-1.084492,-0.186794
4,0.281093,1.275635,1.776781,2.088814


In [43]:
# Re-attach the original column names to the standardized frame by pairing its
# current column labels with col_names. The result is assigned back because
# rename() returns a new frame. Previously `inplace=True` sat inside dict()
# rather than rename(), so the mapping gained a stray 'inplace' key and
# arrests_standard itself silently kept its 0..3 labels.
arrests_standard = arrests_standard.rename(
    columns=dict(zip(arrests_standard.columns, col_names))
)
arrests_standard.head()

Unnamed: 0,murder,assault,urban_pop,rape
0,1.255179,0.790787,-0.526195,-0.003451
1,0.513019,1.11806,-1.224067,2.509424
2,0.072361,1.493817,1.009122,1.053466
3,0.234708,0.233212,-1.084492,-0.186794
4,0.281093,1.275635,1.776781,2.088814


Now running hierarchical clustering using the standardized data.

In [45]:
# Complete-linkage hierarchical clustering into three clusters, to be fitted on
# the standardized data. Euclidean distance is the estimator's default, so it
# is not passed explicitly: the `affinity` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed to `metric`), so passing it
# breaks on current releases.
arrests_hc = AgglomerativeClustering(n_clusters=3,
                                     linkage='complete')

In [49]:
# fit on the standardized data and show the resulting cluster labels
# (fit() returns the estimator itself, so the attribute can be read directly)
arrests_hc.fit(arrests_standard).labels_

array([1, 1, 2, 0, 2, 2, 0, 0, 2, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2,
       0, 1, 0, 0, 0, 2, 0, 0, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 0,
       0, 0, 0, 0, 0, 0])

In [51]:
# one-column DataFrame holding the labels from the standardized clustering
standard_labels = pd.DataFrame({'cluster': arrests_hc.labels_})
standard_labels.head()

Unnamed: 0,cluster
0,1
1,1
2,2
3,0
4,2


In [56]:
# put the state names, the original (unscaled) values and the new labels
# side by side so the clusters can be read in the original units
arrests_stand = pd.concat(
    [state_name, arrests, standard_labels],
    axis='columns',
)
arrests_stand.head()

Unnamed: 0,state_name,murder,assault,urban_pop,rape,cluster
0,Alabama,13.2,236,58,21.2,1
1,Alaska,10.0,263,48,44.5,1
2,Arizona,8.1,294,80,31.0,2
3,Arkansas,8.8,190,50,19.5,0
4,California,9.0,276,91,40.6,2


Let's look at the clusters to see if they make more sense than before.

In [66]:
# states whose label from the standardized clustering is 2
arrests_stand.loc[arrests_stand['cluster'].eq(2)]

Unnamed: 0,state_name,murder,assault,urban_pop,rape,cluster
2,Arizona,8.1,294,80,31.0,2
4,California,9.0,276,91,40.6,2
5,Colorado,7.9,204,78,38.7,2
8,Florida,15.4,335,80,31.9,2
12,Illinois,10.4,249,83,24.0,2
19,Maryland,11.3,300,67,27.8,2
21,Michigan,12.1,255,74,35.1,2
27,Nevada,12.2,252,81,46.0,2
30,New Mexico,11.4,285,70,32.1,2
31,New York,11.1,254,86,26.1,2


Looking at the third cluster (label 2), I can see that it makes more sense than it did before: California, Illinois, New York and similar states now fall together. However, as can be seen below, the first cluster (label 0) is large compared to the other two. Perhaps it would be beneficial to increase the number of clusters to four or five to even out the cluster memberships.

In [67]:
# states whose label from the standardized clustering is 0
arrests_stand.loc[arrests_stand['cluster'].eq(0)]

Unnamed: 0,state_name,murder,assault,urban_pop,rape,cluster
3,Arkansas,8.8,190,50,19.5,0
6,Connecticut,3.3,110,77,11.1,0
7,Delaware,5.9,238,72,15.8,0
10,Hawaii,5.3,46,83,20.2,0
11,Idaho,2.6,120,54,14.2,0
13,Indiana,7.2,113,65,21.0,0
14,Iowa,2.2,56,57,11.3,0
15,Kansas,6.0,115,66,18.0,0
16,Kentucky,9.7,109,52,16.3,0
18,Maine,2.1,83,51,7.8,0


### College data