## This page uses unsupervised learning models to uncover clusters in our data set

In [1]:
# imports
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import warnings

In [2]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
np.random.seed(42)

In [4]:
# modeling imports
from scipy import stats
from sklearn.metrics import mean_squared_error, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

#### Bring in transformed data set, which combines voting and census data for each precinct

In [16]:
df = pd.read_pickle('./data/combined18_extended.pkl')
df.shape

(1334, 37)

In [17]:
df.head()

Unnamed: 0,srprec_orig,subindex,pctsrprec_tract,employed_wgt,unemployed_wgt,empl_military_wgt,not_inlaborforce_wgt,working_women_wgt,parents_work_under6_wgt,parents_work_0617_wgt,...,totreg_r,dem,rep,dcl,male,female,hispdem,hisprep,hispdcl,hispoth
0,2001.0,3.0,100.0,2647.0,161.0,0.0,1372.0,1152.0,268.0,783.0,...,371.0,161.0,106.0,83.0,173.0,198.0,106.0,20.0,56.0,7.0
1,2002.0,0.0,100.0,2881.0,217.0,9.0,1525.0,1030.0,257.0,1082.0,...,109.0,61.0,8.0,39.0,46.0,63.0,52.0,0.0,23.0,0.0
2,2008.0,0.0,100.0,3948.0,278.0,0.0,1438.0,1656.0,561.0,677.0,...,1197.0,621.0,234.0,301.0,574.0,623.0,376.0,60.0,171.0,12.0
3,2009.0,3.0,100.0,3045.0,147.0,0.0,2459.0,1447.0,273.0,717.0,...,735.0,380.0,177.0,160.0,359.0,376.0,169.0,39.0,54.0,1.0
4,2011.0,0.0,100.0,2991.0,74.0,0.0,1401.0,1486.0,334.0,492.0,...,805.0,251.0,360.0,167.0,415.0,390.0,59.0,29.0,30.0,5.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1334 entries, 0 to 1333
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   srprec_orig               1334 non-null   float64
 1   subindex                  1334 non-null   float64
 2   pctsrprec_tract           1334 non-null   float64
 3   employed_wgt              1334 non-null   float64
 4   unemployed_wgt            1334 non-null   float64
 5   empl_military_wgt         1334 non-null   float64
 6   not_inlaborforce_wgt      1334 non-null   float64
 7   working_women_wgt         1334 non-null   float64
 8   parents_work_under6_wgt   1334 non-null   float64
 9   parents_work_0617_wgt     1334 non-null   float64
 10  occ_mgmt_sci_art_wgt      1334 non-null   float64
 11  occ_service_sector_wgt    1334 non-null   float64
 12  occ_sales_gen_office_wgt  1334 non-null   float64
 13  occ_constr_maintc_wgt     1334 non-null   float64
 14  occ_manu

In [19]:
df.columns.tolist()

['srprec_orig',
 'subindex',
 'pctsrprec_tract',
 'employed_wgt',
 'unemployed_wgt',
 'empl_military_wgt',
 'not_inlaborforce_wgt',
 'working_women_wgt',
 'parents_work_under6_wgt',
 'parents_work_0617_wgt',
 'occ_mgmt_sci_art_wgt',
 'occ_service_sector_wgt',
 'occ_sales_gen_office_wgt',
 'occ_constr_maintc_wgt',
 'occ_manuf_transpo_wgt',
 'hh_med_income_wgt',
 'hlthins_priv_wgt',
 'hlthins_public_wgt',
 'hlthins_none_wgt',
 'county',
 'srprec',
 'cddist',
 'TOTREG',
 'TOTVOTE',
 'CNGDEM01',
 'CNGREP01',
 'election',
 'totreg_r',
 'dem',
 'rep',
 'dcl',
 'male',
 'female',
 'hispdem',
 'hisprep',
 'hispdcl',
 'hispoth']

In [140]:
data = df[['employed_wgt',
#  'unemployed_wgt',
#  'empl_military_wgt',
#  'not_inlaborforce_wgt',
 'working_women_wgt',
 'parents_work_under6_wgt',
 'parents_work_0617_wgt',
#  'occ_mgmt_sci_art_wgt',
#  'occ_service_sector_wgt',
#  'occ_sales_gen_office_wgt',
#  'occ_constr_maintc_wgt',
#  'occ_manuf_transpo_wgt',
 'hh_med_income_wgt',
 'hlthins_priv_wgt',
 'hlthins_public_wgt',
 'hlthins_none_wgt'
#  'dcl',
#  'hispdem',
#  'hisprep',
#  'hispdcl',
#  'hispoth'
#  'TOTVOTE'
          ]]

In [141]:
ss = StandardScaler()
X_ss = ss.fit_transform(data)
X_ss

array([[-0.26063261, -0.34483357,  0.03681775, ..., -0.78857442,
         1.09310114,  1.56542936],
       [-0.11415821, -0.51638966, -0.01831119, ..., -0.88810001,
         1.88287619,  3.24744983],
       [ 0.55374001,  0.36388997,  1.5052523 , ..., -0.13082414,
         1.17882978,  2.35727284],
       ...,
       [-1.15012031, -1.25464335, -1.0657611 , ..., -0.69235411,
        -0.88830193, -0.82821517],
       [-1.15012031, -1.25464335, -1.0657611 , ..., -0.69235411,
        -0.88830193, -0.82821517],
       [-1.1670212 , -1.18433348, -1.24117137, ..., -0.74046427,
        -0.88615871, -0.96795225]])

In [154]:
dbscan = DBSCAN(min_samples=8, eps=0.2)
dbscan.fit(X_ss)       # default epsilon=0.5, min_samples=5

DBSCAN(eps=0.2, min_samples=8)

In [155]:
silhouette_score(X_ss, dbscan.labels_)

-0.3328343455122142

In [156]:
set(dbscan.labels_)   

{-1, 0, 1, 2, 3, 4, 5, 6}

In [157]:
data['cluster'] = dbscan.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cluster'] = dbscan.labels_


In [158]:
data.head()

Unnamed: 0,employed_wgt,working_women_wgt,parents_work_under6_wgt,parents_work_0617_wgt,hh_med_income_wgt,hlthins_priv_wgt,hlthins_public_wgt,hlthins_none_wgt,cluster
0,2647.0,1152.0,268.0,783.0,54480.0,2156.0,2846.0,1050.0,-1
1,2881.0,1030.0,257.0,1082.0,44136.0,1885.0,3583.0,1700.0,-1
2,3948.0,1656.0,561.0,677.0,66970.0,3947.0,2926.0,1356.0,-1
3,3045.0,1447.0,273.0,717.0,54967.0,3794.0,2708.0,822.0,-1
4,2991.0,1486.0,334.0,492.0,119595.0,4400.0,1523.0,259.0,-1


In [159]:
data['cluster'].value_counts()

-1    1273
 5      11
 6      10
 1       8
 4       8
 3       8
 2       8
 0       8
Name: cluster, dtype: int64

In [None]:
#kmeans
#agglomerative = example on NLP. tracks the process
#dendrogram