In [28]:
import pandas as pd 
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [29]:
# Read familyData.csv and build the feature matrix that is clustered below.
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("familyData.csv")

# Columns used as clustering inputs, split by the preprocessing they need.
categoricalFeatures = ["employmentStatus", "ownership", "participation"]

numericalFeatures = ['liquidWealth', 'laborIncome', 'costPerPerson', 'totalExpense',
                        'investmentAmount', 'annuityIRA', 'wealthWithoutHomeEquity', 
                        'wealthWithHomeEquity', 'HomeEquity','stockInvestmentRatio']

features = categoricalFeatures + numericalFeatures

# 'education' and 'industry' stay in df for profiling the clusters later;
# they are not part of `features`, so they do not enter df_features.
# Explicit copies: both frames get columns assigned to them afterwards
# (scaled values here, the cluster label in a later cell). Without .copy()
# those writes target a selection of another frame, which pandas flags with
# SettingWithCopyWarning -- silenced in this notebook by the warnings filter.
df = df[features + ['education', 'industry']].copy()
df_features = df[features].copy()

# Standardize the numerical columns so no single feature dominates the
# Euclidean distances k-means relies on.
scaler = StandardScaler()
df_features[numericalFeatures] = scaler.fit_transform(df_features[numericalFeatures])

# Expand the non-numeric columns into indicator (dummy) columns.
df_features = pd.get_dummies(df_features)

In [34]:
from sklearn.cluster import KMeans

# Fit a 3-cluster k-means model on the prepared feature matrix; the fixed
# random_state makes the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(df_features)

# Store each row's assigned cluster label next to the original columns.
df['Cluster'] = kmeans.labels_

In [38]:
# Share of each education level within every cluster (fractions sum to 1 per cluster).
df.groupby('Cluster')['education'].value_counts(normalize=True)

Cluster  education   
0        postGraduate    0.561391
         college         0.395915
         highSchool      0.039895
         middleSchool    0.002799
1        college         0.573964
         postGraduate    0.218753
         highSchool      0.177103
         middleSchool    0.030180
2        postGraduate    0.727194
         college         0.257315
         highSchool      0.015491
Name: education, dtype: float64

In [39]:
# Share of each industry value within every cluster (fractions sum to 1 per cluster).
df.groupby('Cluster')['industry'].value_counts(normalize=True)

Cluster  industry   
0        noneFinance    0.958080
         finance        0.041920
1        noneFinance    0.975994
         finance        0.024006
2        noneFinance    0.956971
         finance        0.043029
Name: industry, dtype: float64

In [37]:
# Per-cluster mean of every numerical feature.
df.groupby(by='Cluster')[numericalFeatures].agg('mean')

Unnamed: 0_level_0,liquidWealth,laborIncome,costPerPerson,totalExpense,investmentAmount,annuityIRA,wealthWithoutHomeEquity,wealthWithHomeEquity,HomeEquity,stockInvestmentRatio
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,28.501437,90.043246,22.636606,54.24215,28.380325,50.254854,189.178049,315.42335,126.245304,0.128628
1,4.576848,30.802703,11.755479,25.466513,0.607969,4.293115,18.568779,41.42369,22.854872,0.006636
2,126.553606,96.490095,25.056962,55.552341,384.652423,334.777087,1485.246489,1804.451003,319.204406,0.289482


In [40]:
# Per-cluster count of non-null values in every numerical feature.
df.groupby(by='Cluster')[numericalFeatures].agg('count')

Unnamed: 0_level_0,liquidWealth,laborIncome,costPerPerson,totalExpense,investmentAmount,annuityIRA,wealthWithoutHomeEquity,wealthWithHomeEquity,HomeEquity,stockInvestmentRatio
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,16794,16794,16794,16794,16794,16794,16794,16794,16794,16794
1,47780,47780,47780,47780,47780,47780,47780,47780,47780,47780
2,2324,2324,2324,2324,2324,2324,2324,2324,2324,2324


In [20]:
#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [21]:
# Expand the non-numeric columns of df into indicator (dummy) columns.
# NOTE(review): this rebinds `df`, so earlier cells that display df no longer
# describe the frame held after this point.
df = pd.get_dummies(data=df)

# Refit the shared scaler on df's numerical columns and overwrite them with
# the standardized values.
df[numericalFeatures] = scaler.fit_transform(X=df[numericalFeatures])

In [22]:
# plotX is a random sample of at most PLOT_SAMPLE_SIZE rows of df; the later
# PCA and plotting cells work on this sample instead of the full frame.
PLOT_SAMPLE_SIZE = 10000
# Fixed seed so the same rows (and therefore the same plots) come back on
# every run.
PLOT_SAMPLE_SEED = 0

# min(...) avoids the ValueError that sample() raises when asked for more
# rows than df contains.
# reset_index(drop=True) gives the sample a fresh 0..n-1 index while keeping
# column names and dtypes, so no round trip through a NumPy array is needed.
plotX = (
    df.sample(n=min(PLOT_SAMPLE_SIZE, len(df)), random_state=PLOT_SAMPLE_SEED)
      .reset_index(drop=True)
)

In [23]:
# Project the sampled rows onto principal components for the 2-D and 3-D
# cluster plots, then split the sample by cluster label.
PC_COLUMNS_2D = ["PC1_2d", "PC2_2d"]
PC_COLUMNS_3D = ["PC1_3d", "PC2_3d", "PC3_3d"]

# Make the cell safe to re-run: drop component columns appended by a previous
# run so they are neither fed back into PCA nor duplicated by the concat below.
plotX = plotX.drop(columns=PC_COLUMNS_2D + PC_COLUMNS_3D, errors="ignore")

# The cluster label is what gets visualised, not an input to the projection.
pca_input = plotX.drop(columns=["Cluster"])

#PCA with two principal components
pca_2d = PCA(n_components=2)

#PCA with three principal components
pca_3d = PCA(n_components=3)

# Two principal components for the 2-D visualization. Reusing plotX's index
# keeps the rows aligned in the concat regardless of how plotX is indexed.
PCs_2d = pd.DataFrame(pca_2d.fit_transform(pca_input),
                      columns=PC_COLUMNS_2D, index=plotX.index)

# Three principal components for the 3-D visualization.
PCs_3d = pd.DataFrame(pca_3d.fit_transform(pca_input),
                      columns=PC_COLUMNS_3D, index=plotX.index)

plotX = pd.concat([plotX, PCs_2d, PCs_3d], axis=1, join='inner')

# One frame per cluster label. KMeans was fit with n_clusters=3, so labels
# 0, 1 and 2 all exist; cluster 2 was previously left out.
cluster0 = plotX[plotX["Cluster"] == 0]
cluster1 = plotX[plotX["Cluster"] == 1]
cluster2 = plotX[plotX["Cluster"] == 2]

In [24]:
# Initialise plotly's offline notebook mode so that iplot() figures render
# inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather
# than embedding it in the notebook, which would need network access when
# viewing -- confirm against the plotly.offline documentation.
init_notebook_mode(connected=True)

In [25]:
def _cluster_trace_2d(cluster_id, color):
    """Build the 2-D scatter trace for one cluster of the sampled rows.

    Rows are taken straight from plotX by their 'Cluster' label and drawn at
    their first two principal-component coordinates.
    """
    members = plotX[plotX["Cluster"] == cluster_id]
    return go.Scatter(
        x = members["PC1_2d"],
        y = members["PC2_2d"],
        mode = "markers",
        name = "Cluster {}".format(cluster_id),
        marker = dict(color = color),
        text = None)


# One trace per k-means label. The model has three clusters, so cluster 2 is
# drawn as well (it was previously missing from the figure).
trace1 = _cluster_trace_2d(0, 'rgba(255, 128, 255, 0.8)')
trace2 = _cluster_trace_2d(1, 'rgba(255, 128, 2, 0.8)')
trace3 = _cluster_trace_2d(2, 'rgba(0, 255, 200, 0.8)')

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [26]:
def _cluster_trace_3d(cluster_id, color):
    """Build the 3-D scatter trace for one cluster of the sampled rows.

    Rows are taken straight from plotX by their 'Cluster' label and drawn at
    their three principal-component coordinates.
    """
    members = plotX[plotX["Cluster"] == cluster_id]
    return go.Scatter3d(
        x = members["PC1_3d"],
        y = members["PC2_3d"],
        z = members["PC3_3d"],
        mode = "markers",
        name = "Cluster {}".format(cluster_id),
        marker = dict(color = color),
        text = None)


# One trace per k-means label. The model has three clusters, so cluster 2 is
# drawn as well (it was previously missing from the figure).
trace1 = _cluster_trace_3d(0, 'rgba(255, 128, 255, 0.8)')
trace2 = _cluster_trace_3d(1, 'rgba(255, 128, 2, 0.8)')
trace3 = _cluster_trace_3d(2, 'rgba(0, 255, 200, 0.8)')

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Three Dimensions Using PCA"

# Axes of 3-D traces are configured under layout.scene; top-level
# xaxis/yaxis only apply to 2-D cartesian plots, so the axis titles were
# previously not applied. The z axis gets its title here as well.
layout = dict(title = title,
              scene = dict(
                  xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
                  yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
                  zaxis= dict(title= 'PC3',ticklen= 5,zeroline= False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)