In [53]:
import pandas as pd 
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [54]:
from sklearn.preprocessing import StandardScaler

# Read in the data
df = pd.read_csv("familyData.csv")

# Columns used as clustering inputs, split by how they are preprocessed.
categoricalFeatures = ["employmentStatus", "ownership", "participation"]

numericalFeatures = ['liquidWealth', 'laborIncome', 'costPerPerson', 'totalExpense',
                        'investmentAmount', 'annuityIRA', 'wealthWithoutHomeEquity', 
                        'wealthWithHomeEquity', 'HomeEquity','stockInvestmentRatio']

features = categoricalFeatures + numericalFeatures

# Keep the clustering inputs plus 'education' and 'industry', which are carried
# along in df but are not part of `features`.
# .copy() makes df an independent frame: a later cell adds a column to it, and
# assigning into a bare column slice is the SettingWithCopy pattern.
df = df[features + ['education', 'industry']].copy()

# Separate, independent frame holding only the clustering inputs, so the
# scaling below never touches (or warns about) df itself.
df_features = df[features].copy()

# Standardize the numerical columns (zero mean, unit variance per column)
scaler = StandardScaler()
df_features[numericalFeatures] = scaler.fit_transform(df_features[numericalFeatures])

# One-hot encode the categorical columns
df_features = pd.get_dummies(df_features)

In [55]:
from sklearn.cluster import KMeans

# Fit a two-cluster k-means model on the prepared feature matrix; the fixed
# random_state keeps the cluster assignment repeatable between runs.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(df_features)

# Attach each row's cluster label to df.
df['Cluster'] = kmeans.labels_

In [56]:
# Count of each education level within each cluster
df.groupby('Cluster')['education'].value_counts()

Cluster  education   
0        college         33056
         postGraduate    17411
         highSchool       9037
         middleSchool     1485
1        postGraduate     4159
         college          1615
         highSchool        131
         middleSchool        4
Name: education, dtype: int64

In [57]:
# Count of each industry value within each cluster
df.groupby('Cluster')['industry'].value_counts()

Cluster  industry   
0        noneFinance    59330
         finance         1659
1        noneFinance     5617
         finance          292
Name: industry, dtype: int64

In [58]:
# Per-cluster mean of every numerical feature
df.groupby('Cluster')[numericalFeatures].agg('mean')

Unnamed: 0_level_0,liquidWealth,laborIncome,costPerPerson,totalExpense,investmentAmount,annuityIRA,wealthWithoutHomeEquity,wealthWithHomeEquity,HomeEquity,stockInvestmentRatio
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,7.977889,42.301963,13.94334,31.28496,3.216431,9.93147,40.009388,78.920582,38.911161,0.026464
1,85.442892,106.317133,25.330508,59.028242,203.661063,206.704446,859.003671,1126.535864,267.532181,0.259941


In [59]:
#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [60]:
# One-hot encode the categorical columns of df. This rebinds df, so the
# original string-valued columns are no longer available afterwards.
df = pd.get_dummies(df)

# Standardize the numerical columns of df; note this refits the shared
# `scaler` object on df's values.
numeric_scaled = scaler.fit_transform(df[numericalFeatures])
df[numericalFeatures] = numeric_scaled

In [61]:
#plotX is a DataFrame containing 10000 rows sampled randomly from df.
#random_state pins the sample so the plots below are reproducible.
#reset_index(drop=True) gives the sample a fresh 0..n-1 index (so it lines up
#row-for-row with the PCA output it is concatenated with) while keeping the
#column names and dtypes, which a round-trip through np.array would discard.
plotX = df.sample(10000, random_state=0).reset_index(drop=True)

In [62]:
#Names of the principal-component columns this cell appends to plotX
pc_cols_2d = ["PC1_2d", "PC2_2d"]
pc_cols_3d = ["PC1_3d", "PC2_3d", "PC3_3d"]

#Remove PC columns left over from a previous run of this cell, so re-running
#it neither feeds old components back into PCA nor duplicates column names
plotX = plotX.drop(columns=pc_cols_2d + pc_cols_3d, errors="ignore")

#Input to PCA: every column except the cluster label
pca_features = plotX.drop(columns=["Cluster"])

#PCA with two principal components (seeded so any randomized solver is repeatable)
pca_2d = PCA(n_components=2, random_state=0)

#PCA with three principal components
pca_3d = PCA(n_components=3, random_state=0)

#This DataFrame contains the two principal components that will be used
#for the 2-D visualization
PCs_2d = pd.DataFrame(pca_2d.fit_transform(pca_features), columns=pc_cols_2d)

#And this DataFrame contains three principal components that will aid us
#in visualizing our clusters in 3-D
PCs_3d = pd.DataFrame(pca_3d.fit_transform(pca_features), columns=pc_cols_3d)

#Append the components to the sampled rows; rows are matched on the index
plotX = pd.concat([plotX,PCs_2d,PCs_3d], axis=1, join='inner')

#Split the sampled rows by cluster label for plotting
cluster0 = plotX[plotX["Cluster"] == 0]
cluster1 = plotX[plotX["Cluster"] == 1]

In [63]:
#display plotly plots properly
#Initialise plotly's offline notebook mode (imported from plotly.offline) before
#the iplot calls in the plotting cells.
#NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
#embedding it in the notebook - confirm against the plotly documentation.
init_notebook_mode(connected=True)

In [64]:
#2-D scatter of the sampled rows on the first two principal components,
#drawn as one marker trace per cluster.

#Markers for the rows labelled cluster 0
trace1 = go.Scatter(
    x=cluster0["PC1_2d"],
    y=cluster0["PC2_2d"],
    mode="markers",
    name="Cluster 0",
    marker={"color": 'rgba(255, 128, 255, 0.8)'},
    text=None,
)

#Markers for the rows labelled cluster 1
trace2 = go.Scatter(
    x=cluster1["PC1_2d"],
    y=cluster1["PC2_2d"],
    mode="markers",
    name="Cluster 1",
    marker={"color": 'rgba(255, 128, 2, 0.8)'},
    text=None,
)

data = [trace1, trace2]

title = "Visualizing Clusters in Two Dimensions Using PCA"

#Figure title plus labelled axes with short ticks and no zero line
layout = {
    "title": title,
    "xaxis": {"title": 'PC1', "ticklen": 5, "zeroline": False},
    "yaxis": {"title": 'PC2', "ticklen": 5, "zeroline": False},
}

fig = {"data": data, "layout": layout}

iplot(fig)

In [65]:
#3-D scatter of the sampled rows on the first three principal components,
#drawn as one marker trace per cluster.

#trace1 is for 'Cluster 0'
trace1 = go.Scatter3d(
                    x = cluster0["PC1_3d"],
                    y = cluster0["PC2_3d"],
                    z = cluster0["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter3d(
                    x = cluster1["PC1_3d"],
                    y = cluster1["PC2_3d"],
                    z = cluster1["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)


data = [trace1, trace2]

title = "Visualizing Clusters in Three Dimensions Using PCA"

#3-D traces take their axes from layout.scene, not from the top-level
#layout.xaxis / layout.yaxis (those only affect 2-D cartesian plots), so the
#axis settings must be nested under `scene`; the z axis is labelled as well.
layout = dict(title = title,
              scene = dict(
                  xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
                  yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
                  zaxis= dict(title= 'PC3',ticklen= 5,zeroline= False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)