In [1]:
!pip install mpld3



In [2]:
!pip install -U plotly

Requirement already up-to-date: plotly in /usr/local/lib/python3.6/dist-packages (4.14.1)


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from sklearn.cluster import AgglomerativeClustering
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from scipy.stats import sem
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from collections import Counter



import mpld3
from mpld3 import plugins

import warnings
warnings.filterwarnings('ignore')

#***************************************************************
#                   DEFINITION OF CONSTANTS
#***************************************************************

# Data path constant
DATA_PATH = "PisoFirme_AEJPol-20070024_household.dta"

# Dependent variables (dataset name)
DEP_VARS = ['S_shcementfloor',
            'S_cementfloorkit',
            'S_cementfloordin',
            'S_cementfloorbat',
            'S_cementfloorbed']

# Continuous control variables (dataset name)
# Order matters: the model definitions below slice this list ([:7] and [-1:]).
CONT_CTRL_VARS = ['S_rooms',
                  'S_HHpeople',
                  'S_headeduc',
                  'S_spouseeduc',
                  'S_headage',
                  'S_spouseage',
                  'S_washhands',
                  'S_cashtransfers']

# Dummy NaN for continuous control variables (dataset name)
CONT_CTRL_VARS_NAN = [i + '_nan' for i in CONT_CTRL_VARS]

# Demographic control variables (dataset name): S_dem1 ... S_dem8
DEMO_CTRL_VARS = ['S_dem' + str(i + 1) for i in range(8)]

# Categorical control variables (dataset name)
# Order matters: the model definitions below slice this list ([:6] and [-3:]).
DUMMY_CTRL_VARS = ['S_hasanimals',
                   'S_animalsinside',
                   'S_waterland',
                   'S_waterhouse',
                   'S_electricity',
                   'S_garbage',
                   'S_milkprogram',
                   'S_foodprogram',
                   'S_seguropopular']

# Dummy NaN for categorical control variables (dataset name)
DUMMY_CTRL_VARS_NAN = [i + '_nan' for i in DUMMY_CTRL_VARS]

# All control variables to generate NaN related dummies (dataset name) --> demographic variables not included
CTRL_VARS = CONT_CTRL_VARS + DUMMY_CTRL_VARS

# Variables for Model 1 linear regression (statsmodels name) --> single program dummy
MDL1_VARS = ['C(dpisofirme)']

# Variables for Model 2 linear regression (statsmodels name) --> add demographic and health control variables
# Takes the first 7 continuous controls (all but S_cashtransfers) and the first 6 dummy
# controls (all but the three program dummies), their NaN flags, and the S_dem variables.
MDL2_VARS = (MDL1_VARS
             + CONT_CTRL_VARS[:7]
             + ['C(' + i + ')' for i in DUMMY_CTRL_VARS[:6]]
             + ['C(' + i + ')' for i in CONT_CTRL_VARS_NAN[:7]]
             + ['C(' + i + ')' for i in DUMMY_CTRL_VARS_NAN[:6]]
             + DEMO_CTRL_VARS)

# Variables for Model 3 linear regression (statsmodels name) --> add social program control variables
# Adds the last continuous control (S_cashtransfers), the last 3 dummy controls
# (milk / food / seguro popular programs) and their NaN flags.
# Parentheses replace the backslash continuations: the original expression ended with a
# dangling backslash that was only valid because a blank line happened to follow it.
MDL3_VARS = (MDL2_VARS
             + CONT_CTRL_VARS[-1:]
             + ['C(' + i + ')' for i in DUMMY_CTRL_VARS[-3:]]
             + ['C(' + i + ')' for i in CONT_CTRL_VARS_NAN[-1:]]
             + ['C(' + i + ')' for i in DUMMY_CTRL_VARS_NAN[-3:]])

# Model variables without S_rooms for discussion part (statsmodels name)
MDL2_VARS_NOROOMS = [x for x in MDL2_VARS if x != 'S_rooms' and x != 'C(S_rooms_nan)']
MDL3_VARS_NOROOMS = [x for x in MDL3_VARS if x != 'S_rooms' and x != 'C(S_rooms_nan)']

# Names for table rows
ROWS = ['Share of rooms with cement floors',
        'Cement floor in kitchen',
        'Cement floor in dining room',
        'Cement floor in bathroom',
        'Cement floor in bedroom']

# Columns for the control group table
CG_COLUMNS = pd.MultiIndex.from_product([['Control Group'], ['Mean', 'Standard Deviation']])

# Program dummy name in statsmodels coefficients output
PROGRAMM_DUMMY = 'C(dpisofirme)[T.1.0]'

  import pandas.util.testing as tm


In [4]:
def cluster(model, X, **kwargs):
    """Fit a clustering model on `X` and summarise the resulting clusters.

    Args:
        model : object exposing ``fit_predict`` (e.g. an sklearn.cluster estimator)
            Clustering model to fit.
        X : pandas.DataFrame
            Observations to cluster; it is not modified.
        **kwargs : optional
            Extra keyword arguments forwarded to ``model.fit_predict``.
    Returns:
        (labels, centers) : tuple(array, pandas.DataFrame)
            ``labels`` is whatever ``model.fit_predict`` returns. ``centers`` holds
            the per-cluster mean of every column of `X`, indexed by cluster label
            (the index is named after the model's class).
    """
    labels = model.fit_predict(X, **kwargs)

    # Attach the labels under a temporary column named after the model class,
    # then average every original column within each cluster.
    label_column = model.__class__.__name__
    labelled = X.assign(**{label_column: labels})
    cluster_means = labelled.groupby(label_column, sort=True).mean()

    return (labels, cluster_means)

In [5]:
# Load dataset
data = pd.read_stata(DATA_PATH)
# Drop households whose geographical information is not complete (NaN)
data = data[data['idcluster'].notna()]
# Generate NaN indicator columns (`<var>_nan`, 1 = missing, 0 = present) for all control
# variables except S_dem. isna() builds exactly these indicators without first one-hot
# encoding every distinct value of the continuous controls, and always yields 0/1 integers.
nan_flags = data[CTRL_VARS].isna().astype(int).add_suffix('_nan')
data = pd.concat([data, nan_flags], axis=1)
# Impute all NaN values with 0
data = data.fillna(0)
data

Unnamed: 0,dpisofirme,idcluster,coord_x,coord_y,idmun,idmza,C_blocksdirtfloor,C_HHdirtfloor,C_child05,C_households,C_people,C_rooms,C_HHpersons,C_waterland,C_waterhouse,C_waterbath,C_gasheater,C_refrigerator,C_washing,C_telephone,C_vehicle,C_overcrowding,C_poverty,C_illiterate,C_headeduc,C_dropouts515,C_employment,C_earnincome,S_HHpeople,S_headage,S_spouseage,S_headeduc,S_spouseeduc,S_rooms,S_waterland,S_waterhouse,S_electricity,S_cementfloor2000,S_hasanimals,S_animalsinside,...,S_dem5,S_dem6,S_dem7,S_dem8,S_seguropopular,S_shcementfloor,S_cementfloorkit,S_cementfloordin,S_cementfloorbat,S_cementfloorbed,S_satisfloor,S_satishouse,S_satislife,S_cesds,S_pss,S_instcement,S_instsanita,S_restsanita,S_constceili,S_restowalls,S_improveany,S_logrent,S_logsell,S_rooms_nan,S_HHpeople_nan,S_headeduc_nan,S_spouseeduc_nan,S_headage_nan,S_spouseage_nan,S_washhands_nan,S_cashtransfers_nan,S_hasanimals_nan,S_animalsinside_nan,S_waterland_nan,S_waterhouse_nan,S_electricity_nan,S_garbage_nan,S_milkprogram_nan,S_foodprogram_nan,S_seguropopular_nan
0,0.0,70000537.0,-103.503670,25.583067,7.0,40,0.300000,0.036629,0.555554,819.0,3530.0,3.097682,4.310134,0.002443,0.151522,0.272279,0.004885,0.114775,0.247868,0.524304,0.644129,1.731482,0.062267,0.045177,7.578925,0.092800,1.710631,1.610496,3.0,44.0,43.0,6.0,6.0,3,1,1,1,0.40,1.0,0.0,...,0.333333,0.00,0.333333,0.000000,0.0,0.6,1.0,0.0,1.0,0.0,1.0,1.0,1.0,14.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,5.298317,9.903487,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,70000537.0,-103.503670,25.583067,7.0,40,0.300000,0.036629,0.555554,819.0,3530.0,3.097682,4.310134,0.002443,0.151522,0.272279,0.004885,0.114775,0.247868,0.524304,0.644129,1.731482,0.062267,0.045177,7.578925,0.092800,1.710631,1.610496,2.0,37.0,0.0,6.0,0.0,1,1,1,1,0.75,0.0,0.0,...,0.500000,0.00,0.500000,0.000000,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,17.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,5.298317,9.615806,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,70000537.0,-103.503670,25.583067,7.0,40,0.300000,0.036629,0.555554,819.0,3530.0,3.097682,4.310134,0.002443,0.151522,0.272279,0.004885,0.114775,0.247868,0.524304,0.644129,1.731482,0.062267,0.045177,7.578925,0.092800,1.710631,1.610496,2.0,18.0,0.0,12.0,0.0,4,1,1,1,1.00,0.0,0.0,...,0.500000,0.00,0.500000,0.000000,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,6.214608,10.819778,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0,70000537.0,-103.503670,25.583067,7.0,47,0.300000,0.036629,0.555554,819.0,3530.0,3.097682,4.310134,0.002443,0.151522,0.272279,0.004885,0.114775,0.247868,0.524304,0.644129,1.731482,0.062267,0.045177,7.578925,0.092800,1.710631,1.610496,4.0,43.0,30.0,9.0,9.0,3,1,1,1,1.00,0.0,0.0,...,0.000000,0.00,0.250000,0.000000,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,11.385092,11.918390,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0,70000537.0,-103.503670,25.583067,7.0,47,0.300000,0.036629,0.555554,819.0,3530.0,3.097682,4.310134,0.002443,0.151522,0.272279,0.004885,0.114775,0.247868,0.524304,0.644129,1.731482,0.062267,0.045177,7.578925,0.092800,1.710631,1.610496,5.0,46.0,45.0,3.0,6.0,3,1,1,1,1.00,1.0,0.0,...,0.200000,0.00,0.400000,0.000000,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.703783,10.819778,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,1.0,353150000.0,-103.399841,25.501871,35.0,40,0.538462,0.100774,0.759924,454.0,1866.0,3.264429,4.110127,0.011336,0.262172,0.333881,0.022024,0.113730,0.286599,0.660113,0.534601,1.655615,0.070493,0.035692,9.006986,0.116742,1.473560,1.418500,4.0,25.0,24.0,6.0,4.0,1,1,0,1,0.00,0.0,0.0,...,0.000000,0.25,0.250000,0.000000,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,19.0,20.0,1.0,0.0,0.0,1.0,0.0,1.0,5.298317,9.615806,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2751,1.0,353150000.0,-103.399841,25.501871,35.0,40,0.538462,0.100774,0.759924,454.0,1866.0,3.264429,4.110127,0.011336,0.262172,0.333881,0.022024,0.113730,0.286599,0.660113,0.534601,1.655615,0.070493,0.035692,9.006986,0.116742,1.473560,1.418500,6.0,66.0,0.0,2.0,0.0,3,1,1,1,0.50,0.0,0.0,...,0.000000,0.00,0.166667,0.166667,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,5.991465,10.819778,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2752,1.0,353150000.0,-103.399841,25.501871,35.0,35,0.538462,0.100774,0.759924,454.0,1866.0,3.264429,4.110127,0.011336,0.262172,0.333881,0.022024,0.113730,0.286599,0.660113,0.534601,1.655615,0.070493,0.035692,9.006986,0.116742,1.473560,1.418500,5.0,35.0,32.0,9.0,9.0,2,1,1,1,0.80,1.0,0.0,...,0.200000,0.20,0.200000,0.000000,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,19.0,1.0,0.0,0.0,0.0,0.0,0.0,5.991465,9.210340,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2753,1.0,353150000.0,-103.399841,25.501871,35.0,34,0.538462,0.100774,0.759924,454.0,1866.0,3.264429,4.110127,0.011336,0.262172,0.333881,0.022024,0.113730,0.286599,0.660113,0.534601,1.655615,0.070493,0.035692,9.006986,0.116742,1.473560,1.418500,5.0,35.0,33.0,12.0,6.0,2,1,1,1,0.25,1.0,0.0,...,0.000000,0.20,0.200000,0.000000,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,6.396930,11.918390,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Keep only the treated households (dpisofirme == 1). Copy the selection so that the
# column added below is written to an independent frame, not to a slice of `data`.
data_treatment = data[data['dpisofirme'] == 1].copy()
# 1 when the household's S_cementfloor2000 value is above the treated-group median, else 0
data_treatment['dummy_cement'] = (data_treatment['S_cementfloor2000'] > data_treatment['S_cementfloor2000'].median()).astype(int)
# reset_index() keeps the previous index as an 'index' column and renumbers the rows from 0
data_treatment = data_treatment.reset_index()

# Dataset with only the satisfaction, depression (S_cesds) and perceived stress (S_pss) variables.
# Copied because later cells add columns to it.
happy_dataset = data_treatment[['S_satisfloor', 'S_satishouse', 'S_satislife', 'S_cesds', 'S_pss']].copy()
happy_dataset

Unnamed: 0,S_satisfloor,S_satishouse,S_satislife,S_cesds,S_pss
0,0.0,0.0,0.0,40.0,36.0
1,1.0,1.0,1.0,15.0,14.0
2,1.0,1.0,0.0,20.0,27.0
3,1.0,1.0,1.0,14.0,10.0
4,1.0,1.0,1.0,28.0,18.0
...,...,...,...,...,...
1357,1.0,1.0,1.0,19.0,20.0
1358,1.0,1.0,1.0,9.0,11.0
1359,1.0,1.0,1.0,12.0,19.0
1360,1.0,1.0,1.0,6.0,10.0


In [7]:
# Two-cluster agglomerative clustering with Ward linkage.
# Ward linkage only works with Euclidean distances, which is also the default, so the
# `affinity` argument (deprecated and later removed in newer scikit-learn releases) is
# omitted; the fitted clusters are unchanged.
ac = AgglomerativeClustering(n_clusters=2, linkage='ward')
clabels_ac, cent_ac = cluster(ac, happy_dataset)

# assign() returns a new frame rather than writing into a slice of data_treatment
happy_dataset = happy_dataset.assign(Happy=clabels_ac)
happy_dataset

Unnamed: 0,S_satisfloor,S_satishouse,S_satislife,S_cesds,S_pss,Happy
0,0.0,0.0,0.0,40.0,36.0,1
1,1.0,1.0,1.0,15.0,14.0,0
2,1.0,1.0,0.0,20.0,27.0,1
3,1.0,1.0,1.0,14.0,10.0,0
4,1.0,1.0,1.0,28.0,18.0,1
...,...,...,...,...,...,...
1357,1.0,1.0,1.0,19.0,20.0,1
1358,1.0,1.0,1.0,9.0,11.0,0
1359,1.0,1.0,1.0,12.0,19.0,0
1360,1.0,1.0,1.0,6.0,10.0,0


In [8]:
# Human-readable cluster label: cluster 0 is reported as 'Happy', any other label as 'Unhappy'.
# Vectorised replacement for the previous row-by-row .loc loop (same resulting column).
happy_dataset['Feeling'] = np.where(happy_dataset['Happy'] == 0, 'Happy', 'Unhappy')

happy_dataset

Unnamed: 0,S_satisfloor,S_satishouse,S_satislife,S_cesds,S_pss,Happy,Feeling
0,0.0,0.0,0.0,40.0,36.0,1,Unhappy
1,1.0,1.0,1.0,15.0,14.0,0,Happy
2,1.0,1.0,0.0,20.0,27.0,1,Unhappy
3,1.0,1.0,1.0,14.0,10.0,0,Happy
4,1.0,1.0,1.0,28.0,18.0,1,Unhappy
...,...,...,...,...,...,...,...
1357,1.0,1.0,1.0,19.0,20.0,1,Unhappy
1358,1.0,1.0,1.0,9.0,11.0,0,Happy
1359,1.0,1.0,1.0,12.0,19.0,0,Happy
1360,1.0,1.0,1.0,6.0,10.0,0,Happy


In [9]:
# Number of households sharing each (S_pss, S_cesds) pair; the scatter plot uses it as marker size.
# A grouped size replaces the nested Python loop, which compared every row against every
# distinct pair and only worked with a 0..n-1 integer index.
# Cast to float to keep the dtype the loop-built column had.
happy_dataset['count'] = (happy_dataset.groupby(['S_pss', 'S_cesds'])['S_pss']
                                       .transform('size')
                                       .astype(float))

In [10]:
# Bubble scatter of perceived stress (x) against depression (y), coloured by cluster label
# and sized by the number of households at each point.
# Colours are pinned per label with color_discrete_map; a plain colour sequence is handed
# out in order of first appearance, so the colours would swap if the row order changed.
fig = px.scatter(happy_dataset, x="S_pss", y="S_cesds", color="Feeling", size="count",
                 color_discrete_map={'Happy': 'blue', 'Unhappy': 'red'}, opacity=0.5)
fig.update_layout(height=1000, width=1000,
                  title='Agglomerative clusters: Depression vs Perceived stress by feeling of happiness',
                  yaxis_title='Depression Scale',
                  xaxis_title='Perceived Stress Scale')
fig.show()

In [11]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory
fig.write_html("./scatter_agglomerative.html")

In [12]:
# Perceived stress scores (S_pss) per agglomerative cluster: label 0 is plotted as 'Happy',
# label 1 as 'Unhappy'
y0 = happy_dataset[happy_dataset['Happy'] == 0]['S_pss'].values
y1 = happy_dataset[happy_dataset['Happy'] == 1]['S_pss'].values

fig = go.Figure()

# Black mean line drawn inside each violin
meanline = go.violin.Meanline(visible=True, color='black')

fig.add_trace(go.Violin(y=y0, name='Happy',
                marker_color = 'blue', meanline=meanline))


fig.add_trace(go.Violin(y=y1, name = 'Unhappy',
                marker_color = 'red', meanline=meanline))

# Title typo fixed ('Ditribution' -> 'Distribution')
fig.update_layout(title='Distribution of Perceived Stress Scale', yaxis_title='Perceived Stress Scale', autosize=False, width=800, height=800)
fig.show()

In [13]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory
fig.write_html("./boxplot_pss_agglomerative.html")

In [14]:
# Depression scores (S_cesds) per agglomerative cluster: label 0 is plotted as 'Happy',
# label 1 as 'Unhappy'
y0 = happy_dataset.loc[happy_dataset['Happy'] == 0, 'S_cesds'].values
y1 = happy_dataset.loc[happy_dataset['Happy'] == 1, 'S_cesds'].values

# Black mean line drawn inside each violin
meanline = go.violin.Meanline(visible=True, color='black')

# One violin per cluster, added in the order Happy then Unhappy
fig = go.Figure()
for scores, label, colour in ((y0, 'Happy', 'blue'), (y1, 'Unhappy', 'red')):
  fig.add_trace(go.Violin(y=scores, name=label, marker_color=colour, meanline=meanline))

fig.update_layout(
    title='Distribution of Depression Scale',
    yaxis_title='Depression Scale',
    autosize=False,
    width=800,
    height=800,
)
fig.show()

In [15]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory
fig.write_html("./boxplot_cesds_agglomerative.html")

In [16]:
# Mean and standard error of each satisfaction variable per agglomerative cluster
# (label 0 is reported as 'Happy', label 1 as 'Unhappy').
# The rows are collected first and the frame is built once, which keeps the numeric
# columns numeric rather than the object dtype produced by filling an empty frame row by row.
satisfaction_vars = [('Floor Quality<br>Satisfaction', 'S_satisfloor'),
                     ('House Quality<br>Satisfaction', 'S_satishouse'),
                     ('Life Quality<br>Satisfaction', 'S_satislife')]

records = []
for feeling, label in [('Happy', 0), ('Unhappy', 1)]:
  group = happy_dataset[happy_dataset.Happy == label]
  for title, column in satisfaction_vars:
    records.append({'Satisfaction': title,
                    'Feeling': feeling,
                    'Mean': group[column].mean(),
                    'Standard Error': sem(group[column].values)})

satisfaction_dataset = pd.DataFrame(records, columns=['Satisfaction', 'Feeling', 'Mean', 'Standard Error'])


fig = px.scatter(satisfaction_dataset, x="Satisfaction", y="Mean", color="Feeling", error_y='Standard Error', title="Mean levels of satisfaction by feeling of happiness")
fig.update_xaxes(
        tickangle = 0,
        title_text = "")

# Single marker update: the earlier size=12 update was immediately overridden by this one
fig.update_traces(marker=dict(size=10, symbol='diamond'), selector=dict(mode='markers'))

fig.update_layout(autosize=False, width=800, height=600)
fig.show()

In [17]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory
fig.write_html("./mean_satisfaction_agglomerative.html")

# **Same analysis with KMeans — note that the cluster labels are inverted here: 1 = Happy, 0 = Unhappy**

In [18]:
from sklearn.cluster import KMeans

# Start again from the five clustering variables. Copied because columns are added to it
# below and data_treatment must stay untouched.
happy_dataset = data_treatment[['S_satisfloor', 'S_satishouse', 'S_satislife', 'S_cesds', 'S_pss']].copy()

# n_init is set explicitly because its default differs between scikit-learn releases
# (10 restarts in older versions, 'auto' in newer ones); pinning it together with
# random_state keeps the clustering reproducible across versions.
km = KMeans(n_clusters=2, n_init=10, random_state=47)
clabels_km, cent_km = cluster(km, happy_dataset)
happy_dataset['Happy'] = clabels_km
happy_dataset

Unnamed: 0,S_satisfloor,S_satishouse,S_satislife,S_cesds,S_pss,Happy
0,0.0,0.0,0.0,40.0,36.0,0
1,1.0,1.0,1.0,15.0,14.0,1
2,1.0,1.0,0.0,20.0,27.0,0
3,1.0,1.0,1.0,14.0,10.0,1
4,1.0,1.0,1.0,28.0,18.0,0
...,...,...,...,...,...,...
1357,1.0,1.0,1.0,19.0,20.0,0
1358,1.0,1.0,1.0,9.0,11.0,1
1359,1.0,1.0,1.0,12.0,19.0,1
1360,1.0,1.0,1.0,6.0,10.0,1


In [19]:
# Human-readable cluster label for the KMeans run: cluster 1 is reported as 'Happy', any
# other label as 'Unhappy' (the opposite mapping from the agglomerative run).
# Vectorised replacement for the previous row-by-row .loc loop (same resulting column).
happy_dataset['Feeling'] = np.where(happy_dataset['Happy'] == 1, 'Happy', 'Unhappy')

# Number of households sharing each (S_pss, S_cesds) pair; the scatter plot uses it as marker size.
# A grouped size replaces the nested Python loop, which compared every row against every
# distinct pair and only worked with a 0..n-1 integer index.
# Cast to float to keep the dtype the loop-built column had.
happy_dataset['count'] = (happy_dataset.groupby(['S_pss', 'S_cesds'])['S_pss']
                                       .transform('size')
                                       .astype(float))

happy_dataset

Unnamed: 0,S_satisfloor,S_satishouse,S_satislife,S_cesds,S_pss,Happy,Feeling,count
0,0.0,0.0,0.0,40.0,36.0,0,Unhappy,1.0
1,1.0,1.0,1.0,15.0,14.0,1,Happy,7.0
2,1.0,1.0,0.0,20.0,27.0,0,Unhappy,1.0
3,1.0,1.0,1.0,14.0,10.0,1,Happy,3.0
4,1.0,1.0,1.0,28.0,18.0,0,Unhappy,3.0
...,...,...,...,...,...,...,...,...
1357,1.0,1.0,1.0,19.0,20.0,0,Unhappy,8.0
1358,1.0,1.0,1.0,9.0,11.0,1,Happy,1.0
1359,1.0,1.0,1.0,12.0,19.0,1,Happy,3.0
1360,1.0,1.0,1.0,6.0,10.0,1,Happy,3.0


In [20]:
# Bubble scatter of perceived stress (x) against depression (y) for the KMeans clusters,
# coloured by cluster label and sized by the number of households at each point.
# Colours are pinned per label with color_discrete_map; a plain colour sequence is handed
# out in order of first appearance, so the colours would swap if the row order changed.
fig = px.scatter(happy_dataset, x="S_pss", y="S_cesds", color="Feeling", size="count",
                 color_discrete_map={'Happy': 'blue', 'Unhappy': 'red'}, opacity=0.5)
# Title corrected: this figure shows the KMeans clusters, not the agglomerative ones
fig.update_layout(height=1000, width=1000,
                  title='KMeans clusters: Depression vs Perceived stress by feeling of happiness',
                  yaxis_title='Depression Scale',
                  xaxis_title='Perceived Stress Scale')
fig.show()

In [21]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory
fig.write_html("./scatter_kmeans.html")

In [22]:
# Perceived stress scores (S_pss) per KMeans cluster: label 1 is plotted as 'Happy',
# label 0 as 'Unhappy'
y0 = happy_dataset[happy_dataset['Happy'] == 1]['S_pss'].values
y1 = happy_dataset[happy_dataset['Happy'] == 0]['S_pss'].values

fig = go.Figure()

# Black mean line drawn inside each violin
meanline = go.violin.Meanline(visible=True, color='black')

fig.add_trace(go.Violin(y=y0, name='Happy',
                marker_color = 'blue', meanline=meanline))

fig.add_trace(go.Violin(y=y1, name = 'Unhappy',
                marker_color = 'red', meanline=meanline))

# Title aligned with the other violin plots: the traces are violins, not box plots
fig.update_layout(title='Distribution of Perceived Stress Scale', yaxis_title='Perceived Stress Scale', autosize=False, width=800, height=800,)
fig.show()

In [23]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory
fig.write_html("./boxplot_pss_kmeans.html")

In [24]:
# Depression scores (S_cesds) per KMeans cluster: label 1 is plotted as 'Happy',
# label 0 as 'Unhappy'
y0 = happy_dataset.loc[happy_dataset['Happy'] == 1, 'S_cesds'].values
y1 = happy_dataset.loc[happy_dataset['Happy'] == 0, 'S_cesds'].values

# Black mean line drawn inside each violin
meanline = go.violin.Meanline(visible=True, color='black')

# One violin per cluster, added in the order Happy then Unhappy
fig = go.Figure()
for scores, label, colour in ((y0, 'Happy', 'blue'), (y1, 'Unhappy', 'red')):
  fig.add_trace(go.Violin(y=scores, name=label, marker_color=colour, meanline=meanline))

fig.update_layout(
    title='Distribution of Depression Scale',
    yaxis_title='Depression Scale',
    autosize=False,
    width=800,
    height=800,
)
fig.show()

In [25]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory.
# NOTE(review): the other exports are named `<plot>_<variable>_<method>.html`
# (e.g. "boxplot_cesds_agglomerative.html"); this name looks like it was meant to be
# "boxplot_cesds_kmeans.html". Confirm before renaming, since other pages may link to it.
fig.write_html("./boxplot_pss_cesds.html")

In [26]:
# Mean and standard error of each satisfaction variable per KMeans cluster
# (label 1 is reported as 'Happy', label 0 as 'Unhappy').
# The rows are collected first and the frame is built once, which keeps the numeric
# columns numeric rather than the object dtype produced by filling an empty frame row by row.
satisfaction_vars = [('Floor Quality<br>Satisfaction', 'S_satisfloor'),
                     ('House Quality<br>Satisfaction', 'S_satishouse'),
                     ('Life Quality<br>Satisfaction', 'S_satislife')]

records = []
for feeling, label in [('Happy', 1), ('Unhappy', 0)]:
  group = happy_dataset[happy_dataset.Happy == label]
  for title, column in satisfaction_vars:
    records.append({'Satisfaction': title,
                    'Feeling': feeling,
                    'Mean': group[column].mean(),
                    'Standard Error': sem(group[column].values)})

satisfaction_dataset = pd.DataFrame(records, columns=['Satisfaction', 'Feeling', 'Mean', 'Standard Error'])


fig = px.scatter(satisfaction_dataset, x="Satisfaction", y="Mean", color="Feeling", error_y='Standard Error', title="Mean levels of satisfaction by feeling of happiness")
fig.update_xaxes(
        tickangle = 0,
        title_text = "")

# Single marker update: the earlier size=12 update was immediately overridden by this one
fig.update_traces(marker=dict(size=10, symbol='diamond'), selector=dict(mode='markers'))

fig.update_layout(autosize=False, width=800, height=600)
fig.show()

In [27]:
# Save `fig` (as left by the previous cell) to an HTML file in the working directory
fig.write_html("./mean_satisfaction_kmeans.html")