In [None]:
# Imports

import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import seaborn as sns
sns.set_theme()
import numpy as np
from IPython.display import display
import math

# Load the statistical libraries
from statsmodels.stats import diagnostic
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import pearsonr

#import machine learning
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Data Imports
# Grocery table: read from CSV and shown for a first look.
year_grocery = pd.read_csv("data/year_osward_grocery.csv")
display(year_grocery)

# The well-being workbook is read three times, once per sheet used below.
wellbeing_workbook = "data/london-ward-well-being-probability-scores.xls"

# "Data" sheet with default header handling.
wellbeing_data = pd.read_excel(wellbeing_workbook, sheet_name="Data")
display(wellbeing_data.head())

# "Scores" sheet: rows 0 and 1 together form a two-level column header.
wellbeing_scores = pd.read_excel(
    wellbeing_workbook,
    sheet_name="Scores",
    header=[0, 1],
)
display(wellbeing_scores.head())

# "Ranked" sheet: header taken from row index 3, only spreadsheet columns B:C.
wellbeing_final_scores = pd.read_excel(
    wellbeing_workbook,
    sheet_name="Ranked",
    header=[3],
    usecols="B:C",
)
display(wellbeing_final_scores)

In [None]:
# Keep the 80% of rows with the largest 'representativeness_norm'.
# NOTE: the result overwrites year_grocery, so running this cell a second
# time trims the already-trimmed frame again.
keep_fraction = 0.8
n_rows_kept = int(keep_fraction * len(year_grocery))
year_grocery = year_grocery.nlargest(n_rows_kept, 'representativeness_norm')

In [None]:
# Show the current contents of the grocery frame.
display(year_grocery)

In [None]:
# Keep only the columns whose second header level is 2013 or one of the
# identifier labels, then discard rows that are empty in every kept column.
# NOTE(review): the header is flattened at the end of this cell, so
# re-running it without reloading wellbeing_scores is not expected to work.
kept_second_level = [2013, "New ward code", "Ward name", "Borough"]
column_selector = pd.IndexSlice[:, kept_second_level]
wellbeing_scores = wellbeing_scores.loc[:, column_selector].dropna(how="all")

# Drop the second header level so each column is addressed by a single label.
wellbeing_scores = wellbeing_scores.droplevel(1, axis=1)
display(wellbeing_scores)

In [None]:
# Attach the ranked-sheet columns to each row by matching 'Ward name' against
# 'Ward' (pandas' default inner join), then drop the duplicated key column.
# NOTE(review): the join key is the ward name rather than a ward code; if two
# wards share a name the merge yields extra rows - confirm names are unique.
wellbeing_scores = wellbeing_scores.merge(
    wellbeing_final_scores,
    left_on='Ward name',
    right_on="Ward",
).drop(columns="Ward")
display(wellbeing_scores)

In [None]:
# Grocery side: keep only the columns named below (names that are absent
# from the frame are silently ignored by the membership mask).
list_column = [
    "area_id",
    "energy_tot",
    "energy_fat",
    "energy_saturate",
    "energy_sugar",
    "energy_protein",
    "energy_carb",
    "energy_fibre",
    "energy_alcohol",
    "h_nutrients_calories",
]
keep_mask = year_grocery.columns.isin(list_column)
year_grocery = year_grocery.loc[:, keep_mask]
display(year_grocery)

# Well-being side: keep columns whose label ends in "2013", "-13" or "Ward",
# or contains "New ward code" or "Borough".
wellbeing_data = wellbeing_data.filter(regex=r'(2013$|-13$|New ward code|Ward$|Borough)')
display(wellbeing_data)

In [None]:
# Remove rows in which every column is missing and keep the result.
# DataFrame.dropna returns a new frame; without the assignment the empty rows
# would still be present in wellbeing_data for the cells that follow.
wellbeing_data = wellbeing_data.dropna(how='all', axis=0)
wellbeing_data

In [None]:
# Number of distinct values in the "New ward code" column of wellbeing_data.
len(set(wellbeing_data["New ward code"].values))

In [None]:
# Number of distinct values in the "New ward code" column of wellbeing_scores.
len(set(wellbeing_scores["New ward code"].values))

In [None]:
# Number of distinct values in the "area_id" column of year_grocery.
len(set(year_grocery["area_id"].values))

In [None]:
# How many identifiers appear both as a well-being "New ward code" and as a
# grocery "area_id"; this is the number of keys the two tables share.
ward_codes = set(wellbeing_data["New ward code"].values)
grocery_area_ids = set(year_grocery["area_id"].values)
len(ward_codes & grocery_area_ids)

In [None]:
# Join grocery rows to well-being rows on the ward identifier (pandas'
# default inner join), then drop the now-redundant right-hand key column.
wellbeing_grocery = year_grocery.merge(
    wellbeing_data,
    left_on='area_id',
    right_on="New ward code",
).drop(columns="New ward code")
display(wellbeing_grocery)

In [None]:
# Comprehension of the data
# Collect the column labels of the merged frame as a plain list and show it.
wellbeing_grocery_columns = wellbeing_grocery.columns.tolist()
wellbeing_grocery_columns

In [None]:
# Per column: True if the merged frame has at least one missing value there.
wellbeing_grocery.isnull().any()

In [None]:
# Summary statistics of the merged frame.
wellbeing_grocery.describe()

In [None]:
# Columns analysed from here on: grocery nutrient variables first, then the
# 2013 well-being indicators (21 columns, one per panel of the 3 x 7 grid).
column_boxplot = [
    'energy_fat',
    'energy_saturate',
    'energy_sugar',
    'energy_protein',
    'energy_carb',
    'energy_fibre',
    'energy_tot',
    'h_nutrients_calories',
    "energy_alcohol",
    'Life Expectancy 2009-13',
    'Childhood Obesity 2013',
    'Incapacity Benefit rate - 2013',
    'Unemployment rate 2013',
    'Crime rate - 2013',
    'Deliberate Fires - 2013',
    'GCSE point scores - 2013',
    'Unauthorised Absence in All Schools (%) - 2013',
    '% dependent children in out-of-work households - 2013',
    'Public Transport Accessibility - 2013',
    'Homes with access to open space & nature, and % greenspace - 2013',
    'Subjective well-being average score, 2013',
]
# Work on a copy so the merged frame itself is left untouched.
wellbeing_grocery_analysis = wellbeing_grocery[column_boxplot].copy()
fig, ax = plt.subplots(3, 7, figsize=(16, 8), sharey=False)

# ax.flat walks the grid row by row, pairing each panel with one column.
for sbplt, column in zip(ax.flat, wellbeing_grocery_analysis.columns):
    sns.boxplot(data=wellbeing_grocery_analysis[column], ax=sbplt)
    # Axis labels are blanked; the column name is shown as the panel title.
    sbplt.set_xlabel('')
    sbplt.set_ylabel('')
    sbplt.set_title(column, wrap=True)

fig.tight_layout()
# Leave room above the grid for the figure-level title.
fig.subplots_adjust(top=0.9)

fig.suptitle('boxplot for each column', fontsize=18)

We observe that there are some outliers in the different variables. This is due to the differences between the wards.

In [None]:
# One histogram per analysed column, laid out on a 3 x 7 grid.
fig, ax = plt.subplots(3, 7, figsize=(16, 8), sharey=False)

# ax.flat walks the grid row by row; zip pairs each panel with one column and
# stops at the shorter of the two, so no panel/column index can go out of range.
for sbplt, column in zip(ax.flat, wellbeing_grocery_analysis.columns):
    sns.histplot(data=wellbeing_grocery_analysis[column], ax=sbplt)
    # Axis labels are blanked; the column name is shown as the panel title.
    sbplt.set_xlabel('')
    sbplt.set_ylabel('')
    sbplt.set_title(column, wrap=True)

fig.tight_layout()
# Leave room above the grid for the figure-level title.
fig.subplots_adjust(top=0.9)

# The title names the plot type actually drawn in this figure.
fig.suptitle('histogram for each column', fontsize=18)

In [None]:
# Pairwise Spearman rank correlation between all analysed columns.
correlation = wellbeing_grocery_analysis.corr(method="spearman")
display(correlation)

In [None]:
# Bar chart of the Spearman correlation of energy_fibre with every column.
plt.figure(figsize=(14, 3))
corr_ax = correlation["energy_fibre"].plot.bar(width=0.8, legend=None)
corr_ax.set_ylabel("Spearman R")
corr_ax.set_title("Correlation fibre")
plt.show()

In [None]:
# Bar chart of the Spearman correlation of energy_alcohol with every column.
plt.figure(figsize=(14, 3))
corr_ax = correlation["energy_alcohol"].plot.bar(width=0.8, legend=None)
corr_ax.set_ylabel("Spearman R")
corr_ax.set_title("Correlation alcohol")
plt.show()

In [None]:
# Bar chart of the Spearman correlation of h_nutrients_calories with every
# column.
plt.figure(figsize=(14, 3))
corr_ax = correlation["h_nutrients_calories"].plot.bar(width=0.8, legend=None)
corr_ax.set_ylabel("Spearman R")
corr_ax.set_title("Correlation entropy")
plt.show()

In [None]:
# Bar chart of the Spearman correlation of energy_saturate with every column.
plt.figure(figsize=(14, 3))
corr_ax = correlation["energy_saturate"].plot.bar(width=0.8, legend=None)
corr_ax.set_ylabel("Spearman R")
corr_ax.set_title("Correlation saturate")
plt.show()

In [None]:
# Bar chart of the Spearman correlation of energy_sugar with every column.
plt.figure(figsize=(14, 3))
corr_ax = correlation["energy_sugar"].plot.bar(width=0.8, legend=None)
corr_ax.set_ylabel("Spearman R")
# The title names the plotted variable (energy_sugar), so it is distinct from
# the energy_saturate chart.
corr_ax.set_title("Correlation sugar")
plt.show()

In [None]:
# Bar chart of the Spearman correlation of the subjective well-being score
# with every column.
plt.figure(figsize=(14, 3))
corr_ax = correlation["Subjective well-being average score, 2013"].plot.bar(
    width=0.8,
    legend=None,
)
corr_ax.set_ylabel("Spearman R")
corr_ax.set_title("Correlation wellbeing")
plt.show()

In [None]:
# Bar chart of the Spearman correlation of energy_tot with every column.
plt.figure(figsize=(14, 3))
corr_ax = correlation["energy_tot"].plot.bar(width=0.8, legend=None)
corr_ax.set_ylabel("Spearman R")
corr_ax.set_title("Correlation energy tot")
plt.show()

In [None]:
# Standardise every analysed column with StandardScaler.
# NOTE: the values are written back into wellbeing_grocery_analysis, so all
# later cells see the standardised values rather than the raw ones.
scaler = StandardScaler()
analysis_columns = wellbeing_grocery_analysis.columns
standardised_values = scaler.fit_transform(wellbeing_grocery_analysis[analysis_columns])
wellbeing_grocery_analysis[analysis_columns] = standardised_values
display(wellbeing_grocery_analysis)

In [None]:
# Ordinary least squares: h_nutrients_calories explained by five well-being
# indicators. Q("...") quotes column names that are not valid identifiers.
# The adjacent literals concatenate into a single formula string.
entropy_formula = (
    'h_nutrients_calories ~   Q("Life Expectancy 2009-13")'
    '+Q("Incapacity Benefit rate - 2013") '
    '+Q("Unemployment rate 2013")'
    '+ Q("Crime rate - 2013")'
    '+ Q("Childhood Obesity 2013")'
)
mod = smf.ols(formula=entropy_formula, data=wellbeing_grocery_analysis)
res = mod.fit()
print(res.summary())

In [None]:
# Ordinary least squares: energy_fibre explained by six well-being
# indicators. Q("...") quotes column names that are not valid identifiers.
# The adjacent literals concatenate into a single formula string.
fibre_formula = (
    'energy_fibre ~  Q("Life Expectancy 2009-13")'
    '+Q("Incapacity Benefit rate - 2013") '
    '+Q("Unemployment rate 2013")'
    '+ Q("Crime rate - 2013")'
    '+ Q("Childhood Obesity 2013")'
    '+ Q("Homes with access to open space & nature, and % greenspace - 2013")'
)
mod = smf.ols(formula=fibre_formula, data=wellbeing_grocery_analysis)
res = mod.fit()
print(res.summary())

In [None]:
# Well-being indicator columns fed to the PCA.
numerical_wellbeing_data_columns = [
    'Life Expectancy 2009-13',
    'Childhood Obesity 2013',
    'Incapacity Benefit rate - 2013',
    'Unemployment rate 2013',
    'Crime rate - 2013',
    'Deliberate Fires - 2013',
    'GCSE point scores - 2013',
    'Unauthorised Absence in All Schools (%) - 2013',
    '% dependent children in out-of-work households - 2013',
    'Public Transport Accessibility - 2013',
    'Homes with access to open space & nature, and % greenspace - 2013',
    'Subjective well-being average score, 2013',
]
# Rows with any missing indicator are dropped before fitting.
# NOTE(review): the columns are passed to PCA without rescaling - confirm
# that this is intended.
wellbeing_data_analysis = wellbeing_data[numerical_wellbeing_data_columns].dropna().copy()

# Fit a two-component PCA, then project the same rows onto the components.
wellbeing_data_pca = PCA(n_components=2)
wellbeing_data_pca.fit(wellbeing_data_analysis)
wellbeing_data_reduced_pca = wellbeing_data_pca.transform(wellbeing_data_analysis)

print("The features of the first sample are: %s" % wellbeing_data_reduced_pca[0])

In [None]:
# Colour each row green ("g") when its subjective well-being score is
# non-negative and red ("r") otherwise.
# NOTE(review): if this score can never be negative, every point is drawn
# green - confirm that 0 is the intended threshold.
score_is_non_negative = wellbeing_data_analysis["Subjective well-being average score, 2013"] >= 0
labels = score_is_non_negative.map({True: "g", False: "r"})
# Plot the data reduced in 2d space with PCA
plt.figure(figsize=(14, 3))
plt.scatter(
    wellbeing_data_reduced_pca[:, 0],
    wellbeing_data_reduced_pca[:, 1],
    c=labels,
    alpha=0.6,
)

In [None]:
# Print the column labels of wellbeing_scores as a plain list.
wellbeing_scores_columns = wellbeing_scores.columns.tolist()
print(wellbeing_scores_columns)

In [None]:
# Score columns fed to the PCA, including the overall 'Index Score 2013'.
numerical_wellbeing_scores_columns = [
    'Life Expectancy',
    'Childhood Obesity',
    'Incapacity Benefit rate',
    'Unemployment rate',
    'Crime rate - Index',
    'Deliberate Fires',
    'Average Capped GCSE and Equivalent Point Score Per Pupil',
    'Unauthorised Absence in All Schools (%)',
    'Dependent children in out-of-work families',
    'Public Transport Accessibility',
    'Homes with access to open space & nature, and % greenspace',
    'Subjective well-being average score',
    'Index Score 2013',
]
# Rows with any missing score are dropped before fitting.
wellbeing_scores_analysis = wellbeing_scores[numerical_wellbeing_scores_columns].dropna().copy()

# Fit a two-component PCA, then project the same rows onto the components.
wellbeing_scores_pca = PCA(n_components=2)
wellbeing_scores_pca.fit(wellbeing_scores_analysis)
wellbeing_scores_reduced_pca = wellbeing_scores_pca.transform(wellbeing_scores_analysis)

In [None]:
# Colour each row green ("g") when its 'Index Score 2013' is non-negative
# and red ("r") otherwise.
index_is_non_negative = wellbeing_scores_analysis['Index Score 2013'] >= 0
labels = index_is_non_negative.map({True: "g", False: "r"})
# Plot the data reduced in 2d space with PCA
plt.figure(figsize=(14, 3))
plt.scatter(
    wellbeing_scores_reduced_pca[:, 0],
    wellbeing_scores_reduced_pca[:, 1],
    c=labels,
    alpha=0.6,
)

In [None]:
# TODO: create a map (tutorial: https://sensitivecities.com/so-youd-like-to-make-a-map-using-python-EN.html#.X8d8vqpKhN0)