# Well-being - Nutrition

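In this notebook, we combine area-level grocery purchase data with the London ward well-being scores to explore how the nutritional content of purchases (summarised as *nutripoints*) relates to well-being indicators. We use correlation analysis, OLS regression, PCA, k-means clustering and predictive models (linear regression and gradient boosting).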

## Import modules

In [None]:
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE

from utils import calculate_nutripoints

sns.set_theme('notebook')
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Import data

In [None]:
# Grocery data (the file name suggests yearly ward-level aggregates — TODO confirm).
year_grocery = pd.read_csv("data/year_osward_grocery.csv")
display(year_grocery.head())

# The well-being workbook is read three times, once per sheet of interest.
WELLBEING_XLS = "data/london-ward-well-being-probability-scores.xls"

# "Data" sheet, default single-row header.
wellbeing_data = pd.read_excel(WELLBEING_XLS, sheet_name="Data")
display(wellbeing_data.head())

# "Scores" sheet: the first two rows form a two-level column header.
wellbeing_scores = pd.read_excel(
    WELLBEING_XLS, sheet_name="Scores", header=[0, 1])
display(wellbeing_scores.head())

# "Ranked" sheet: header taken from row index 3, only spreadsheet columns B:C.
wellbeing_final_scores = pd.read_excel(
    WELLBEING_XLS, sheet_name="Ranked", header=[3], usecols="B:C")
display(wellbeing_final_scores.head())

## Preprocessing

### Filter data

We filter out the data points with the lowest representativeness, keeping only the 80% of areas with the highest `representativeness_norm`.

In [None]:
# Fraction of rows to keep, ranked by 'representativeness_norm'.
PERCENTAGE_SPLIT_REPRESENTATIVENESS = 0.8
N = len(year_grocery)

# Keep the rows with the highest representativeness.
# NOTE: year_grocery is overwritten and N is recomputed from it, so running
# this cell a second time shrinks the data again.
n_rows_kept = int(PERCENTAGE_SPLIT_REPRESENTATIVENESS * N)
year_grocery = year_grocery.nlargest(
    n=n_rows_kept, columns='representativeness_norm')

### Compute nutripoints

We add a column with the computed nutripoints, assessing the quality of the average product of each area

In [None]:
# Row-wise score computed by the project helper from utils.
year_grocery["nutripoints"] = year_grocery.apply(
    func=calculate_nutripoints, axis=1)

# Summary statistics of the new column (shown as the cell output).
year_grocery["nutripoints"].describe()

In [None]:
# Preview the first five rows, now including the nutripoints column.
display(year_grocery.head(5))

### Filter wellbeing data

We keep only the columns for the last year of the dataset, 2013, together with the ward identifier columns.

In [None]:
# Select on the second header level: the 2013 columns plus the ward
# identifiers. Rows that are entirely empty are dropped afterwards.
second_level_kept = [2013, "New ward code", "Ward name", "Borough"]
column_selector = (slice(None), second_level_kept)

wellbeing_scores = (
    wellbeing_scores
    .loc[:, column_selector]
    .dropna(how="all")
    .droplevel(1, axis=1)  # flatten to a single-level header
)
display(wellbeing_scores.head())

### Merge datasets

In [None]:
# Attach the columns read from the "Ranked" sheet to the per-indicator scores.
# NOTE(review): the join key is the ward name only; if a name occurs more than
# once, the matching rows are combined pairwise — confirm names are unique.
wellbeing_scores = wellbeing_scores.merge(
    wellbeing_final_scores, left_on='Ward name', right_on="Ward")
# 'Ward' duplicates 'Ward name' after the join.
wellbeing_scores = wellbeing_scores.drop(columns="Ward")
display(wellbeing_scores.head())

In [None]:
# Grocery columns used in the analysis: area key, energy breakdown,
# 'h_nutrients_calories' and the computed nutripoints.
list_column = [
    "area_id",
    "energy_tot", "energy_fat", "energy_saturate", "energy_sugar",
    "energy_protein", "energy_carb", "energy_fibre", "energy_alcohol",
    "h_nutrients_calories",
    "nutripoints",
]
is_kept_column = year_grocery.columns.isin(list_column)
year_grocery = year_grocery.loc[:, is_kept_column]
display(year_grocery.head())

# Keep the columns whose name ends in "2013" or "-13", plus the identifier
# columns matched by the remaining alternatives of the pattern.
wellbeing_data = wellbeing_data.filter(
    regex=r'(2013$|-13$|New ward code|Ward$|Borough)')
display(wellbeing_data.head())

In [None]:
# Drop the rows that contain no data at all. dropna returns a new frame, so
# the result has to be assigned for the filtering to take effect.
wellbeing_data = wellbeing_data.dropna(how='all', axis=0)
# Show a preview rather than the full frame.
display(wellbeing_data.head())

In [None]:
# Join grocery rows to well-being scores: 'area_id' on the grocery side is
# matched against 'New ward code' on the well-being side.
wellbeing_grocery = year_grocery.merge(
    wellbeing_scores, left_on='area_id', right_on="New ward code")
# The ward code is redundant with 'area_id' after the join.
wellbeing_grocery = wellbeing_grocery.drop(columns="New ward code")
display(wellbeing_grocery.head())

In [None]:
# Column names of the merged frame as a plain list.
wellbeing_grocery_columns = list(wellbeing_grocery.columns)
# Per column: does it contain at least one missing value?
wellbeing_grocery.isna().any()

In [None]:
# Summary statistics of the merged grocery / well-being frame.
wellbeing_grocery.describe()

In [None]:
# Well-being columns used in the analysis; the last entry is the column
# brought in from the "Ranked" sheet.
COLUMNS_SCORES = [
    'Life Expectancy',
    'Childhood Obesity',
    'Incapacity Benefit rate',
    'Unemployment rate',
    'Crime rate - Index',
    'Deliberate Fires',
    'Average Capped GCSE and Equivalent Point Score Per Pupil',
    'Unauthorised Absence in All Schools (%)',
    'Dependent children in out-of-work families',
    'Public Transport Accessibility',
    'Homes with access to open space & nature, and % greenspace',
    'Subjective well-being average score',
    'Index Score 2013',
]

# Grocery columns used in the analysis (everything kept except 'area_id').
COLUMNS_GROCERY = [
    'energy_fat',
    'energy_saturate',
    'energy_sugar',
    'energy_protein',
    'energy_carb',
    'energy_fibre',
    'energy_alcohol',
    'energy_tot',
    'h_nutrients_calories',
    'nutripoints',
]

# Grocery columns first, then the well-being columns.
COLUMNS = [*COLUMNS_GROCERY, *COLUMNS_SCORES]

# Independent copy restricted to the analysed columns, so that later
# transformations do not touch wellbeing_grocery.
wellbeing_grocery_analysis = wellbeing_grocery.loc[:, COLUMNS].copy()

In [None]:
# Standardise every analysed column in place with StandardScaler.
scaler = StandardScaler()
analysis_columns = wellbeing_grocery_analysis.columns
wellbeing_grocery_analysis[analysis_columns] = scaler.fit_transform(
    wellbeing_grocery_analysis[analysis_columns])
wellbeing_grocery_analysis.describe()

## Exploratory Data Analysis

In [None]:
# One box plot per analysed column, laid out row by row on a 4 x 6 grid.
fig, ax = plt.subplots(4, 6, figsize=(16, 8), sharey=False)

for i in range(len(COLUMNS)):
    grid_row, grid_col = divmod(i, 6)
    sbplt = ax[grid_row, grid_col]
    column_values = wellbeing_grocery_analysis.iloc[:, i]

    sns.boxplot(data=column_values, ax=sbplt)
    sbplt.set(xlabel='', ylabel='')
    # TODO: make names of columns shorter to fit plot
    sbplt.set_title(column_values.name, loc='center', wrap=True)

# Leave head-room above the panels for the figure title.
fig.tight_layout()
fig.subplots_adjust(top=0.9)

fig.suptitle('boxplot for each column', fontsize=18)

We observe that there are some outliers for the different variables. This is due to the differences between the wards.

In [None]:
# One histogram per analysed column, laid out row by row on a 4 x 6 grid.
fig, ax = plt.subplots(4, 6, figsize=(16, 8), sharey=False)

for i in range(len(COLUMNS)):
    sbplt = ax[i // 6, i % 6]

    sns.histplot(data=wellbeing_grocery_analysis.iloc[:, i], ax=sbplt)
    sbplt.set_xlabel('')
    sbplt.set_ylabel('')
    sbplt.set_title(wellbeing_grocery_analysis.columns[i], wrap=True)

# Leave head-room above the panels for the figure title.
fig.tight_layout()
fig.subplots_adjust(top=0.9)

# The figure shows histograms, so the title says so.
fig.suptitle('histogram for each column', fontsize=18)

In [None]:
# Correlation matrix (DataFrame.corr default method) of the well-being
# columns plus nutripoints.
fig = plt.figure(figsize=(10, 5))

heatmap_columns = COLUMNS_SCORES + ['nutripoints']
sns.heatmap(wellbeing_grocery_analysis[heatmap_columns].corr())

In [None]:
# Pairwise Spearman rank correlation between all analysed columns.
correlation = wellbeing_grocery_analysis.corr(method="spearman")
display(correlation.head(5))

In [None]:
# Correlation of every analysed column with 'energy_fibre', as a one-column frame.
display(correlation.loc[:, ["energy_fibre"]])

In [None]:
# TODO: put the five following barplots in a single subplots

# Spearman correlation of 'energy_fibre' with each well-being column.
plt.figure(figsize=(14, 3))

correlation.loc[COLUMNS_SCORES, "energy_fibre"].plot.bar(
    width=0.8, legend=None)

plt.ylabel("Spearman R")
plt.title("Correlation fibre")
plt.show()

In [None]:
# Spearman correlation of 'energy_alcohol' with each well-being column.
plt.figure(figsize=(14, 3))

correlation.loc[COLUMNS_SCORES, "energy_alcohol"].plot.bar(
    width=0.8, legend=None)

plt.ylabel("Spearman R")
plt.title("Correlation alcohol")
plt.show()

In [None]:
# Spearman correlation of 'h_nutrients_calories' with each well-being column.
plt.figure(figsize=(14, 3))

correlation.loc[COLUMNS_SCORES, "h_nutrients_calories"].plot.bar(
    width=0.8, legend=None)

plt.ylabel("Spearman R")
plt.title("Correlation entropy")
plt.show()

In [None]:
# Spearman correlation of 'energy_saturate' with each well-being column.
plt.figure(figsize=(14, 3))

correlation.loc[COLUMNS_SCORES, "energy_saturate"].plot.bar(
    width=0.8, legend=None)

plt.ylabel("Spearman R")
plt.title("Correlation saturate")
plt.show()

In [None]:
# Spearman correlation of 'energy_sugar' with each well-being column.
plt.figure(figsize=(14, 3))

correlation["energy_sugar"][COLUMNS_SCORES].plot.bar(
    x=None, y=None, width=0.8, legend=None)

plt.ylabel("Spearman R")
# The plotted column is energy_sugar, so the title names sugar.
plt.title("Correlation sugar")
plt.show()

In [None]:
# Spearman correlation of 'Index Score 2013' with each well-being column
# (this includes its correlation with itself).
plt.figure(figsize=(14, 3))

correlation.loc[COLUMNS_SCORES, "Index Score 2013"].plot.bar(
    width=0.8, legend=None)

plt.ylabel("Spearman R")
plt.title("Correlation wellbeing")
plt.show()

In [None]:
# Spearman correlation of 'energy_tot' with each well-being column.
plt.figure(figsize=(14, 3))

correlation.loc[COLUMNS_SCORES, "energy_tot"].plot.bar(
    width=0.8, legend=None)

plt.ylabel("Spearman R")
plt.title("Correlation energy tot")
plt.show()

In [None]:
# Right-hand side of the regression formula. Q("...") quotes column names
# that are not valid Python identifiers.
reg_features = (
    'Q("Life Expectancy")'
    ' + Q("Incapacity Benefit rate")'
    ' + Q("Unemployment rate")'
    ' + Q("Crime rate - Index")'
    ' + Q("Childhood Obesity")'
    ' + Q("Homes with access to open space & nature, and % greenspace")'
)

# OLS of 'h_nutrients_calories' on the selected well-being columns,
# fitted on the standardised frame.
mod = smf.ols(
    formula=f'h_nutrients_calories ~ {reg_features}',
    data=wellbeing_grocery_analysis,
)
res = mod.fit()
print(res.summary())

In [None]:
# Same regressors as in reg_features, with 'energy_fibre' as the response.
fibre_formula = f'energy_fibre ~ {reg_features}'
mod = smf.ols(formula=fibre_formula, data=wellbeing_grocery_analysis)
res = mod.fit()
print(res.summary())

In [None]:
# Column names of the well-being scores frame as a plain list.
wellbeing_scores_columns = list(wellbeing_scores.columns)
print(wellbeing_scores_columns)

In [None]:
# Rows with complete well-being scores, projected onto the first two
# principal components.
wellbeing_scores_analysis = wellbeing_scores[COLUMNS_SCORES].dropna().copy()

wellbeing_pca = PCA(n_components=2).fit(wellbeing_scores_analysis)
wellbeing_scores_reduced_pca = wellbeing_pca.transform(
    wellbeing_scores_analysis)

In [None]:
# Point colour: green when 'Index Score 2013' is non-negative, red otherwise.
labels = wellbeing_scores_analysis['Index Score 2013'].apply(
    lambda index_score: "g" if index_score >= 0 else "r")

# Plot the data reduced in 2d space with PCA
plt.figure(figsize=(14, 3))
first_component = wellbeing_scores_reduced_pca[:, 0]
second_component = wellbeing_scores_reduced_pca[:, 1]
plt.scatter(first_component, second_component, c=labels, alpha=0.6)

In [None]:
# Candidate feature sets for clustering; only columns_kmeans is used here.
columns_kmeans_health = ['h_nutrients_calories', 'energy_alcohol']
columns_kmeans = ['nutripoints']
# Independent copy of the (unscaled) clustering features.
wellbeing_grocery_kmeans = wellbeing_grocery.loc[:, columns_kmeans].copy()


def plot_sse(X, start=2, end=11):
    """Plot the k-means inertia of X for each k in range(start, end).

    A KMeans model with random_state=10 is fitted for every k and its
    ``inertia_`` is drawn against k on the current matplotlib axes.
    Nothing is returned.
    """
    # One record per cluster count: k and the inertia of the fitted model.
    sse = pd.DataFrame([
        {"k": k, "sse": KMeans(n_clusters=k, random_state=10).fit(X).inertia_}
        for k in range(start, end)
    ])
    # Elbow curve
    plt.plot(sse.k, sse.sse)
    plt.xlabel("K")
    plt.ylabel("Sum of Squared Errors")


# Elbow plot for the clustering features, with the default range of k.
plot_sse(X=wellbeing_grocery_kmeans)

In [None]:
#######    silhouette scores to choose k    #########
# Collect one (k, score) pair per candidate number of clusters.
silhouette_records = []
for k in range(2, 11):
    # Cluster the data and get the label of each row
    labels = KMeans(n_clusters=k, random_state=10).fit_predict(
        wellbeing_grocery_kmeans)
    # Silhouette score of this clustering
    score = silhouette_score(wellbeing_grocery_kmeans, labels)
    silhouette_records.append((k, score))

silhouettes = pd.DataFrame(silhouette_records, columns=["k", "score"])

# Silhouette score as a function of k
plt.plot(silhouettes["k"], silhouettes["score"])
plt.xlabel("K")
plt.ylabel("Silhouette score")

In [None]:
fig, axs = plt.subplots(1, 1, figsize=(10, 8), sharey=True, sharex=True)

# Cluster into 5 groups. Fit on the feature columns only: this cell adds a
# 'label' column to the same frame below, and fitting on the whole frame
# would feed the previous labels back into KMeans when the cell is re-run.
kmean = KMeans(n_clusters=5, random_state=42).fit(
    wellbeing_grocery_kmeans[columns_kmeans])

# Plot the data by using the labels as color
wellbeing_grocery_kmeans['label'] = kmean.labels_
sns.swarmplot(data=wellbeing_grocery_kmeans, x='nutripoints',
              ax=axs, hue='label')

In [None]:
# Store the cluster label of each row on the merged frame.
wellbeing_grocery["nutri_label"] = kmean.labels_
# Smallest nutripoints value within each cluster.
wellbeing_grocery.groupby(by="nutri_label")["nutripoints"].agg("min")

## Predictive Models

In [None]:
# Features: the well-being columns (unscaled). Target: nutripoints.
X = wellbeing_grocery.loc[:, COLUMNS_SCORES]
y = wellbeing_grocery.loc[:, "nutripoints"]

In [None]:
## Linear regression ##

# Ordinary least-squares model of nutripoints on the well-being columns.
lin_reg = LinearRegression()

# Fit on the full data set (no train/test split at this point).
lin_reg.fit(X=X, y=y)

In [None]:
# Print the fitted model term by term, followed by the intercept.
for coefficient, feature_name in zip(lin_reg.coef_, COLUMNS_SCORES):
    print(f"{coefficient} * {feature_name} + ")
print(lin_reg.intercept_)

In [None]:
# Right-hand side of the formula: one Q("...") term per entry of
# COLUMNS_SCORES, in the same order. The list is maintained by hand here, so
# it has to be kept in sync with COLUMNS_SCORES.
# The backslash continuations sit inside the string literal, so the leading
# indentation of each continued line is part of the formula text.
# NOTE(review): 'Index Score 2013' may be an aggregate of the other indicators;
# if so the regressors are collinear — confirm before reading the coefficients.
reg2_features = 'Q("Life Expectancy")\
                + Q("Childhood Obesity")\
                + Q("Incapacity Benefit rate")\
                + Q("Unemployment rate")\
                + Q("Crime rate - Index")\
                + Q("Deliberate Fires")\
                + Q("Average Capped GCSE and Equivalent Point Score Per Pupil")\
                + Q("Unauthorised Absence in All Schools (%)")\
                + Q("Dependent children in out-of-work families")\
                + Q("Public Transport Accessibility")\
                + Q("Homes with access to open space & nature, and % greenspace")\
                + Q("Subjective well-being average score")\
                + Q("Index Score 2013")'

# OLS of nutripoints on all well-being columns, fitted on the standardised
# frame (the LinearRegression model above is fitted on the unscaled frame).
mod = smf.ols(formula='nutripoints ~ ' + reg2_features,
              data=wellbeing_grocery_analysis)
res = mod.fit()

print(res.summary())

### Train Gradient Boosting Regressor

In [None]:
## Gradient Boosting Regression ##
# Seed the estimator so that feature elimination and cross-validated
# predictions are reproducible across runs, as for the KMeans models.
gradboost = GradientBoostingRegressor(random_state=42)

In [None]:
# Select appropriate features using recursive feature elimination:
# one feature is removed per step until five remain.
selector = RFE(estimator=gradboost, n_features_to_select=5, step=1).fit(X, y)

# Feature names, followed by the ranking assigned to each of them.
print(X.columns)
print(selector.ranking_)

In [None]:
# Out-of-fold predictions from 5-fold cross-validation.
predicted_y = cross_val_predict(estimator=gradboost, X=X, y=y, cv=5)

In [None]:
# Plot the results: observed nutripoints against cross-validated predictions.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(y, predicted_y, edgecolors=(0, 0, 0))
ax.set(xlabel='Original', ylabel='Predicted')
plt.show()

In [None]:
# R^2 and mean squared error of the cross-validated predictions.
r2 = r2_score(y_true=y, y_pred=predicted_y)
mse = mean_squared_error(y_true=y, y_pred=predicted_y)

print(r2, mse)

## Conclusions

According to our results, ...