In [None]:
# Imports
from utils import calculate_nutripoints
from sklearn.cluster import KMeans, DBSCAN
from statsmodels.stats import diagnostic
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, auc, roc_curve, r2_score
from sklearn.feature_selection import RFE
import math
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to the plots drawn below.
sns.set_theme()

In [None]:
# Data import: read the yearly grocery CSV, then preview the first rows
# and print the (rows, columns) shape.

year_grocery = pd.read_csv("data/year_osward_grocery.csv")
display(year_grocery.head(n=5))
print(year_grocery.shape)

In [None]:
# Keep the 80% of rows with the largest 'representativeness_norm'
# (the result comes back ordered by that column, largest first).
# NOTE: year_grocery is rebound here, so re-running this cell trims the data again.
year_grocery = year_grocery.nlargest(
    n=int(0.8 * len(year_grocery)),
    columns='representativeness_norm',
)

In [None]:
# Draft list of component columns (currently unused):
# ['nutri_energy', 'nutri_sugar', 'nutri_saturate', 'nutri_sodium', 'nutri_fibre', 'nutri_protein', 'nutri_fruit_veg']

# Apply the project helper row by row and store the result as a new column,
# then summarise its distribution.
year_grocery["nutripoints"] = year_grocery.apply(
    calculate_nutripoints, axis="columns")
year_grocery["nutripoints"].describe()

In [None]:
# Preview the frame now that the 'nutripoints' column has been added.
display(year_grocery.head(n=5))

In [None]:
# Columns retained for the rest of the analysis.
list_column = [
    "area_id",
    "energy_tot", "energy_fat", "energy_saturate", "energy_sugar",
    "energy_protein", "energy_carb", "energy_fibre", "energy_alcohol",
    "h_nutrients_calories",
    "nutripoints",
]
# Boolean column mask: names in list_column that are absent from the frame
# are silently ignored rather than raising.
year_grocery = year_grocery.loc[:, year_grocery.columns.isin(list_column)]
display(year_grocery.head())

In [None]:
# Number of distinct area ids in the data.
# `nunique()` counts inside pandas and ignores missing ids; building a Python
# set from `.values` can count NaN entries more than once, because NaN never
# compares equal to itself.
year_grocery["area_id"].nunique()

In [None]:
# One histogram per column, laid out on a fixed 4 x 6 grid (at most 24 panels).
fig, ax = plt.subplots(4, 6, figsize=(16, 8), sharey=False)

for i in range(len(column_boxplot)):
    # Fill the grid row by row: panel i sits at (i // 6, i % 6).
    grid_row, grid_col = divmod(i, 6)
    sbplt = ax[grid_row, grid_col]

    sns.histplot(data=wellbeing_grocery_analysis.iloc[:, i], ax=sbplt)
    # The column name goes in the panel title, so the axis labels are blanked.
    sbplt.set_xlabel('')
    sbplt.set_ylabel('')
    sbplt.set_title(wellbeing_grocery_analysis.columns[i], wrap=True)

fig.tight_layout()
# Leave room at the top for the figure-level title.
fig.subplots_adjust(top=0.9)

# The panels are histograms (sns.histplot), so the title says so.
fig.suptitle('Histogram for each column', fontsize=18)
plt.show()

In [None]:
# TODO: plot a correlation heatmap of the features (disabled draft below)

#fig = plt.figure(figsize=(10, 6))

# sns.heatmap(X[FEATURES['EEPD']].corr())

In [None]:
# Spearman rank correlation between the numeric columns.
# Non-numeric columns are dropped explicitly: recent pandas versions no longer
# skip them silently in `corr()` and raise instead.
# NOTE(review): 'area_id' is presumably a string identifier - confirm.
correlation = (
    year_grocery
    .select_dtypes(include="number")
    .corr(method="spearman")
)
display(correlation)

In [None]:
# Bar chart of each column's Spearman correlation with 'nutripoints'.
plt.figure(figsize=(14, 3))
correlation.loc[:, "nutripoints"].plot.bar(width=0.8, legend=None)
plt.title("Correlation nutripoints")
plt.ylabel("Spearman R")
plt.show()

In [None]:
# Standardise every column of the analysis frame in place
# (StandardScaler defaults: centre to zero mean, scale to unit variance).
scaler = StandardScaler()
wellbeing_grocery_analysis[wellbeing_grocery_analysis.columns] = scaler.fit_transform(
    wellbeing_grocery_analysis[wellbeing_grocery_analysis.columns]
)
# Summary statistics of the rescaled columns.
wellbeing_grocery_analysis.describe()

In [None]:
# Right-hand side of the OLS formulas; Q("...") quotes column names that are
# not valid Python identifiers.
reg_features = (
    'Q("Life Expectancy") + Q("Incapacity Benefit rate") + Q("Unemployment rate") + Q("Crime rate - Index") '
    '+ Q("Childhood Obesity") + Q("Homes with access to open space & nature, and % greenspace")'
)

# Ordinary least squares: h_nutrients_calories regressed on the features above.
mod = smf.ols(
    formula=f'h_nutrients_calories ~ {reg_features}',
    data=wellbeing_grocery_analysis,
)
res = mod.fit()
print(res.summary())

In [None]:
# Same predictors as the previous model (reg_features), with energy_fibre as response.
mod = smf.ols(
    formula=f'energy_fibre ~ {reg_features}',
    data=wellbeing_grocery_analysis,
)
res = mod.fit()
print(res.summary())

In [None]:
# Column names of the wellbeing frame as a plain Python list.
wellbeing_scores_columns = wellbeing_scores.columns.tolist()
print(wellbeing_scores_columns)

In [None]:
# Wellbeing columns used as features throughout the rest of the notebook.
numerical_wellbeing_scores_columns = [
    'Life Expectancy',
    'Childhood Obesity',
    'Incapacity Benefit rate',
    'Unemployment rate',
    'Crime rate - Index',
    'Deliberate Fires',
    'Average Capped GCSE and Equivalent Point Score Per Pupil',
    'Unauthorised Absence in All Schools (%)',
    'Dependent children in out-of-work families',
    'Public Transport Accessibility',
    'Homes with access to open space & nature, and % greenspace',
    'Subjective well-being average score',
    'Index Score 2013',
]

# Keep complete rows only: PCA cannot be fitted on missing values.
wellbeing_scores_analysis = (
    wellbeing_scores[numerical_wellbeing_scores_columns]
    .dropna()
    .copy()
)

# Fit a 2-component PCA and project the same rows onto it.
wellbeing_scores_reduced_pca = (
    PCA(n_components=2)
    .fit(wellbeing_scores_analysis)
    .transform(wellbeing_scores_analysis)
)

In [None]:
# Colour per row: green when 'Index Score 2013' is non-negative, red otherwise.
# Computed on the whole column at once instead of a Python call per row.
labels = np.where(wellbeing_scores_analysis['Index Score 2013'] >= 0, "g", "r")

# Plot the data reduced in 2d space with PCA
plt.figure(figsize=(14, 3))
plt.scatter(wellbeing_scores_reduced_pca[:, 0],
            wellbeing_scores_reduced_pca[:, 1], c=labels, alpha=0.6)
# Label the figure so it can be read on its own.
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("Wellbeing scores in PCA space (green: Index Score 2013 >= 0)")
plt.show()

In [None]:
# Candidate feature sets for clustering; only columns_kmeans is used to build
# the clustering input below.
columns_kmeans_health = ['h_nutrients_calories', 'energy_alcohol']
columns_kmeans = ['nutripoints']
# Work on an independent copy so later edits do not touch wellbeing_grocery.
wellbeing_grocery_kmeans = wellbeing_grocery.loc[:, columns_kmeans].copy()


def plot_sse(X, start=2, end=11):
    """Plot the K-Means sum of squared errors for every k in ``range(start, end)``.

    Parameters
    ----------
    X : data accepted by ``KMeans.fit``
        Observations to cluster.
    start : int, default 2
        Smallest number of clusters tried (inclusive).
    end : int, default 11
        Upper bound on the number of clusters (exclusive).

    Returns
    -------
    None
        The curve is drawn with ``plt.plot`` on the current matplotlib axes.
    """
    # One fit per candidate k; ``inertia_`` is the fitted model's SSE.
    inertia_by_k = [
        {"k": n_clusters,
         "sse": KMeans(n_clusters=n_clusters, random_state=10).fit(X).inertia_}
        for n_clusters in range(start, end)
    ]
    curve = pd.DataFrame(inertia_by_k)
    # Elbow plot: SSE against the number of clusters.
    plt.plot(curve.k, curve.sse)
    plt.xlabel("K")
    plt.ylabel("Sum of Squared Errors")

# Draw the SSE curve for the clustering input (defaults: k from 2 to 10).
plot_sse(wellbeing_grocery_kmeans)

In [None]:
#######    silhouette scores to choose k    #########
silhouettes = []
for k in range(2, 11):  # candidate numbers of clusters: 2..10
    # Cluster the data and get one label per observation
    labels = KMeans(n_clusters=k, random_state=10).fit_predict(
        wellbeing_grocery_kmeans)
    # Silhouette score of this clustering
    score = silhouette_score(wellbeing_grocery_kmeans, labels)
    silhouettes.append(dict(k=k, score=score))

# One row per k, with columns "k" and "score"
silhouettes = pd.DataFrame(silhouettes)

# Silhouette score against the number of clusters
plt.plot(silhouettes.k, silhouettes.score)
plt.xlabel("K")
plt.ylabel("Silhouette score")

In [None]:
fig, axs = plt.subplots(1, 1, figsize=(10, 8), sharey=True, sharex=True)

# Cluster on the feature columns only. The 'label' column added below is stored
# in the same frame, so fitting on the whole frame would feed the previous
# labels back into KMeans whenever this cell is run again.
kmean = KMeans(n_clusters=5, random_state=42).fit(
    wellbeing_grocery_kmeans[columns_kmeans])

# Attach the cluster label to every observation and use it as the colour.
wellbeing_grocery_kmeans['label'] = kmean.labels_
sns.swarmplot(data=wellbeing_grocery_kmeans, x='nutripoints',
              ax=axs, hue='label')
# Centroids are not drawn: the clustering uses a single feature (columns_kmeans).

In [None]:
# Store the cluster labels on the full frame, then show the smallest
# nutripoints value inside each cluster.
wellbeing_grocery["nutri_label"] = kmean.labels_
wellbeing_grocery.groupby(by="nutri_label")["nutripoints"].min()

In [None]:
## linear regression ##
# X and y are otherwise only assigned in a later cell, so running the notebook
# top to bottom on a fresh kernel would fail here with a NameError. They are
# built here with the same expressions that later cell uses.
# NOTE(review): assumes 'nutripoints' is the intended target for this model - confirm.
X = wellbeing_grocery_analysis[numerical_wellbeing_scores_columns]
y = wellbeing_grocery_analysis["nutripoints"]

lin_reg = LinearRegression()  # create the model
lin_reg.fit(X, y)  # train it

In [None]:
# Print the fitted model as "coef * feature + " lines, followed by the intercept.
for position, feature_name in enumerate(numerical_wellbeing_scores_columns):
    term = "{0} * {1} + ".format(lin_reg.coef_[position], feature_name)
    print(term)
print(lin_reg.intercept_)

In [None]:
# Regress nutripoints on every wellbeing feature.
# The right-hand side is built from the shared column list rather than a
# hand-written, backslash-continued string literal: that literal embedded the
# indentation of its continuation lines in the formula and repeated the list
# of predictors. Q("...") quotes names that are not valid Python identifiers.
reg2_features = ' + '.join(
    'Q("{}")'.format(column) for column in numerical_wellbeing_scores_columns)
mod = smf.ols(formula='nutripoints ~ ' + reg2_features,
              data=wellbeing_grocery_analysis)
res = mod.fit()
print(res.summary())

In [None]:
# Same model as the full nutripoints regression, minus 'Index Score 2013'.
# NOTE(review): presumably excluded because it is a composite of the other
# indicators - confirm.
# The right-hand side is built from the shared column list rather than a
# hand-written, backslash-continued string literal, which embedded the
# indentation of its continuation lines in the formula.
reg2_features = ' + '.join(
    'Q("{}")'.format(column)
    for column in numerical_wellbeing_scores_columns
    if column != 'Index Score 2013')
mod = smf.ols(formula='nutripoints ~ ' + reg2_features,
              data=wellbeing_grocery_analysis)
res = mod.fit()
print(res.summary())

In [None]:
# Feature matrix (wellbeing indicators) and target (nutripoints) for the models below.
X = wellbeing_grocery_analysis.loc[:, numerical_wellbeing_scores_columns]
y = wellbeing_grocery_analysis.loc[:, "nutripoints"]

In [None]:
# Gradient boosting regressor (created here, fitted by the cells that use it).
# The seed is fixed so feature selection and cross-validation give the same
# result on every run; without it the estimator draws from the global random state.
gradboost = GradientBoostingRegressor(random_state=42)

In [None]:
# Recursive feature elimination: drop one feature per step until 5 remain.
selector = RFE(estimator=gradboost, n_features_to_select=5, step=1).fit(X, y)
# ranking_ is aligned with X.columns; rank 1 marks the selected features.
print(X.columns)
print(selector.ranking_)

In [None]:
# Out-of-fold predictions from 5-fold cross-validation.
predicted_y = cross_val_predict(estimator=gradboost, X=X, y=y, cv=5)

In [None]:
# Observed target against the cross-validated prediction
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(y, predicted_y, edgecolors=(0, 0, 0))
ax.set(xlabel='Original', ylabel='Predicted')
plt.show()

In [None]:
# Cross-validated fit quality: coefficient of determination and mean squared error.
r2 = r2_score(y_true=y, y_pred=predicted_y)
mse = mean_squared_error(y_true=y, y_pred=predicted_y)
print(r2, mse)