In [None]:
# Imports
from utils import *
from sklearn.cluster import KMeans, DBSCAN
from statsmodels.stats import diagnostic
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, auc, roc_curve, r2_score
from sklearn.feature_selection import RFE
import math
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline
sns.set_theme()

In [None]:
# Data Imports
# Read data/year_osward_grocery.csv into a DataFrame, then preview its first
# rows and print its (rows, columns) shape as a quick sanity check.

year_grocery = pd.read_csv("data/year_osward_grocery.csv")
display(year_grocery.head())
print(year_grocery.shape)

In [None]:
# Keep only the 80% of rows with the largest 'representativeness_norm'.
# NOTE: year_grocery is rebound from itself, so re-running this cell trims
# the frame again.
n_rows_to_keep = int(0.8 * len(year_grocery))
year_grocery = year_grocery.nlargest(n_rows_to_keep, 'representativeness_norm')

In [None]:
# Score every row with calculate_nutripoints and store the result in a new
# "nutripoints" column. calculate_nutripoints is not defined in this notebook;
# presumably it is provided by `from utils import *` — TODO confirm.
#nutripoints=['nutri_energy','nutri_sugar', 'nutri_saturate', 'nutri_sodium', 'nutri_fibre', 'nutri_protein', 'nutri_fruit_veg']
year_grocery["nutripoints"] = year_grocery.apply(calculate_nutripoints, axis=1)
# Summary statistics of the new column (rendered as the cell output).
year_grocery["nutripoints"].describe()

In [None]:
# Preview the first rows of year_grocery.
display(year_grocery.head())

In [None]:
# Independent copy holding the area identifier plus the NUTRIENTS columns.
# NUTRIENTS is not defined in this notebook; presumably it is a list of column
# names provided by `from utils import *` — TODO confirm.
weight = year_grocery[["area_id"]+NUTRIENTS].copy()

In [None]:
# Row-wise sum of the NUTRIENTS columns, stored as "weight_total".
weight["weight_total"]=weight[NUTRIENTS].sum(axis=1)
# Shows the whole frame, not just a preview.
display(weight)

In [None]:
# Columns retained for the rest of the analysis; everything else is dropped.
list_column = [
    "area_id",
    "energy_tot",
    "energy_fat",
    "energy_saturate",
    "energy_sugar",
    "energy_protein",
    "energy_carb",
    "energy_fibre",
    "energy_alcohol",
    "h_nutrients_calories",
    "nutripoints",
]
# Boolean mask over the existing columns: names missing from the frame are
# simply ignored instead of raising a KeyError.
keep_mask = year_grocery.columns.isin(list_column)
year_grocery = year_grocery.loc[:, keep_mask]
display(year_grocery.head())

In [None]:
# Number of distinct area_id values (rendered as the cell output).
len(set(year_grocery["area_id"].values))

In [None]:
# Analysis frame: every retained column except the area identifier.
# drop() returns a new frame, so year_grocery itself is left untouched.
grocery_analysis = year_grocery.drop(columns="area_id")
display(grocery_analysis.head())
# Number of columns left for the analysis.
print(grocery_analysis.shape[1])

In [None]:
# One histogram per analysis column, laid out row by row on a 2 x 5 grid.
fig, ax = plt.subplots(2, 5, figsize=(16, 8), sharey=False)

for position, column_name in enumerate(grocery_analysis.columns):
    grid_row, grid_col = divmod(position, 5)
    sbplt = ax[grid_row, grid_col]

    sns.histplot(data=grocery_analysis.iloc[:, position], ax=sbplt)
    # The panel title already names the column, so axis labels are blanked.
    sbplt.set(xlabel='', ylabel='')
    sbplt.set_title(column_name, wrap=True)

fig.tight_layout()
# Leave head-room for the figure-level title.
fig.subplots_adjust(top=0.9)

fig.suptitle('histplot for each column', fontsize=18)

In [None]:
# Distribution of the nutripoints score over all rows of grocery_analysis.
plt.figure(figsize=(15,8))

sns.histplot(data=grocery_analysis["nutripoints"])

plt.xlabel('nutripoints')
# histplot draws raw counts by default (stat="count"), so the y axis is a
# count, not a density.
plt.ylabel('count')
plt.title('nutripoints distribution')
plt.show();

In [None]:
# Heatmap of the pairwise column correlations (DataFrame.corr() with its
# default method).
default_corr_matrix = grocery_analysis.corr()
fig = plt.figure(figsize=(10, 6))
sns.heatmap(default_corr_matrix)

In [None]:
# Pairwise Spearman rank correlation between all analysis columns.
correlation = grocery_analysis.corr(method="spearman")
display(correlation)

In [None]:
# Bar chart of every column's Spearman correlation with nutripoints.
nutripoints_corr = correlation["nutripoints"]

plt.figure(figsize=(14, 3))
nutripoints_corr.plot.bar(x=None, y=None, width=0.8, legend=None)
plt.ylabel("Spearman R")
plt.title("Correlation nutripoints")
plt.show()

In [None]:
# Feature columns used for clustering (only the nutripoints score), and an
# independent copy of those columns so later additions do not touch
# grocery_analysis.
columns_kmeans = ['nutripoints']
grocery_kmeans = grocery_analysis[columns_kmeans].copy()


def plot_sse(X, start=2, end=11):
    """Plot the K-Means inertia for every cluster count in ``range(start, end)``.

    Args:
        X: data handed to ``KMeans.fit`` unchanged.
        start: first number of clusters to try (inclusive).
        end: upper bound on the number of clusters (exclusive).

    Returns:
        None. The curve is drawn with ``plt.plot`` on the current axes.
    """
    # One record per candidate k; inertia_ is the fitted model's sum of
    # squared distances of samples to their closest centroid.
    records = [
        {
            "k": n_clusters,
            "sse": KMeans(n_clusters=n_clusters, random_state=10).fit(X).inertia_,
        }
        for n_clusters in range(start, end)
    ]
    elbow_curve = pd.DataFrame(records)

    plt.plot(elbow_curve.k, elbow_curve.sse)
    plt.xlabel("K")
    plt.ylabel("Sum of Squared Errors")


# Draw the inertia curve for k = 2..10 (plot_sse defaults) on the clustering input.
plot_sse(grocery_kmeans)

In [None]:
fig, axs = plt.subplots(1, 1, figsize=(10, 8), sharey=True, sharex=True)

# Cluster the rows into 5 groups using the feature columns only.
# Selecting columns_kmeans explicitly keeps this cell idempotent: the 'label'
# column added below would otherwise be fed back into KMeans as a second
# feature when the cell is run again.
kmean = KMeans(n_clusters=5, random_state=42).fit(grocery_kmeans[columns_kmeans])

# Store each row's cluster id and use it to colour the swarm plot.
grocery_kmeans['label'] = kmean.labels_
sns.swarmplot(data=grocery_kmeans, x='nutripoints',ax=axs, hue=grocery_kmeans.label)

In [None]:
# Attach the K-Means cluster id of each row to year_grocery. This relies on
# year_grocery having the same row order as the frame the model was fitted on.
year_grocery["nutri_class"] = kmean.labels_
# Smallest nutripoints value inside each cluster (rendered as the cell output).
year_grocery.groupby("nutri_class")["nutripoints"].min()

In [None]:
# Show year_grocery (the whole frame, not just a preview).
display(year_grocery)

In [None]:
def addcolor(row_list):
    """Return the hex colour associated with a cluster label.

    Args:
        row_list: cluster label; compared with ``==`` against the known
            labels 3, 1, 4, 0 and 2, in that order.

    Returns:
        The hex colour string of the first matching label, or ``""`` when
        the value matches none of them.
    """
    # (label, colour) pairs; the label ids are hard-coded.
    palette = (
        (3, "#038141"),
        (1, "#85BB2F"),
        (4, "#FECC02"),
        (0, "#EE8300"),
        (2, "#E63F11"),
    )
    for label, hex_colour in palette:
        if row_list == label:
            return hex_colour
    return ""

      
# Translate every row's cluster id into its display colour.
year_grocery["color"] = [
    addcolor(cluster_id) for cluster_id in year_grocery["nutri_class"]
]
display(year_grocery)

In [None]:
# energy_tot (y) against nutripoints (x), one point per row, coloured by the
# row's cluster colour.
x_values = year_grocery["nutripoints"]
y_values = year_grocery["energy_tot"]
point_colours = year_grocery["color"]

plt.figure(figsize=(15, 8))
plt.scatter(x_values, y_values, c=point_colours)

plt.xlabel('nutripoint')
plt.ylabel('energy_tot')
plt.title('nutripoints in function of energy_tot')
plt.show();

In [None]:
# Scatter h_nutrients_calories (y) against every analysis column (x) on a
# 2 x 5 grid, colouring each point with its row's cluster colour.
fig, ax = plt.subplots(2, 5, figsize=(16, 8), sharey=False)

for i in range(len(grocery_analysis.columns)):
    # Fill the grid row by row: 5 panels per row.
    sbplt = ax[i // 5, i % 5]
    sbplt.scatter(grocery_analysis.iloc[:,i], grocery_analysis["h_nutrients_calories"],c=year_grocery["color"])
    # The panel title already names the x column, so axis labels are blanked.
    sbplt.set_xlabel('')
    sbplt.set_ylabel('')
    sbplt.set_title(grocery_analysis.columns[i], wrap=True)

fig.tight_layout()
# Leave head-room for the figure-level title.
fig.subplots_adjust(top=0.9)

# These panels are scatter plots, not histograms, so the title says so.
fig.suptitle('h_nutrients_calories against each column', fontsize=18)

In [None]:
def nutri_labels_to_letter(row_list):
    """Return the letter grade associated with a cluster label.

    Args:
        row_list: cluster label; compared with ``==`` against the known
            labels 3, 1, 4, 0 and 2, in that order.

    Returns:
        The letter ("A" to "E") of the first matching label, or ``""`` when
        the value matches none of them.
    """
    # (label, letter) pairs; the label ids are hard-coded.
    grades = (
        (3, "A"),
        (1, "B"),
        (4, "C"),
        (0, "D"),
        (2, "E"),
    )
    for label, letter in grades:
        if row_list == label:
            return letter
    return ""

      
# Translate every row's cluster id into its letter grade.
year_grocery["nutrilabel"] = [
    nutri_labels_to_letter(cluster_id) for cluster_id in year_grocery["nutri_class"]
]
display(year_grocery)

In [None]:
# Number of rows per nutrilabel, drawn as one coloured bar per letter.
# A bar chart over an explicit A..E order is used instead of plt.hist, which
# orders string categories by first appearance and cannot colour each bar.
bars = ('A', 'B', 'C', 'D', 'E')
y_pos = np.arange(len(bars))

# One colour per letter, in the same A..E order as `bars`.
clrs = [ "#038141","#85BB2F", "#FECC02", "#EE8300", "#E63F11"]

# Rows per letter; letters that never occur get a zero-height bar. Rows whose
# nutrilabel is not one of A..E (e.g. the "" fallback) are not drawn.
height = year_grocery["nutrilabel"].value_counts().reindex(list(bars), fill_value=0)

plt.figure(figsize=(15,8))

plt.bar(y_pos, height, color=clrs)
plt.xticks(y_pos, bars)

plt.xlabel('nutrilabel')
# Bars show raw row counts, not a density.
plt.ylabel('count')
plt.title('nutrilabel distribution')

plt.show();

In [None]:
# Show the weight frame (the whole frame, not just a preview).
display(weight)

In [None]:
# Typical Londoner product: column-wise mean of the numeric columns of `weight`.
# numeric_only=True explicitly skips any non-numeric column (such as a string
# area_id identifier). pandas >= 2.0 no longer drops such columns silently and
# raises TypeError instead.
weight_mean=weight.mean(axis=0, numeric_only=True)
display(weight_mean)

In [None]:
# Pie chart of the first five entries of weight_mean.
# The labels are assumed to match the order of those entries — TODO confirm
# against the NUTRIENTS column order.
nutrients_labels = ["Fibre", "Protein", "Carb", "Fat", "Salt"]
first_five_means = weight_mean[0:5]

fig1, ax1 = plt.subplots(figsize=(16, 8))
ax1.pie(
    first_five_means,
    labels=nutrients_labels,
    autopct='%1.1f%%',
    startangle=90,
)
# Equal aspect ratio ensures that the pie is drawn as a circle.
ax1.axis('equal')

plt.show()

In [None]:
# Merge year_grocery and weight
weight_label=pd.merge(left=weight, right=year_grocery[["area_id","nutrilabel"]], left_on="area_id", right_on = "area_id")
display(weight_label[(weight_label["nutrilabel"]=="A") | (weight_label["nutrilabel"]=="B")].head(50))

In [None]:
# Mean of every numeric column per nutrilabel (the result is indexed by label).
# numeric_only=True explicitly skips any non-numeric column (such as a string
# area_id identifier). pandas >= 2.0 no longer drops such columns silently in
# GroupBy.mean and raises TypeError instead.
weight_label=weight_label.groupby("nutrilabel").mean(numeric_only=True)
display(weight_label)

In [None]:
# One pie per nutrilabel, built from the first five columns of that label's
# row in weight_label.
fig, ax = plt.subplots(1, 5, figsize=(16, 16))

for position, label_name in enumerate(weight_label.index):
    sbplt = ax[position]
    label_shares = weight_label.iloc[position, 0:5]
    sbplt.pie(label_shares, labels=nutrients_labels, autopct='%1.1f%%', startangle=90)
    # Equal aspect ratio ensures that each pie is drawn as a circle.
    sbplt.axis('equal')
    sbplt.set_title(label_name)

fig.tight_layout()
# Leave head-room for the figure-level title.
fig.subplots_adjust(top=0.9)

fig.suptitle('pie for each nutrilabel', fontsize=18)