# 1 Dependencies import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scripts.quickml.reader import read_csv
from scripts.quickml.plot import feature_distribution_univar, feature_filling, correlation_heatmap, feature_trend_bivar, feature_distribution_bivar, dataframe_distribution_overview, feature_distribution_bivar_box
from scripts.quickml.compare import index_lth
from scripts.quickml.conversion import feature_to_list
from scripts.utils import fill_infos

# Raise pandas' display limits (rows, columns, console width) for this session.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Seed NumPy's legacy global generator so random draws are reproducible.
np.random.seed(294697)

***
# 2 Data import

In [None]:
# Load the semicolon-separated sample; the first CSV column becomes the index.
data = pd.read_csv(
    "datas/sample.csv",
    sep=";",
    index_col=[0],
)

In [None]:
# (rows, columns) of the freshly loaded frame.
data.shape

***
# 3 Overview

In [None]:
# Project helper from scripts.quickml.plot; presumably draws one distribution
# panel per column (hence the very wide figsize) — confirm against the helper.
dataframe_distribution_overview(data, figsize=(60, 6))

In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Summary statistics; with default arguments only numeric columns are described.
data.describe()

***
# 4 Categorization

In [None]:
# Keep only the object-typed (string-like) columns: these are the candidates
# for categorical encoding.
categs = data.select_dtypes(include=["object"])
categs.info()

In [None]:
# Columns excluded from the categorical encoding.
excluded_categs = [
    "code",
    "url",
    "quantity",
    "nutriscore_grade",
    "ecoscore_grade_fr",
    "image_url",
    "image_ingredients_url",
    "image_nutrition_url",
]

# Re-derive from `data` with a non-mutating drop: an in-place drop on the
# derived frame raises KeyError when the cell is re-run and can trigger
# SettingWithCopy warnings.
categs = data.select_dtypes(include="object").drop(columns=excluded_categs)
categs.info()

In [None]:
def categorize(feature: pd.Series) -> pd.Series:
    """Encode a column as integer category codes.

    The column is cast to pandas' ``category`` dtype and the per-value codes
    are returned, keeping the original index. Missing values get code ``-1``.
    """
    as_category = feature.astype("category")
    return as_category.cat.codes

In [None]:
# Encode every remaining object column, one column at a time.
categs_to_nums = categs.apply(categorize)
categs_to_nums

In [None]:
# Snapshot of the encoded column names, taken before they are renamed.
cols = categs_to_nums.columns
cols

In [None]:
# Suffix every encoded column in one non-mutating rename instead of N in-place
# renames. Names not present in the frame are ignored by `rename`, so
# re-running the cell is still a no-op.
categs_to_nums = categs_to_nums.rename(columns={col: col + "_CATEG" for col in cols})

categs_to_nums

In [None]:
# Index-aligned join of the encoded "*_CATEG" columns onto the main frame.
# NOTE(review): `data` is rebound, so re-running this cell joins the same
# columns a second time — expected to fail or duplicate; confirm.
data = data.join([categs_to_nums])

In [None]:
# Check that the encoded "*_CATEG" columns are now part of the frame.
data.info()

***
# 5 Global correlations

***
## 5.1 Original dataset

In [None]:
# Project helper from scripts.quickml.plot, applied to the full frame
# (original columns plus the encoded "*_CATEG" ones).
correlation_heatmap(data)

***
## 5.2 Filtered dataset

In [None]:
# Label slice (inclusive on both ends) covering the per-100g columns; it
# depends on the current column order of `data`.
nutrient_columns = data.loc[:, "energy-kcal_100g":"iron_100g"].columns

# Index.difference returns the remaining labels sorted, so the kept columns
# end up in alphabetical order.
kept_columns = data.columns.difference(nutrient_columns)
data = data.loc[:, kept_columns]

In [None]:
# Columns left out of the filtered correlation view.
ignored_columns = [
    "created_t",
    "last_modified_t",
    "categories_en_CATEG",
    "creator_CATEG",
    "product_name_CATEG",
    "serving_quantity",
    "countries_en_CATEG",
    "labels_en_CATEG",
    "brand_owner_CATEG",
]
data = data.loc[:, data.columns.difference(ignored_columns)]

In [None]:
# Same helper as for the original dataset, now on the reduced column set.
correlation_heatmap(data)

***
# 6 Nutriscore / Ecoscore trends

In [None]:
# Project helper: each column name is followed by its display label.
# NOTE(review): the meaning of the trailing 1000 is defined by the helper
# (not visible here) — consider passing it by keyword.
feature_trend_bivar(data, "nutriscore_score", "Nutriscore", "ecoscore_score_fr", "Ecoscore", 1000)

Nutriscore: the higher the value, the worse the product is.  
Ecoscore: the higher the value, the worse the environmental impact is.

## Conclusion:
The worse a product is for health (high nutriscore), the better it is for the environment (low ecoscore).

***
# 7 Nutriscore / Ecoscore distribution

In [None]:
# Project helper: each column name is followed by its display label.
# NOTE(review): `divider` semantics are defined by the helper — confirm there.
feature_distribution_bivar(data, "nutriscore_score", "Nutriscore", "ecoscore_score_fr", "Ecoscore", divider=10)

***
# 8 Nutriscore / countries distribution

In [None]:
# Peek at rows 10-19 of the raw country strings (before cleaning).
data["countries_en"].iloc[10:20]

In [None]:
def keep_last_country(row):
    """Return a copy of ``row`` whose ``countries_en`` holds only the last country.

    ``countries_en`` is expected to be a comma-separated string; the text after
    the last comma is kept, with surrounding whitespace removed.

    The input row is left untouched: ``DataFrame.apply`` does not support
    functions that mutate the object they receive.

    Raises:
        AttributeError: if ``countries_en`` is not a string (e.g. NaN), so
            callers must filter missing values first.
    """
    updated = row.copy()
    last_country = row["countries_en"].split(",")[-1]
    updated["countries_en"] = last_country.strip()
    return updated

# Rows without a country are dropped first (the splitter needs a string),
# then each remaining row is reduced to its last listed country.
has_country = data["countries_en"].notna()
data = data.loc[has_country].apply(keep_last_country, axis=1)

In [None]:
# Rows 10-19 again, to check the result of the country clean-up.
data["countries_en"].iloc[10:20]

Let's see now how the nutriscore is distributed by country.

In [None]:
# Row count per (country, nutriscore grade) pair; the count column is named
# "len" after the aggregating function.
nutriscore_groups = data.groupby(["countries_en", "nutriscore_grade"])["nutriscore_grade"]
data_by_countries_nutriscore = nutriscore_groups.agg([len]).reset_index()
data_by_countries_nutriscore.head()

In [None]:
# NOTE(review): the input is the aggregated frame (one row per country/grade
# pair), so if the helper counts rows this shows how many countries have each
# grade, not how many products — confirm this is intended.
feature_distribution_univar(data_by_countries_nutriscore, "nutriscore_grade", "Nutriscore")

In [None]:
# Restrict the per-country counts to a fixed selection of countries.
nutriscore_countries = [
    "France",
    "Germany",
    "Belgium",
    "Canada",
    "Italy",
    "Mexico",
    "Netherlands",
    "Poland",
    "Portugal",
    "Spain",
    "Sweden",
    "Switzerland",
    "United Kingdom",
    "United States",
]
in_nutriscore_selection = data_by_countries_nutriscore["countries_en"].isin(nutriscore_countries)
nutriscore_distrib_per_country = data_by_countries_nutriscore.loc[in_nutriscore_selection]

In [None]:
# catplot is figure-level: it builds its own figure (sized via height/aspect),
# so a preceding plt.figure() call would only emit an extra empty figure.
nutriscore_grid = sns.catplot(
    x="countries_en",
    y="len",
    hue="nutriscore_grade",
    data=nutriscore_distrib_per_country,
    kind="bar",
    height=5,
    aspect=5,
)

nutriscore_ax = nutriscore_grid.ax
nutriscore_ax.set_title("Nutriscore grade distribution", size=20)
nutriscore_ax.tick_params(axis="both", labelsize=16)
# The x axis holds countries; the grades are encoded by the bar colour (hue).
nutriscore_ax.set_xlabel("Country", size=16)
nutriscore_ax.set_ylabel("Volume", size=16)
plt.show()

***
# 9 Ecoscore / countries distribution

Let's see now how the ecoscore is distributed by country.

In [None]:
# Row count per (country, ecoscore grade) pair; the count column is named
# "len" after the aggregating function.
ecoscore_groups = data.groupby(["countries_en", "ecoscore_grade_fr"])["ecoscore_grade_fr"]
data_by_countries_ecoscore = ecoscore_groups.agg([len]).reset_index()
data_by_countries_ecoscore.head()

In [None]:
# NOTE(review): the input is the aggregated frame (one row per country/grade
# pair), so if the helper counts rows this shows how many countries have each
# grade, not how many products — confirm this is intended.
feature_distribution_univar(data_by_countries_ecoscore, "ecoscore_grade_fr", "Ecoscore")

In [None]:
# Same country selection as the nutriscore section. "Canada" was misspelled
# "Canad" here, which silently dropped Canada from the ecoscore view.
ecoscore_countries = [
    "France",
    "Germany",
    "Belgium",
    "Canada",
    "Italy",
    "Mexico",
    "Netherlands",
    "Poland",
    "Portugal",
    "Spain",
    "Sweden",
    "Switzerland",
    "United Kingdom",
    "United States",
]
in_ecoscore_selection = data_by_countries_ecoscore["countries_en"].isin(ecoscore_countries)
ecoscore_distrib_per_country = data_by_countries_ecoscore.loc[in_ecoscore_selection]

In [None]:
# catplot is figure-level: it builds its own figure (sized via height/aspect),
# so a preceding plt.figure() call would only emit an extra empty figure.
ecoscore_grid = sns.catplot(
    x="countries_en",
    y="len",
    hue="ecoscore_grade_fr",
    data=ecoscore_distrib_per_country,
    kind="bar",
    height=5,
    aspect=5,
)

ecoscore_ax = ecoscore_grid.ax
# Title and x label previously said "Nutriscore" (copy-paste from section 8).
ecoscore_ax.set_title("Ecoscore grade distribution", size=20)
ecoscore_ax.tick_params(axis="both", labelsize=16)
# The x axis holds countries; the grades are encoded by the bar colour (hue).
ecoscore_ax.set_xlabel("Country", size=16)
ecoscore_ax.set_ylabel("Volume", size=16)
plt.show()

***
# 10 Nutriscore / brands distribution

***
## 10.1 Data transformation

In [None]:
# Summary (count / unique / top / freq for an object column) of the raw brands.
data.brands.describe()

In [None]:
# Project helper turns "brands" into list values (presumably one entry per
# brand — confirm); explode then yields one row per (product, brand) pair.
# Exploded rows share the index label of their source row.
brands_as_lists = feature_to_list(data, "brands")
data = brands_as_lists.explode("brands")

In [None]:
# Per-brand nutriscore statistics, highest mean score first. Rows with any
# missing statistic are dropped before sorting.
brand_stats = (
    data.groupby(["brands"])
    .nutriscore_score.agg(["count", "min", "mean", "max", "std"])
    .dropna()
    .sort_values(by="mean", ascending=False)
    .reset_index()
)

# Keep only brands with at least 10 scored products.
result = brand_stats.loc[brand_stats["count"] >= 10]

# Preview both ends of the ranking.
pd.concat([result.head(15), result.tail(15)])

***
## 10.2 The 10 worst brands

In [None]:
# NOTE: despite its name, `worst_5` holds the 10 brands with the highest mean
# nutriscore (name kept because later cells reference it).
worst_5 = result.head(10)
worst_5

In [None]:
# Nutriscore distribution per brand, restricted to the worst-ranked brands.
worst_brand_names = worst_5["brands"].values
worst_brand_rows = data.loc[data.brands.isin(worst_brand_names)]
feature_distribution_bivar_box(worst_brand_rows, "brands", "nutriscore_score", (20, 10))

***
## 10.3 The 10 best brands

In [None]:
# NOTE: despite its name, `better_5` holds the 10 brands with the lowest mean
# nutriscore (name kept because later cells reference it).
better_5 = result.tail(10)
better_5

In [None]:
# Nutriscore distribution per brand, restricted to the best-ranked brands.
best_brand_names = better_5["brands"].values
best_brand_rows = data.loc[data.brands.isin(best_brand_names)]
feature_distribution_bivar_box(best_brand_rows, "brands", "nutriscore_score", (20, 10))