# 1 Dependencies import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scripts.quickml.reader import read_csv
from scripts.quickml.plot import dataframe_distribution_overview, feature_filling, feature_distribution_univar, feature_trend_bivar, feature_distribution_bivar, feature_distribution_multivar, feature_distribution_univar_box
from scripts.quickml.compare import index_lth
from scripts.utils import fill_infos

# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

np.random.seed(294697)

***
# 2 Data import

For performance reasons, I chose to write a function that reads the dataset in chunks and keeps only a percentage of it. Rows are picked at random within each chunk, so the resulting dataset has the same overall shape as the original one.

In [None]:
# Load the tab-separated Open Food Facts export through the project's chunked reader.
# NOTE(review): the positional `5` is presumably the percentage of rows kept from each
# chunk (the text above talks about taking "a percentage") — confirm in scripts.quickml.reader.
original_dataset = read_csv(pd, np, "./datas/en.openfoodfacts.org.products.csv", 5, delimiter="\t", chunk_size=10000, nrows=None)

In [None]:
rows_nb, cols_nb = original_dataset.shape
original_dataset.shape

In [None]:
original_dataset.head()

In [None]:
data = original_dataset.copy()

***
# 3 Cleaning

In [None]:
dataframe_distribution_overview(data, figsize=(60, 2))

### Shape comparison with original dataset

Original dataset  
![dataset_100_percent](images/global_overview_100p.png)

In [None]:
# Placeholder only: `features` is rebound to the joined DataFrame of all cleaned
# groups in section 3.6, so this empty list is never read.
features = []

***
## 3.1 General information group

### 3.1.1 Overview

In [None]:
general_infos_features = data.loc[:, "code":"quantity"].columns
general_infos_features

In [None]:
# Take an explicit copy: the columns of this group are dropped in place further down,
# and `data` must stay intact for the "Original" comparison plot.
general_infos = data.loc[:, general_infos_features].copy()

In [None]:
dataframe_distribution_overview(general_infos)

In [None]:
general_infos.head()

In [None]:
general_infos.info()

***
### 3.1.2 Emptiness

In [None]:
dataframe_distribution_overview(general_infos)

In [None]:
general_infos_empty_features = index_lth(pd, np, general_infos, 10)
general_infos_empty_features

In [None]:
general_infos.drop(columns=general_infos_empty_features, inplace=True)

***
### 3.1.3 Redundancy

In [None]:
dataframe_distribution_overview(general_infos)

***
#### 3.1.3.1 created_t, created_datetime

In [None]:
general_infos.loc[:, ["created_t", "created_datetime"]].head()

In [None]:
feature_distribution_multivar([
    ("Created timestamp", general_infos["created_t"].astype("category").cat.codes),
    ("Created datetime", general_infos["created_datetime"].astype("category").cat.codes)
])

In [None]:
# Correlate the category codes of the timestamp and of its datetime counterpart.
# Column labels mirror the source feature names (no "tags" feature is involved here).
temp = pd.DataFrame({
    "created_t_code": general_infos["created_t"].astype("category").cat.codes,
    "created_datetime_code": general_infos["created_datetime"].astype("category").cat.codes,
})

temp.corr()

Because "created_datetime" is just the datetime representation of the "created_t" timestamp feature, we will keep only "created_t".

In [None]:
general_infos.drop(columns=["created_datetime"], inplace=True)

***
#### 3.1.3.2 last_modified_t, last_modified_datetime

In [None]:
general_infos.loc[:, ["last_modified_t", "last_modified_datetime"]].head()

In [None]:
feature_distribution_multivar([
    ("Last_modified timestamp", general_infos["last_modified_t"].astype("category").cat.codes),
    ("Last_modified datetime", general_infos["last_modified_datetime"].astype("category").cat.codes)
])

In [None]:
# Correlate the category codes of the timestamp and of its datetime counterpart.
# Column labels mirror the source feature names (no "tags" feature is involved here).
temp = pd.DataFrame({
    "last_modified_t_code": general_infos["last_modified_t"].astype("category").cat.codes,
    "last_modified_datetime_code": general_infos["last_modified_datetime"].astype("category").cat.codes,
})

temp.corr()

Because "last_modified_datetime" is just the datetime representation of the "last_modified_t" timestamp feature, we will keep only "last_modified_t".

In [None]:
general_infos.drop(columns=["last_modified_datetime"], inplace=True)

***
### 3.1.4 Conclusion

In [None]:
dataframe_distribution_overview(general_infos)

#### 3.1.4.1 Original

In [None]:
dataframe_distribution_overview(data[general_infos_features])

***
## 3.2 Tags group

### 3.2.1 Overview

In [None]:
tags_features = data.loc[:, "packaging":"countries_en"].columns
tags_features

In [None]:
# Take an explicit copy: the columns of this group are dropped in place further down,
# and `data` must stay intact for the "Original" comparison plot.
tags = data.loc[:, tags_features].copy()

In [None]:
dataframe_distribution_overview(tags)

In [None]:
tags.head()

In [None]:
tags.iloc[:, :13].info()

In [None]:
tags.iloc[:, 13:].info()

***
### 3.2.2 Emptiness

In [None]:
dataframe_distribution_overview(tags)

In [None]:
tags_empty_features = index_lth(pd, np, tags, 10)
tags_empty_features

In [None]:
tags.drop(columns=tags_empty_features, inplace=True)

***
### 3.2.3 Redundancy

In [None]:
dataframe_distribution_overview(tags)

#### 3.2.3.1 Redundancy: packaging, packaging_tags

In [None]:
tags.loc[:, ["packaging", "packaging_tags"]].loc[tags.packaging.notna() & tags.packaging_tags.notna()].iloc[:20]

In [None]:
# Pass both candidate-duplicate features, encoded as category codes, to the
# multivariate distribution helper.
feature_distribution_multivar([
    ("Packaging", tags["packaging"].astype("category").cat.codes),
    ("Packaging tags", tags["packaging_tags"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["packaging_code"] = tags["packaging"].astype("category").cat.codes
temp["packaging_tags_code"] = tags["packaging_tags"].astype("category").cat.codes

temp.corr()

We can see that each time one of the features "packaging" or "packaging_tags" is filled, the other one is filled too.
We will keep the feature "packaging" because it carries more information; in addition, it is easier to strip the "en:" / "fr:" prefixes from it than the "en-" prefixes from "packaging_tags", where every separator is a "-".

In [None]:
tags.drop(columns=["packaging_tags"], inplace=True)

***
#### 3.2.3.2 Redundancy: brands, brands_tags

In [None]:
tags.iloc[:20].loc[:, ["brands", "brands_tags"]]

In [None]:
feature_distribution_multivar([
    ("Brands", tags["brands"].astype("category").cat.codes),
    ("Brands tags", tags["brands_tags"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["brands_code"] = tags["brands"].astype("category").cat.codes
temp["brands_tags_code"] = tags["brands_tags"].astype("category").cat.codes

temp.corr()

The features "brands" and "brands_tags" are almost equally filled. We will keep "brands" because this feature carries the most information (spaces, capital letters...).

In [None]:
tags.drop(columns=["brands_tags"], inplace=True)

***
#### 3.2.3.3 Redundancy: categories, categories_tags, categories_en

In [None]:
tags.loc[:, ["categories", "categories_tags", "categories_en"]].iloc[:20]

In [None]:
feature_distribution_multivar([
    ("Categories", tags["categories"].astype("category").cat.codes),
    ("Categories tags", tags["categories_tags"].astype("category").cat.codes),
    ("Categories en", tags["categories_en"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["categories_code"] = tags["categories"].astype("category").cat.codes
temp["categories_tags_code"] = tags["categories_tags"].astype("category").cat.codes
temp["categories_en_code"] = tags["categories_en"].astype("category").cat.codes

temp.corr()

We will keep "categories_en".

In [None]:
tags.drop(columns=["categories", "categories_tags"], inplace=True)

***
#### 3.2.3.4 Redundancy: labels, labels_tags, labels_en

In [None]:
# Preview rows where all three label variants are filled, so they can be compared side by side.
labels_all_filled = tags.labels.notna() & tags.labels_tags.notna() & tags.labels_en.notna()
tags.loc[labels_all_filled, ["labels", "labels_tags", "labels_en"]].iloc[:20]

In [None]:
feature_distribution_multivar([
    ("Labels", tags["labels"].astype("category").cat.codes),
    ("Labels tags", tags["labels_tags"].astype("category").cat.codes),
    ("Labels en", tags["labels_en"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["labels_code"] = tags["labels"].astype("category").cat.codes
temp["labels_tags_code"] = tags["labels_tags"].astype("category").cat.codes
temp["labels_en_code"] = tags["labels_en"].astype("category").cat.codes

temp.corr()

We will keep "labels_en"

In [None]:
tags.drop(columns=["labels", "labels_tags"], inplace=True)

***
#### 3.2.3.5 Redundancy: countries, countries_tags, countries_en

In [None]:
tags.iloc[:20,:].loc[:,["countries", "countries_tags", "countries_en"]]

In [None]:
feature_distribution_multivar([
    ("countries", tags["countries"].astype("category").cat.codes),
    ("countries tags", tags["countries_tags"].astype("category").cat.codes),
    ("countries en", tags["countries_en"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["countries_code"] = tags["countries"].astype("category").cat.codes
temp["countries_tags_code"] = tags["countries_tags"].astype("category").cat.codes
temp["countries_en_code"] = tags["countries_en"].astype("category").cat.codes

temp.corr()

In [None]:
tags.drop(columns=["countries", "countries_tags"], inplace=True)

***
### 3.2.4 Conclusion

In [None]:
tags.describe()

In [None]:
tags.info()

In [None]:
dataframe_distribution_overview(tags)

#### 3.2.4.1 Original

In [None]:
dataframe_distribution_overview(data[tags_features])

***
## 3.3 Ingredients group

### 3.3.1 Overview

In [None]:
ingredients_features = data.loc[:, "ingredients_text":"traces_tags"].columns
ingredients_features

In [None]:
# Take an explicit copy: the columns of this group are dropped in place further down,
# and `data` must stay intact for the "Original" comparison plot.
ingredients = data.loc[:, ingredients_features].copy()

In [None]:
dataframe_distribution_overview(ingredients)

In [None]:
ingredients.info()

In [None]:
ingredients.head()

***
### 3.3.2 Emptiness

In [None]:
dataframe_distribution_overview(ingredients)

In [None]:
ingredients_empty_features = index_lth(pd, np, ingredients, 10)
ingredients_empty_features

In [None]:
ingredients.drop(columns=ingredients_empty_features, inplace=True)

***
### 3.3.3 Redundancy

In [None]:
dataframe_distribution_overview(ingredients)

***
### 3.3.4 Conclusion

In [None]:
ingredients.describe()

In [None]:
ingredients.info()

In [None]:
dataframe_distribution_overview(ingredients)

#### 3.3.4.1 Original

In [None]:
dataframe_distribution_overview(data[ingredients_features])

***
## 3.4 Miscellaneous data group

### 3.4.1 Overview

In [None]:
misc_features = data.loc[:,"serving_size":"image_nutrition_small_url"].columns
misc_features

In [None]:
# Take an explicit copy: this group has columns dropped in place and values capped
# further down, and `data` must stay intact for the "Original" comparison plot.
misc = data.loc[:, misc_features].copy()

In [None]:
dataframe_distribution_overview(misc)

In [None]:
misc.iloc[:, :16].info()

In [None]:
misc.iloc[:, 16:].info()

In [None]:
misc.head(1)

***
### 3.4.2 Emptiness

In [None]:
dataframe_distribution_overview(misc)

In [None]:
misc_empty_features = index_lth(pd, np, misc, 10)
misc_empty_features

In [None]:
misc.drop(columns=misc_empty_features, inplace=True)

***
### 3.4.3 Redundancy

In [None]:
dataframe_distribution_overview(misc)

#### 3.4.3.1 Redundancy: serving_size, serving_quantity

In [None]:
misc.loc[:, ["serving_size", "serving_quantity"]].loc[misc.serving_size.notna() & misc.serving_quantity.notna()].head()

In [None]:
# Pass both candidate-duplicate features, encoded as category codes, to the
# multivariate distribution helper.
feature_distribution_multivar([
    ("Serving size", misc["serving_size"].astype("category").cat.codes),
    ("Serving quantity", misc["serving_quantity"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["serving_size_code"] = misc["serving_size"].astype("category").cat.codes
temp["serving_quantity_code"] = misc["serving_quantity"].astype("category").cat.codes

temp.corr()

It appears that the feature "serving_size" is not homogeneous in terms of units. The feature "serving_quantity", on the other hand, seems to always be expressed in the same unit (g).
We will keep "serving_quantity".

In [None]:
misc.drop(columns=["serving_size"], inplace=True)

***
#### 3.4.3.2 Redundancy: additives_n, additives_tags, additives_en

In [None]:
misc.loc[:, ["additives_n", "additives_tags", "additives_en"]].loc[misc.additives_n.notna() & misc.additives_tags.notna() & misc.additives_en.notna()].head()

In [None]:
feature_distribution_multivar([
    ("Additives count", misc["additives_n"]),
    ("Additives tags", misc["additives_tags"].astype("category").cat.codes),
    ("Additives en", misc["additives_en"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["additives_n"] = misc["additives_n"]
temp["additives_tags_code"] = misc["additives_tags"].astype("category").cat.codes
temp["additives_en_code"] = misc["additives_en"].astype("category").cat.codes

temp.corr()

The feature "additives_en" contains more information and seems easier to parse than "additives_tags". As for "additives_n", it is the number of additives; nothing is directly redundant with it, so we can keep it.
We will keep "additives_en" and "additives_n".

In [None]:
misc.drop(columns=["additives_tags"], inplace=True)

***
#### 3.4.3.3 Redundancy: ingredients_from_palm_oil_n, ingredients_that_may_be_from_palm_oil_n

In [None]:
misc.loc[:, ["ingredients_from_palm_oil_n", "ingredients_that_may_be_from_palm_oil_n"]].loc[misc.ingredients_from_palm_oil_n.notna() & misc.ingredients_that_may_be_from_palm_oil_n.notna()].head()

In [None]:
feature_distribution_multivar([
    ("Ingredients from palm oil count", misc["ingredients_from_palm_oil_n"]),
    ("Ingredients that may be from palm oil count", misc["ingredients_that_may_be_from_palm_oil_n"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["ingredients_from_palm_oil_n"] = misc["ingredients_from_palm_oil_n"]
temp["ingredients_that_may_be_from_palm_oil_n_code"] = misc["ingredients_that_may_be_from_palm_oil_n"].astype("category").cat.codes

temp.corr()

***
#### 3.4.3.4 Redundancy: nutriscore_score, nutriscore_grade

In [None]:
misc.loc[:, ["nutriscore_score", "nutriscore_grade"]].loc[misc.nutriscore_score.notna() & misc.nutriscore_grade.notna()].iloc[:20]

In [None]:
feature_distribution_multivar([
    ("Nutriscore score", misc["nutriscore_score"].astype("category").cat.codes),
    ("Nutriscore grade", misc["nutriscore_grade"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["nutriscore_score_code"] = misc["nutriscore_score"].astype("category").cat.codes
temp["nutriscore_grade_code"] = misc["nutriscore_grade"].astype("category").cat.codes

temp.corr()

We will keep both fields, as one is quantitative and the other is qualitative.

***
#### 3.4.3.5 Redundancy: pnns_groups_1, pnns_groups_2

In [None]:
misc.loc[:, ["pnns_groups_1", "pnns_groups_2"]].loc[misc.pnns_groups_1.notna()].iloc[:20]

In [None]:
feature_distribution_multivar([
    ("Pnns groups 1", misc["pnns_groups_1"].astype("category").cat.codes),
    ("Pnns groups 2", misc["pnns_groups_2"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["pnns_groups_1_code"] = misc["pnns_groups_1"].astype("category").cat.codes
temp["pnns_groups_2_code"] = misc["pnns_groups_2"].astype("category").cat.codes

temp.corr()

The two features do not contain the same values, so we will keep both.

***
#### 3.4.3.6 Redundancy: states, states_tags, states_en

In [None]:
misc.loc[:, ["states", "states_tags", "states_en"]].iloc[:10]

In [None]:
feature_distribution_multivar([
    ("States", misc["states"].astype("category").cat.codes),
    ("States tags", misc["states_tags"].astype("category").cat.codes),
    ("States en", misc["states_en"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["states_code"] = misc["states"].astype("category").cat.codes
temp["states_tags_code"] = misc["states_tags"].astype("category").cat.codes
temp["states_en_code"] = misc["states_en"].astype("category").cat.codes

temp.corr()

They all seem to be useless. Let's see how many rows do not contain 'to-be-completed' or 'To be completed' in these features.

These features contain no useful information, so we will get rid of them.

In [None]:
misc.drop(columns=["states", "states_tags", "states_en"], inplace=True)

***
#### 3.4.3.7 Redundancy: ecoscore_score_fr, ecoscore_grade_fr

In [None]:
misc.loc[:, ["ecoscore_score_fr", "ecoscore_grade_fr"]].loc[misc.ecoscore_score_fr.notna() & misc.ecoscore_grade_fr.notna()].iloc[:20]

In [None]:
# Pass the ecoscore score and grade, encoded as category codes, to the multivariate
# distribution helper; each label names the feature it is attached to.
feature_distribution_multivar([
    ("Ecoscore score fr", misc["ecoscore_score_fr"].astype("category").cat.codes),
    ("Ecoscore grade fr", misc["ecoscore_grade_fr"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["ecoscore_score_fr_code"] = misc["ecoscore_score_fr"].astype("category").cat.codes
temp["ecoscore_grade_fr_code"] = misc["ecoscore_grade_fr"].astype("category").cat.codes

temp.corr()

As with the nutriscore, both ecoscore features are useful, so we will keep both.

***
#### 3.4.3.8 Redundancy: main_category, main_category_en

In [None]:
misc.loc[:, ["main_category", "main_category_en"]].iloc[:20]

In [None]:
feature_distribution_multivar([
    ("main_category", misc["main_category"].astype("category").cat.codes),
    ("main_category en", misc["main_category_en"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["main_category_code"] = misc["main_category"].astype("category").cat.codes
temp["main_category_en_code"] = misc["main_category_en"].astype("category").cat.codes

temp.corr()

We will keep "main_category_en"

In [None]:
misc.drop(columns=["main_category"], inplace=True)

***
#### 3.4.3.9 Redundancy: image_url, image_small_url

In [None]:
misc.loc[:, ["image_url", "image_small_url"]].loc[misc.image_url.notna() & misc.image_small_url.notna()].head()

In [None]:
feature_distribution_multivar([
    ("Image url", misc["image_url"].astype("category").cat.codes),
    ("Image small url", misc["image_small_url"].astype("category").cat.codes)
])

In [None]:
# Correlate the category codes of the full-size and small image URLs.
# Column labels mirror the source feature names.
temp = pd.DataFrame({
    "image_url_code": misc["image_url"].astype("category").cat.codes,
    "image_small_url_code": misc["image_small_url"].astype("category").cat.codes,
})

temp.corr()

In [None]:
misc.drop(columns=["image_small_url"], inplace=True)

***
#### 3.4.3.10 Redundancy: image_ingredients_url, image_ingredients_small_url

In [None]:
misc.loc[:, ["image_ingredients_url", "image_ingredients_small_url"]].loc[misc.image_ingredients_url.notna() & misc.image_ingredients_small_url.notna()].head()

In [None]:
feature_distribution_multivar([
    ("Image ingredients url", misc["image_ingredients_url"].astype("category").cat.codes),
    ("Image ingredients small url", misc["image_ingredients_small_url"].astype("category").cat.codes)
])

In [None]:
# Correlate the category codes of the full-size and small ingredients-image URLs.
# Column labels mirror the source feature names.
temp = pd.DataFrame({
    "image_ingredients_url_code": misc["image_ingredients_url"].astype("category").cat.codes,
    "image_ingredients_small_url_code": misc["image_ingredients_small_url"].astype("category").cat.codes,
})

temp.corr()

In [None]:
misc.drop(columns=["image_ingredients_small_url"], inplace=True)

***
#### 3.4.3.11 Redundancy: image_nutrition_url, image_nutrition_small_url

In [None]:
misc.loc[:, ["image_nutrition_url", "image_nutrition_small_url"]].loc[misc.image_nutrition_url.notna() & misc.image_nutrition_small_url.notna()].head()

In [None]:
feature_distribution_multivar([
    ("Image nutrition url", misc["image_nutrition_url"].astype("category").cat.codes),
    ("Image nutrition small url", misc["image_nutrition_small_url"].astype("category").cat.codes)
])

In [None]:
# Correlate the category codes of the full-size and small nutrition-image URLs.
# Column labels mirror the source feature names.
temp = pd.DataFrame({
    "image_nutrition_url_code": misc["image_nutrition_url"].astype("category").cat.codes,
    "image_nutrition_small_url_code": misc["image_nutrition_small_url"].astype("category").cat.codes,
})

temp.corr()

In [None]:
misc.drop(columns=["image_nutrition_small_url"], inplace=True)

***
### 3.4.4 Usability

In [None]:
dataframe_distribution_overview(misc, figsize=(30, 2))

In [None]:
misc.info()

***
#### 3.4.4.1 Usability: serving_quantity

In [None]:
feature_distribution_univar_box(misc, "serving_quantity")

In [None]:
misc.serving_quantity.describe()

In [None]:
# Cap extreme serving quantities at one standard deviation above the mean.
# The threshold is derived from the column's current content, so re-running this
# cell recomputes it from already-capped values.
sq_mean = misc["serving_quantity"].mean()
sq_std = misc["serving_quantity"].std()
sq_max_ok = sq_mean + sq_std
# Vectorised cap; missing values stay missing.
misc["serving_quantity"] = misc["serving_quantity"].clip(upper=sq_max_ok)

In [None]:
feature_distribution_univar_box(misc, "serving_quantity")

***
#### 3.4.4.2 Usability: additives_n

In [None]:
feature_distribution_univar_box(misc, "additives_n")

In [None]:
misc.additives_n.describe()

In [None]:
misc.loc[misc.additives_n.notna()].additives_n.head(10)

***
#### 3.4.4.3 Usability: ingredients_from_palm_oil_n

In [None]:
feature_distribution_univar_box(misc, "ingredients_from_palm_oil_n")

In [None]:
misc.ingredients_from_palm_oil_n.describe()

***
#### 3.4.4.4 Usability: ingredients_that_may_be_from_palm_oil_n

In [None]:
feature_distribution_univar_box(misc, "ingredients_that_may_be_from_palm_oil_n")

In [None]:
misc.ingredients_that_may_be_from_palm_oil_n.describe()

***
#### 3.4.4.5 Usability: nutriscore_score

In [None]:
feature_distribution_univar_box(misc, "nutriscore_score")

***
#### 3.4.4.6 Usability: ecoscore_score_fr

In [None]:
feature_distribution_univar_box(misc, "ecoscore_score_fr")

In [None]:
misc.ecoscore_score_fr.describe()

***
### 3.4.5 Conclusion

In [None]:
misc.info()

In [None]:
misc.describe()

In [None]:
misc.loc[:, :"main_category_en"].describe(include=[object])

In [None]:
dataframe_distribution_overview(misc)

#### 3.4.5.1 Original

In [None]:
dataframe_distribution_overview(data[misc_features])

***
## 3.5 Nutrition facts group

### 3.5.1 Overview

In [None]:
nutr_features = data.loc[:, "energy-kj_100g":"carnitine_100g"].columns
nutr_features

In [None]:
# Take an explicit copy: this group has columns dropped in place and values capped
# further down, and `data` must stay intact for the "Original" comparison plot.
nutr = data.loc[:, nutr_features].copy()

In [None]:
dataframe_distribution_overview(nutr, figsize=(30, 2))

In [None]:
# Print the column summary ten columns at a time so each listing stays readable.
# Iterating over the actual column count (rather than a fixed set of slices ending
# at column 120) guarantees that no nutrition column is left out of the overview.
info_batch_size = 10
for batch_start in range(0, nutr.shape[1], info_batch_size):
    nutr.iloc[:, batch_start:batch_start + info_batch_size].info()

***
### 3.5.2 Emptiness

In [None]:
dataframe_distribution_overview(nutr, figsize=(30, 2))

In [None]:
nutr_empty_features = index_lth(pd, np, nutr, 10)
nutr_empty_features

In [None]:
nutr.drop(columns=nutr_empty_features, inplace=True)

***
### 3.5.3 Redundancy

In [None]:
dataframe_distribution_overview(nutr, figsize=(30, 2))

#### 3.5.3.1 Redundancy: nutrition-score-fr_100g

nutrition-score-fr_100g is redundant with "nutriscore_score" and "nutriscore_grade". It will be dropped.

In [None]:
feature_distribution_multivar([
    ("nutriscore score", data["nutriscore_score"].astype("category").cat.codes),
    ("nutriscore grade", data["nutriscore_grade"].astype("category").cat.codes),
    ("nutriscore fr 100g", nutr["nutrition-score-fr_100g"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["nutriscore_score_code"] = data["nutriscore_score"].astype("category").cat.codes
temp["nutriscore_grade_code"] = data["nutriscore_grade"].astype("category").cat.codes
temp["nutrition-score-fr_100g_code"] = nutr["nutrition-score-fr_100g"].astype("category").cat.codes

temp.corr()

In [None]:
nutr.drop(columns=["nutrition-score-fr_100g"], inplace=True)

***
#### 3.5.3.2 Redundancy: energy-kj_100g, energy-kcal_100g, energy_100g

In [None]:
feature_distribution_multivar([
    ("Energy Kcal 100g", nutr["energy-kcal_100g"].astype("category").cat.codes),
    ("Energy 100g", nutr["energy_100g"].astype("category").cat.codes)
])

In [None]:
temp = pd.DataFrame()
temp["energy-kcal_100g_code"] = nutr["energy-kcal_100g"].astype("category").cat.codes
temp["energy_100g_code"] = nutr["energy_100g"].astype("category").cat.codes

temp.corr()

In [None]:
nutr.drop(columns=["energy_100g"], inplace=True)

***
### 3.5.4 Usability

In [None]:
dataframe_distribution_overview(nutr, figsize=(30, 2))

***
#### 3.5.4.1 Usability: fat_100g

In [None]:
feature_distribution_univar_box(nutr, "fat_100g")

In [None]:
nutr["fat_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["fat_100g"] = nutr["fat_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "fat_100g")

***
#### 3.5.4.2 Usability: saturated-fat_100g

In [None]:
feature_distribution_univar_box(nutr, "saturated-fat_100g")

In [None]:
nutr["saturated-fat_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["saturated-fat_100g"] = nutr["saturated-fat_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "saturated-fat_100g")

***
#### 3.5.4.3 Usability: trans-fat_100g

In [None]:
feature_distribution_univar_box(nutr, "trans-fat_100g")

In [None]:
nutr["trans-fat_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["trans-fat_100g"] = nutr["trans-fat_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "trans-fat_100g")

***
#### 3.5.4.4 Usability: cholesterol_100g

In [None]:
feature_distribution_univar_box(nutr, "cholesterol_100g")

In [None]:
nutr["cholesterol_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["cholesterol_100g"] = nutr["cholesterol_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "cholesterol_100g")

***
#### 3.5.4.5 Usability: carbohydrates_100g

In [None]:
feature_distribution_univar_box(nutr, "carbohydrates_100g")

In [None]:
nutr["carbohydrates_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["carbohydrates_100g"] = nutr["carbohydrates_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "carbohydrates_100g")

***
#### 3.5.4.6 Usability: sugars_100g

In [None]:
feature_distribution_univar_box(nutr, "sugars_100g")

In [None]:
nutr["sugars_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["sugars_100g"] = nutr["sugars_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "sugars_100g")

***
#### 3.5.4.7 Usability: fiber_100g

In [None]:
feature_distribution_univar_box(nutr, "fiber_100g")

In [None]:
nutr["fiber_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["fiber_100g"] = nutr["fiber_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "fiber_100g")

***
#### 3.5.4.8 Usability: proteins_100g

In [None]:
feature_distribution_univar_box(nutr, "proteins_100g")

In [None]:
nutr["proteins_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["proteins_100g"] = nutr["proteins_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "proteins_100g")

***
#### 3.5.4.9 Usability: salt_100g

In [None]:
feature_distribution_univar_box(nutr, "salt_100g")

In [None]:
nutr["salt_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["salt_100g"] = nutr["salt_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "salt_100g")

***
#### 3.5.4.10 Usability: sodium_100g

In [None]:
feature_distribution_univar_box(nutr, "sodium_100g")

In [None]:
nutr["sodium_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["sodium_100g"] = nutr["sodium_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "sodium_100g")

***
#### 3.5.4.11 Usability: vitamin-a_100g

In [None]:
feature_distribution_univar_box(nutr, "vitamin-a_100g")

In [None]:
nutr["vitamin-a_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["vitamin-a_100g"] = nutr["vitamin-a_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "vitamin-a_100g")

***
#### 3.5.4.12 Usability: vitamin-c_100g

In [None]:
feature_distribution_univar_box(nutr, "vitamin-c_100g")

In [None]:
nutr["vitamin-c_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["vitamin-c_100g"] = nutr["vitamin-c_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "vitamin-c_100g")

***
#### 3.5.4.13 Usability: calcium_100g

In [None]:
feature_distribution_univar_box(nutr, "calcium_100g")

In [None]:
nutr["calcium_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["calcium_100g"] = nutr["calcium_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "calcium_100g")

***
#### 3.5.4.14 Usability: iron_100g

In [None]:
feature_distribution_univar_box(nutr, "iron_100g")

In [None]:
nutr["iron_100g"].describe()

In [None]:
# A quantity per 100 g cannot exceed 100: cap out-of-range values.
# Series.clip keeps NaN as NaN, so missing values are not mistaken for out-of-range ones.
nutr["iron_100g"] = nutr["iron_100g"].clip(upper=100)

In [None]:
feature_distribution_univar_box(nutr, "iron_100g")

***
### 3.5.5 Conclusion

In [None]:
nutr.info()

In [None]:
nutr.describe()

In [None]:
dataframe_distribution_overview(nutr)

#### 3.5.5.1 Original

In [None]:
dataframe_distribution_overview(data[nutr_features], figsize=(30, 2))

***
## 3.6 Global

### 3.6.1 Overview

In [None]:
# Reassemble the cleaned groups column-wise into a single frame.
# DataFrame.join aligns on the row index; all five frames were sliced from `data`
# and only had columns dropped or values replaced, so they share the same index.
features = general_infos.join([tags, ingredients, misc, nutr])

In [None]:
features.shape

In [None]:
dataframe_distribution_overview(features, figsize=(30, 2))

***
### 3.6.2 Redundancy

In [None]:
dataframe_distribution_overview(features, figsize=(30, 2))

***
#### 3.6.2.1 Redundancy: categories_en, main_category_en

In [None]:
features.loc[:, ["categories_en", "main_category_en"]].loc[features.categories_en.notna() & features.main_category_en.notna()].head()

In [None]:
feature_distribution_multivar([
    ("Categories", features["categories_en"].astype("category").cat.codes),
    ("Main category", features["main_category_en"].astype("category").cat.codes),
])

In [None]:
temp = pd.DataFrame()
temp["categories_en_code"] = features["categories_en"].astype("category").cat.codes
temp["main_category_en_code"] = features["main_category_en"].astype("category").cat.codes

temp.corr()

***
## 3.7 Conclusion

In [None]:
features.shape

In [None]:
dataframe_distribution_overview(features, figsize=(30, 2))

***
# 4 Saving

In [None]:
features.shape

In [None]:
# Persist the cleaned sample as a ';'-separated file. No `index` argument is given,
# so the row index is written as the first column (pandas default).
features.to_csv("datas/sample.csv", sep=";")