In [1]:
import pandas as pd
import json
pd.set_option('display.max_colwidth', None)
pd.set_option('display.min_rows', 60)

import matplotlib.pylab as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

- https://www.who.int/news-room/fact-sheets/detail/food-additives
- https://en.wikipedia.org/wiki/Food_additive
- https://www.fda.gov/food/food-ingredients-packaging/overview-food-ingredients-additives-colors
- https://food.ec.europa.eu/safety/food-improvement-agents/additives_en
- https://www.food.gov.uk/safety-hygiene/food-additives

Food additives are substances added to food products during the production process to enhance their taste, appearance, texture, and shelf life. These substances can be natural or synthetic and are used in varying amounts, depending on the type of food product and the desired effect.

Some common types of food additives include preservatives, sweeteners, colorings, flavorings, thickeners, emulsifiers, and stabilizers. Preservatives are used to prevent spoilage and extend the shelf life of food products, while sweeteners are used to add sweetness without adding calories. Colorings are added to enhance the appearance of food products, while flavorings are added to enhance the taste. Thickeners, emulsifiers, and stabilizers are used to give food products a certain texture or consistency.

While many food additives are considered safe, some may have potential health risks, particularly when consumed in large amounts or by people with specific health conditions. As a result, food additives are closely regulated by food safety authorities in many countries.

(by ChatGPT)

Quel additif est le plus souvent avec quel autre (force-directed graph)

In [2]:
FILENAME = '../../../datasets/products_0.995_cleaned.csv'
df = pd.read_csv(FILENAME)

  df = pd.read_csv(FILENAME)


## Adding additives dangerousness (`additives_max_dangerousness`)

In [3]:
from collections import defaultdict

In [4]:
with open('additives_dangerousness.json', 'r') as json_file:
    ADDITIVES_DANGEROUSNESS = { additive['code']: 4 - additive['dangerousnessLevel'] for additive in json.load(json_file) }


NOT_FOUND = []
def get_codes_by_level(level: int) -> list[str]:
    return [k for k,v in ADDITIVES_DANGEROUSNESS if v == level]

def get_dangerousness(additives):
    if type(additives) != str: return 0, 0, 0
    dangerousnesses = []
    n = 0
    for additive in additives.split(','):
        code = additive.split('-')[0].strip()
        if code in ADDITIVES_DANGEROUSNESS:
            dangerousnesses.append(ADDITIVES_DANGEROUSNESS[code])
            n += 1
    if len(dangerousnesses) == 0: return 0, 0, 0
    return min(dangerousnesses), sum(dangerousnesses) / n, max(dangerousnesses)


df[['additives_min_dangerousness', 'additives_average_dangerousness', 'additives_max_dangerousness']] = df.apply(lambda r: get_dangerousness(r['additives']), axis=1, result_type="expand")

FileNotFoundError: [Errno 2] No such file or directory: 'additives_dangerousness.json'

In [None]:

def get_additives_count_hazard(additives, level):
    if type(additives) != str: return 0
    hazard = 0
    for additive in additives.split(','):
        code = additive.split('-')[0].strip()
        if ADDITIVES_DANGEROUSNESS.get(code, -1) == level: hazard += 1
    return hazard

df['additives_0_count'] = df.apply(lambda r: get_additives_count_hazard(r['additives'], 0), axis=1)
df['additives_1_count'] = df.apply(lambda r: get_additives_count_hazard(r['additives'], 1), axis=1)
df['additives_2_count'] = df.apply(lambda r: get_additives_count_hazard(r['additives'], 2), axis=1)
df['additives_3_count'] = df.apply(lambda r: get_additives_count_hazard(r['additives'], 3), axis=1)

## Adding additives count (`additives_count`)

In [None]:
df['has_additives'] = df['additives'].notna()
df['additives_count'] = df['additives'].apply(lambda x: len(x.split(',')) if type(x) == str else 0)
df['ingredients_count'] = df['ingredients_tags'].apply(lambda x: len(x.split(',')) if type(x) == str else 0)

In [None]:
df['has_additives'].mean()

0.45712767696583295

In [None]:
def percentage_containing(df: pd.DataFrame) -> float:
    """
        Returns the percentage of products that contain additives in the
        given dataframe
    """
    return 1 - df['additives'].isnull().mean()

def average_number(df: pd.DataFrame) -> float:
    """
        Returns the average number of additives in products that already contain
        additives
    """
    return df[df['additives'].notnull()]['additives'].str.split(',').apply(len).mean()

def average_number_with_null(df: pd.DataFrame) -> float:
    """
        Returns the average number of additives in products that already contain
        additives
    """
    return df[df['additives'].notnull()]['additives'].str.split(',').apply(len).sum() / len(df)

## Graph : Circle Packing (Bubble Chart) of additives 

In [None]:
bubble_chart_additives = df['additives'].str.split(pat=',').explode(ignore_index=True).value_counts().to_frame().reset_index().head(50)
bubble_chart_additives['dangerousness'] = bubble_chart_additives.apply(lambda r: ADDITIVES_DANGEROUSNESS.get(r['index'].split('-')[0].strip(), 0), axis=1)
bubble_chart_additives.columns = ['additive', 'count', 'dangerosity']
bubble_chart_additives.to_json('graph/bubble_chart_additives.json', orient='records', indent=4)

## Others

In [None]:
def predict(x, y):
    X = df[[y, x]].dropna()
    Y = X[y]
    X = X.drop(columns=[y])

    regressor = LinearRegression()
    regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
    return regressor.intercept_

## Vegan and vegetarian

In [None]:
def get_vegan(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['is_vegan'] == True]

def get_non_vegan(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['is_vegan'] == False]

def get_vegetarian(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['is_vegetarian'] == True]

def get_non_vegetarian(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['is_vegetarian'] == False]

def get_vegan_or_vegetarian(df: pd.DataFrame) -> pd.DataFrame:
    return df[(df['is_vegan'] == True) | (df['is_vegetarian'] == True)]

def get_non_vegan_and_non_vegetarian(df: pd.DataFrame) -> pd.DataFrame:
    return df[(df['is_vegan'] == False) & (df['is_vegetarian'] == False)]

def get_organic(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['is_organic'] == True]

def get_non_organic(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['is_organic'] == False]

In [None]:
additives_percentage_in_organic = percentage_containing(get_organic(df))
additives_percentage_in_non_organic = percentage_containing(get_non_organic(df))
additives_decrease_organic = (additives_percentage_in_non_organic / additives_percentage_in_organic - 1) * 100
print(f"Organic products contain {additives_decrease_organic:.2f}% less additives than non organic")

print(average_number_with_null(get_non_organic(df)) / average_number_with_null(get_organic(df)) * 100)

Organic products contain 148.16% less additives than non organic
439.7392080839228


In [None]:
pv = percentage_containing(get_vegan(df))
pnv = percentage_containing(get_non_vegan(df))
print(f"Vegan products contain {(pnv / pv - 1) * 100:.2f}% less additives than non vegan")

pve = percentage_containing(get_vegetarian(df))
pnve = percentage_containing(get_non_vegetarian(df))
print(f"Vegetarian products contain {(pnve / pve - 1) * 100:.2f}% less additives than non vegetarian")

pvove = percentage_containing(get_vegan_or_vegetarian(df))
pnvanve = percentage_containing(get_non_vegan_and_non_vegetarian(df))
print(f"Vegan or vegetarian products contain {(pnvanve / pvove - 1) * 100:.2f}% less additives than non ones")

Vegan products contain 108.00% less additives than non vegan
Vegetarian products contain 146.90% less additives than non vegetarian
Vegan or vegetarian products contain 146.96% less additives than non ones


In [None]:
av = average_number(get_vegan(df))
anv = average_number(get_non_vegan(df))
print(f"Vegan products containing additives contain {(anv / av - 1) * 100:.2f}% less additives than non vegan")

ave = average_number(get_vegetarian(df))
anve = average_number(get_non_vegetarian(df))
print(f"Vegetarian products containing additives contain {(anve / ave - 1) * 100:.2f}% less additives than non vegetarian")

avove = average_number(get_vegan_or_vegetarian(df))
anvanve = average_number(get_non_vegan_and_non_vegetarian(df))
print(f"Vegan or vegetarian products containing additives contain {(anvanve / avove - 1) * 100:.2f}% less additives than non ones")

Vegan products containing additives contain 68.28% less additives than non vegan
Vegetarian products containing additives contain 63.11% less additives than non vegetarian
Vegan or vegetarian products containing additives contain 63.05% less additives than non ones


## Nutriscore

In [None]:
df.groupby('nutriscore_fr')[['additives_min_dangerousness', 'additives_average_dangerousness', 'additives_max_dangerousness', 'additives_count', 'has_additives']].mean().sort_values(by='additives_average_dangerousness')

Unnamed: 0_level_0,additives_min_dangerousness,additives_average_dangerousness,additives_max_dangerousness,additives_count,has_additives
nutriscore_fr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,0.094908,0.178847,0.279225,0.586899,0.243844
B,0.100906,0.278181,0.495213,1.135834,0.427943
C,0.141228,0.51575,0.979386,1.7875,0.6125
D,0.194289,0.524827,0.938924,1.608947,0.491086
E,0.341737,0.82662,1.417017,2.107493,0.662115


In [None]:
df['nutriscore_fr_as_number'] = df.apply(lambda r: ord(r['nutriscore_fr']) - ord('A') if type(r['nutriscore_fr']) == str else None, axis=1)

In [None]:
predict('nutriscore_fr_as_number', 'additives_count')

0.7461945337535004

In [None]:
predict('nutriscore_fr_as_number', 'additives_average_dangerousness')

0.1635983577826528

In [None]:
df.groupby('nutriscore_fr')[['additives_min_dangerousness', 'additives_average_dangerousness', 'additives_max_dangerousness']].mean().sort_values(by='nutriscore_fr').reset_index().to_json(orient='records')

'[{"nutriscore_fr":"A","additives_min_dangerousness":0.0949079608,"additives_average_dangerousness":0.1788466355,"additives_max_dangerousness":0.2792254363},{"nutriscore_fr":"B","additives_min_dangerousness":0.1009055627,"additives_average_dangerousness":0.2781810528,"additives_max_dangerousness":0.4952134541},{"nutriscore_fr":"C","additives_min_dangerousness":0.1412280702,"additives_average_dangerousness":0.5157502073,"additives_max_dangerousness":0.9793859649},{"nutriscore_fr":"D","additives_min_dangerousness":0.1942885441,"additives_average_dangerousness":0.5248268332,"additives_max_dangerousness":0.9389237372},{"nutriscore_fr":"E","additives_min_dangerousness":0.3417366947,"additives_average_dangerousness":0.8266200253,"additives_max_dangerousness":1.4170168067}]'

## Eco Score

In [None]:
df.groupby('ecoscore_grade')[['additives_min_dangerousness', 'additives_average_dangerousness', 'additives_max_dangerousness', 'additives_count', 'has_additives']].mean().sort_values(by='additives_average_dangerousness')

Unnamed: 0_level_0,additives_min_dangerousness,additives_average_dangerousness,additives_max_dangerousness,additives_count,has_additives
ecoscore_grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,0.098383,0.224964,0.390836,0.986074,0.391285
b,0.118304,0.257959,0.452514,1.048965,0.372494
c,0.175995,0.459262,0.818881,1.295034,0.438313
d,0.197324,0.589893,1.050446,1.548774,0.515608
e,0.27205,0.665674,1.104811,1.576174,0.531501


## Food Category

In [None]:

df['categories_splitted'] = df['categories'].str.split(',')
df.explode(['categories_splitted']).groupby('categories_splitted')['additives_average_dangerousness'].agg(['mean', 'count']).query('count > 50').sort_values(by='mean', ascending=False).head(10)

Unnamed: 0_level_0,mean,count
categories_splitted,Unnamed: 1_level_1,Unnamed: 2_level_1
cured sausages,1.981717,410
raw cured ham,1.848241,199
artificially sweetened beverages,1.818556,66
cured ham,1.80352,232
dried hams,1.750815,184
salami,1.715769,99
dried meats,1.688272,54
chorizo,1.673998,84
processed cheese,1.647783,58
sodas,1.5888,84


In [None]:
from sklearn.metrics import r2_score
from scipy import stats

# Nutriscore

## Charts

In [None]:
df.groupby('nutriscore_fr')[['additives_0_count', 'additives_1_count', 'additives_2_count', 'additives_3_count']].mean().reset_index().to_json('nutriscore_stacked_bar_chart.json', indent=4, orient='records')

In [None]:
df.groupby('nutriscore_fr')[['additives_count']].sum().reset_index().to_json('nutriscore_stacked_bar_chart.json', indent=4, orient='records')

In [None]:
df.groupby('nutriscore_fr')[['additives_min_dangerousness', 'additives_average_dangerousness', 'additives_max_dangerousness']] \
    .mean() \
    .reset_index() \
    .to_json('nutriscore_line_area_chart.json', indent=4, orient='records')

# Vegan & Vegetarian

## Facts

Following a vegetarian diet reduce the average number of additives per product by 1.30

In [None]:
predicted = 'additives_count'

X = df[[predicted, 'is_vegetarian', 'is_vegan', 'is_organic', 'ingredients_count']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

lm = LinearRegression()
lm.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(r2_score(Y, lm.predict(X)))
print(lm.coef_)

params = np.append(lm.intercept_,lm.coef_)
predictions = lm.predict(X)
new_X = np.append(np.ones((len(X),1)), X, axis=1)
M_S_E = (sum((Y-predictions)**2))/(len(new_X)-len(new_X[0]))
v_b = M_S_E*(np.linalg.inv(np.dot(new_X.T,new_X)).diagonal())
s_b = np.sqrt(v_b)
t_b = params/ s_b
p_val =[2*(1-stats.t.cdf(np.abs(i),(len(new_X)-len(new_X[0])))) for i in t_b]
print(f"P-value = {p_val}")

0.40136313700300064
[-0.34880243  0.15029316 -0.56258442  0.07326231]
P-value = [1.0597038802018233e-09, 0.0, 0.0003385430365343556, 0.0, 0.0]




Following a vegetarian diet reduce the average dangerosity of additives by 0.43 

In [None]:
predicted = 'additives_average_dangerousness'

X = df[[predicted, 'is_vegetarian']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

regressor = LinearRegression()
regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(regressor.coef_)

[-0.43289928]


Following a vegan diet reduce the average number of additives per product by 1.11

In [None]:
predicted = 'additives_count'

X = df[[predicted, 'is_vegan']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

regressor = LinearRegression()
regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(regressor.coef_)

[-1.11264023]


Following a vegan diet reduce the average dangerosity of additives by 0.39

In [None]:
predicted = 'additives_average_dangerousness'

X = df[[predicted, 'is_vegan']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

regressor = LinearRegression()
regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(regressor.coef_)

[-0.39077919]


Meat products represent 26.2% of the products in the database but contain 71.4% of the additives

The average hazard of additives in meat products is 3.5 times higher than in non-meat products.

In [None]:
df['is_meat_based'] = df['categories'].str.contains('meat')

def get_meat_based_products(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['is_meat_based'] == True]

def get_above_dangerosity(df: pd.DataFrame, dangerosity: int) -> pd.DataFrame:
    return df[df['additives_min_dangerousness'] > dangerosity]

print(df['is_meat_based'].mean())
print(get_above_dangerosity(get_meat_based_products(df), 2)['additives_count'].sum() / get_above_dangerosity(df, 2)['additives_count'].sum())
print(get_meat_based_products(df)['additives_average_dangerousness'].mean() / df[df['is_meat_based'] == False]['additives_average_dangerousness'].mean())

0.2624280806324765
0.7141414141414142
3.455496730430973


In [None]:
# get_meat_based_products(df)['additives_count'].mean() / get_vegan_or_vegetarian(df)['additives_count'].mean() # ~ 4.77
# get_meat_based_products(df)['additives_average_dangerousness'].mean() / get_vegan_or_vegetarian(df)['additives_average_dangerousness'].mean() # ~7.60

## Graphs

In [None]:
labels = ['Meat based products', 'All products', 'Vegetarian', 'Vegan', 'Vegetarian or Vegan']
average_additives_counts = [
    get_meat_based_products(df)['additives_count'].mean(),
    df['additives_count'].mean(),
    get_vegetarian(df)['additives_count'].mean(),
    get_vegan(df)['additives_count'].mean(),
    get_vegan_or_vegetarian(df)['additives_count'].mean()
]
pd.DataFrame({
    'label': labels,
    'average_additives_count': average_additives_counts,
}).to_json('graph/vegetarian_vegan_additives_count.json', indent=4, orient='records')

In [None]:
labels = ['Meat based products', 'All products', 'Vegetarian', 'Vegan', 'Vegetarian or Vegan']
average_hazards = [
    get_meat_based_products(df)['additives_average_dangerousness'].mean(),
    df['additives_average_dangerousness'].mean(),
    get_vegetarian(df)['additives_average_dangerousness'].mean(),
    get_vegan(df)['additives_average_dangerousness'].mean(),
    get_vegan_or_vegetarian(df)['additives_average_dangerousness'].mean()
]
pd.DataFrame({
    'label': labels,
    'average_hazard': average_hazards
}).to_json('graph/vegetarian_vegan_average_hazards.json', indent=4, orient='records')

# NOVA Index

## Facts

In [None]:
df.groupby('nova_group')['additives_count'].mean()

nova_group
1.0    0.046915
2.0    0.059937
3.0    0.313321
4.0    2.699152
Name: additives_count, dtype: float64

In [None]:
df.groupby('nova_group')['additives_average_dangerousness'].mean()

nova_group
1.0    0.020614
2.0    0.062303
3.0    0.206557
4.0    0.781140
Name: additives_average_dangerousness, dtype: float64

## Graphs

In [None]:
df.groupby('nova_group')[['additives_0_count', 'additives_1_count', 'additives_2_count', 'additives_3_count']] \
    .mean() \
    .reset_index() \
    .to_json('nova_stacked_bar_chart.json', indent=4, orient='records')

In [None]:
from itertools import combinations

force_directed_graph_ = {
    'nodes': [],
    'links': []
}

threshold = 100

additives_links = df['additives'] \
    .str.split(',') \
    .apply(lambda a: list(combinations(a, 2)) if type(a) != float else []) \
    .explode() \
    .value_counts() \
    .to_frame() \
    .reset_index()

additives_links.columns = ['additives', 'value']
additives_links[['source', 'target']] = additives_links['additives'].apply(pd.Series)
additives_links.drop(['additives'], axis=1, inplace=True)
additives_links.drop(additives_links[additives_links.value < threshold].index, inplace = True)
force_directed_graph_['links'] = additives_links.to_dict('records')

s = set()
for link in force_directed_graph_['links']:
    s.add(link['source'])
    s.add(link['target'])
force_directed_graph_['nodes'] = [{ 'id': n, 'group': ADDITIVES_DANGEROUSNESS.get(n.split('-')[0].strip(), 0) } for i, n in enumerate(s)]

with open('additives_force_directed_chart.json', 'w') as json_file:
    json.dump(force_directed_graph_, json_file, indent=4)

# Organic

## Facts

Consuming organic products reduce the average number of additives per product by 1.23

In [None]:
predicted = 'additives_count'

X = df[[predicted, 'is_organic']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

regressor = LinearRegression()
regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(regressor.coef_)

[-1.22839101]


Consuming organic products reduce the average dangerosity of additives by 0.41

In [None]:
predicted = 'additives_average_dangerousness'

X = df[[predicted, 'is_organic']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

regressor = LinearRegression()
regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(regressor.coef_)

[-0.40691048]


In [None]:
print(df[df['is_organic'] == True]['additives_count'].mean())
print(df[df['is_organic'] == False]['additives_count'].mean())
print(df[df['is_organic'] == False]['additives_count'].mean() / df[df['is_organic'] == True]['additives_count'].mean())

0.3615688094685542
1.5899598194354878
4.397392080839228


# MAYBE STOP DOING LINEAR REGRESSION ?

## Organic (same analysis)

## Palm Oil

In [None]:
predicted = 'additives_count'

X = df[[predicted, 'palm_oil']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

regressor = LinearRegression()
regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(regressor.intercept_)
print(regressor.coef_)

1.15931188370361
[0.63501536]


In [None]:
predicted = 'additives_average_dangerousness'

X = df[[predicted, 'palm_oil']].dropna()
Y = X[predicted]
X = X.drop(columns=[predicted])

regressor = LinearRegression()
regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
print(regressor.intercept_)
print(regressor.coef_)

0.427297101189754
[0.01710506]


## Nova
- https://world.openfoodfacts.org/nova
- https://www.nature.com/articles/s41430-022-01099-1
- https://en.wikipedia.org/wiki/Ultra-processed_food

In [None]:
df.groupby('nova_group')[['additives_min_dangerousness', 'additives_average_dangerousness', 'additives_max_dangerousness', 'additives_count', 'has_additives']].mean().sort_values(by='additives_average_dangerousness')

Unnamed: 0_level_0,additives_min_dangerousness,additives_average_dangerousness,additives_max_dangerousness,additives_count,has_additives
nova_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.019903,0.020614,0.021609,0.046915,0.041228
2.0,0.050473,0.062303,0.070978,0.059937,0.048896
3.0,0.137682,0.206557,0.276663,0.313321,0.202049
4.0,0.248572,0.78114,1.446425,2.699152,0.828371


## Brands

In [None]:
a = df.groupby('brands')[['additives_min_dangerousness', 'additives_average_dangerousness', 'additives_max_dangerousness', 'additives_count', 'has_additives']] \
    .agg(['mean', 'count']).sort_values(by=('additives_average_dangerousness', 'mean'), ascending=False)
a[a[('additives_count', 'count')] > 100].head(10)

Unnamed: 0_level_0,additives_min_dangerousness,additives_min_dangerousness,additives_average_dangerousness,additives_average_dangerousness,additives_max_dangerousness,additives_max_dangerousness,additives_count,additives_count,has_additives,has_additives
Unnamed: 0_level_1,mean,count,mean,count,mean,count,mean,count,mean,count
brands,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Monique Ranou,0.339623,106,1.29115,106,2.5,106,3.150943,106,0.90566,106
Dia,0.320388,103,0.73673,103,1.213592,103,2.009709,103,0.582524,103
Leader Price,0.267677,198,0.71186,198,1.257576,198,1.893939,198,0.611111,198
Cora,0.211382,123,0.710548,123,1.300813,123,1.642276,123,0.593496,123
Hacendado,0.267717,127,0.616476,127,0.968504,127,1.472441,127,0.503937,127
Fleury Michon,0.003731,268,0.593781,268,1.261194,268,1.488806,268,0.817164,268
Casino,0.207273,275,0.538504,275,1.021818,275,1.683636,275,0.556364,275
U,0.180662,393,0.499027,393,0.903308,393,1.424936,393,0.496183,393
Auchan,0.140549,619,0.489472,619,0.964459,619,1.749596,619,0.542811,619
Carrefour,0.11192,1099,0.438895,1099,0.871702,1099,1.594177,1099,0.541401,1099


In [None]:
# predicted = 'additives_count'

# X = df[[predicted, 'palm_oil', 'nutriscore_fr_as_number', 'nova_group', 'ingredients_count', 'is_vegan', 'is_vegetarian']].dropna()
# Y = X[predicted]
# X = X.drop(columns=[predicted])

# regressor = LinearRegression()
# regressor.fit(np.array(X).reshape(-1, X.shape[1]), Y)
# print(regressor.intercept_)
# print(regressor.coef_)

# pred = regressor.predict(X)

# print(r2_score(Y, pred))

In [None]:
# bubble_chart_additives = df['additives'].str.split(pat=',').explode(ignore_index=True).value_counts().to_frame().reset_index()
# bubble_chart_additives['dangerousness'] = bubble_chart_additives.apply(lambda r: ADDITIVES_DANGEROUSNESS.get(r['index'].split('-')[0].strip(), 0), axis=1)
# bubble_chart_additives.columns = ['additive', 'count', 'dangerosity']
# bubble_chart_additives['group'] = 1
# bubble_chart_additives.to_json('graph/test.json', orient='records', indent=4)

In [None]:
bubble_chart_additives = df['additives'].str.split(pat=',').explode(ignore_index=True).value_counts().to_frame().reset_index()
bubble_chart_additives['dangerousness'] = bubble_chart_additives.apply(lambda r: ADDITIVES_DANGEROUSNESS.get(r['index'].split('-')[0].strip(), 0), axis=1)
bubble_chart_additives.columns = ['additive', 'count', 'dangerosity']
bubble_chart_additives['group'] = 1
bubble_chart_additives.to_json('graph/test.json', orient='records', indent=4)

test2 = df.explode(['categories_splitted']).groupby('categories_splitted')['additives_average_dangerousness'].agg(['mean', 'count']).query('count > 25').reset_index()
test2.columns = ['categories', 'dangerosity', 'count']
test2['group'] = 1
test2.to_json('test2.json', orient='records', indent=4)

# test2 = df[['product_name', 'additives_average_dangerousness']]
# test2.columns = ['product_name', 'dangerosity']
# test2['group'] = 1
# test2['count'] = 1
# test2.to_json('test2.json', orient='records', indent=4)