In [12]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load the data
# NOTE: this is an absolute, machine-specific path. To run the notebook on
# another machine, point DATA_PATH at your local copy of beverages_combined.csv.
DATA_PATH = 'C:/Users/morit/OneDrive/Dokumente/ESADE/Term3/AI Prototypes/Assignment 2/data/beverages_combined.csv'
data = pd.read_csv(DATA_PATH)
print("Data loaded successfully")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

Data loaded successfully
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10586 entries, 0 to 10585
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Fat Unit                     10586 non-null  object 
 1   Categories                   10586 non-null  object 
 2   Food Groups Tags             10586 non-null  object 
 3   Nutrition Data Prepared Per  10586 non-null  object 
 4   Countries                    10586 non-null  object 
 5   Categories Tags              10586 non-null  object 
 6   Sugars (g)                   10586 non-null  float64
 7   Proteins (g)                 10586 non-null  float64
 8   Proteins Unit                10586 non-null  object 
 9   Nutriscore Grade             10586 non-null  object 
 10  Nutriscore Score             10586 non-null  float64
 11  Ingredients Origin Score     10586 non-null  float64
 12  Saturated Fat (g)            10586 non-null  floa

In [13]:
# Columns used as model inputs, and the column holding the label to predict
target = 'Nutriscore Grade'
features = [
    'Energy (kcal)',
    'Sugars (g)',
    'Fat (g)',
    'Saturated Fat (g)',
    'Salt (g)',
    'Proteins (g)',
]

# Keep only the rows whose grade is one of the five accepted letters
valid_grades = ['a', 'b', 'c', 'd', 'e']
has_valid_grade = data[target].isin(valid_grades)
data_filtered = data[has_valid_grade]

# Feature matrix and label vector, both taken from the filtered frame
X, y = data_filtered[features], data_filtered[target]


In [3]:
# Earlier exploratory column selection (includes the non-numeric 'Categories'
# and 'Brands' columns). It is kept for reference only and is not the input
# of the model trained below.
feature_columns = ['Sugars (g)', 'Proteins (g)', 'Fat (g)', 'Salt (g)', 'Energy (kcal)', 'Categories', 'Brands']
target_column = 'Nutriscore Grade'

# Bind the exploratory selection to its own names instead of X / y.
# Rebinding X / y here would, on a top-to-bottom run, replace the filtered
# frame built from `features` with unfiltered rows that lack
# 'Saturated Fat (g)', and the preprocessing step that selects `features`
# would then fail on the missing column.
X_exploratory = data[feature_columns]
y_exploratory = data[target_column]


In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Data split into training and testing sets")

Data split into training and testing sets


In [15]:
# Numeric preprocessing: fill missing values with the column median, then standardize
numeric_features = features
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Route the selected feature columns through the numeric pipeline
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)],
)

# Full model: preprocessing followed by a decision tree with a fixed seed
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=42)),
    ]
)

# Fit the preprocessing and the classifier together on the training split
clf.fit(X_train, y_train)
print("Model trained successfully")

Model trained successfully


In [16]:
# Predict a grade for every row of the held-out test split
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 of the predictions against the true labels
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           a       0.76      0.94      0.84       287
           b       0.87      0.80      0.83       561
           c       0.80      0.82      0.81       366
           d       0.79      0.82      0.81       327
           e       0.87      0.81      0.84       577

    accuracy                           0.83      2118
   macro avg       0.82      0.84      0.82      2118
weighted avg       0.83      0.83      0.83      2118



In [18]:
# Save the trained model to a file
# The whole fitted pipeline (preprocessing + classifier) is written, so the
# file can be reloaded and given raw feature columns directly.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
MODEL_PATH = 'nutriscore_decision_tree_model.pkl'
joblib.dump(clf, MODEL_PATH)
print("Model saved successfully")


Model saved successfully
