Standard_Scaled_Dataset (text,label,Label_embaded,OHE) ->

In [1]:
import pandas as pd

df = pd.read_csv('/content/final_dataset._v2.csv')
df.head()

Unnamed: 0,energy_100g,saturated_fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,fruits_veg_nuts_100g,sweetener_present,nutriscore_grade
0,6000.0,0.0,37.5,7.5,2.5,0.0,75.0,0,c
1,840.0,0.0,8.0,88.0,0.0,0.23,50.0,0,e
2,962.0,2.0,0.98,9.0,22.0,0.95,0.0,0,c
3,67.0,0.06,0.24,88.0,1.0,0.3,0.0,0,b
4,1852.0,2.6,25.0,1.4,0.5,0.53,22.666667,0,e


In [2]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = [
    'energy_100g',
     'saturated_fat_100g',
     'sugars_100g',
     'fiber_100g',
     'proteins_100g',
     'salt_100g',
     'fruits_veg_nuts_100g']

scaler = StandardScaler()

df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df.head()

Unnamed: 0,energy_100g,saturated_fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,fruits_veg_nuts_100g,sweetener_present,nutriscore_grade
0,14.847065,-0.205128,2.526062,-0.003399,0.115812,-0.059011,1.955972,0,c
1,1.546936,-0.205128,-0.012153,-0.003399,-0.176308,-0.015551,1.137043,0,e
2,1.861396,0.544892,-0.616162,-0.003399,2.394351,0.120498,-0.500817,0,c
3,-0.445506,-0.182628,-0.679833,-0.003399,-0.05946,-0.002324,-0.500817,0,b
4,4.155411,0.769898,1.450547,-0.003399,-0.117884,0.041136,0.241679,0,e


In [3]:
columns_to_include = [
    'energy_100g',
     'saturated_fat_100g',
     'sugars_100g',
     'fiber_100g',
     'proteins_100g',
     'salt_100g',
     'fruits_veg_nuts_100g']

df["text"] = df[columns_to_include].apply(lambda row: ','.join(f"{col}:{int(row[col]) if row[col] % 1 == 0 else row[col]}" for col in columns_to_include), axis=1)

**Label Encoding best for Tree-based models (Decision Trees, Random Forest, XGBoost, CatBoost, LightGBM)**

In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df["label_encoded"] = label_encoder.fit_transform(df["nutriscore_grade"])

**One-Hot Encoding best for Logistic Regression, SVM, KNN, Naïve Bayes, Neural Networks (MLP, LSTM, CNN)**

In [6]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, dtype=int)

encoded_labels = encoder.fit_transform(df[['nutriscore_grade']])

ohe_df = pd.DataFrame(encoded_labels, columns=encoder.get_feature_names_out(['nutriscore_grade']))

df = pd.concat([df, ohe_df], axis=1)

**Word Embeddings (BERT, Word2Vec, FastText) Best for Deep Learning Models (LSTMs, CNNs, Transformers, MLP)**

In [7]:
X = df['text']

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

#BERT (Bidirectional Encoder Representations from Transformers)
model = AutoModel.from_pretrained("bert-base-uncased")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def get_embeddings(text):
    tokens = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    with torch.no_grad():
        output = model(**tokens.to(device))
    return output.last_hidden_state.mean(dim=1).squeeze()

X_embeddings = []

for text in X:
    if not isinstance(text, str):
        text = str(text)
    embeddings = get_embeddings(text)
    X_embeddings.append(embeddings)

X_embeddings = torch.stack(X_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
y = df['label_encoded']

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([69252, 768]), torch.Size([17313, 768]), (69252,), (17313,))

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# XGBoost (BERT Embading + Label Encoder)

In [12]:
xgb_model = XGBClassifier(tree_method='hist', device='cuda')
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_preds)

In [13]:
print("XGBoost Metrics:")
print(classification_report(y_test, xgb_preds))

XGBoost Metrics:
              precision    recall  f1-score   support

           0       0.61      0.31      0.41       376
           1       0.73      0.70      0.71      3574
           2       0.62      0.64      0.63      4853
           3       0.64      0.45      0.53      2676
           4       0.74      0.86      0.80      5834

    accuracy                           0.69     17313
   macro avg       0.67      0.59      0.62     17313
weighted avg       0.68      0.69      0.68     17313



In [16]:
if isinstance(X_train, torch.Tensor):
    X_train = X_train.cpu().numpy()
if isinstance(y_train, torch.Tensor):
    y_train = y_train.cpu().numpy()

lr_model = LogisticRegression(max_iter=1000,
                              solver='lbfgs',
                              multi_class='multinomial')
lr_model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [20]:
lr_preds = lr_model.predict(X_test)
print("Logistic Regression Metrics:")
print(classification_report(y_test, lr_preds))

Logistic Regression Metrics:
              precision    recall  f1-score   support

           0       0.74      0.09      0.17       376
           1       0.68      0.69      0.69      3574
           2       0.57      0.59      0.58      4853
           3       0.56      0.35      0.43      2676
           4       0.73      0.86      0.79      5834

    accuracy                           0.65     17313
   macro avg       0.66      0.52      0.53     17313
weighted avg       0.65      0.65      0.64     17313



In [21]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

svm_model = CalibratedClassifierCV(LinearSVC())
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
print("SVM Metrics:")
print(classification_report(y_test, svm_preds))

SVM Metrics:
              precision    recall  f1-score   support

           0       0.63      0.25      0.36       376
           1       0.70      0.73      0.72      3574
           2       0.61      0.59      0.60      4853
           3       0.57      0.39      0.46      2676
           4       0.74      0.87      0.80      5834

    accuracy                           0.68     17313
   macro avg       0.65      0.57      0.59     17313
weighted avg       0.67      0.68      0.67     17313



In [22]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

dt_preds = dt_model.predict(X_test)

print("Decision Tree Model Performance:")
print(classification_report(y_test, dt_preds))

Decision Tree Model Performance:
              precision    recall  f1-score   support

           0       0.32      0.29      0.30       376
           1       0.60      0.63      0.62      3574
           2       0.55      0.54      0.54      4853
           3       0.45      0.46      0.45      2676
           4       0.68      0.66      0.67      5834

    accuracy                           0.58     17313
   macro avg       0.52      0.52      0.52     17313
weighted avg       0.58      0.58      0.58     17313



In [23]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

print("Random Forest Model Performance:")
print(classification_report(y_test, rf_preds))

Random Forest Model Performance:
              precision    recall  f1-score   support

           0       0.66      0.31      0.42       376
           1       0.76      0.63      0.69      3574
           2       0.58      0.66      0.62      4853
           3       0.73      0.37      0.49      2676
           4       0.69      0.87      0.77      5834

    accuracy                           0.67     17313
   macro avg       0.68      0.57      0.60     17313
weighted avg       0.68      0.67      0.66     17313

