In [None]:
# Install dependencies as needed:

import pandas as pd
import numpy as np
import kagglehub
import os
import shutil
import torch
import gdown

In [98]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# https://drive.google.com/file/d/1zBjApk015tgVWISIRxQdgIDV4kuxSeeC/view?usp=drive_link
filepath = './data/ingredient_to_culture.json'
gid = '1zBjApk015tgVWISIRxQdgIDV4kuxSeeC'

if not os.path.exists(filepath):
    gdown.download(f'https://drive.google.com/uc?id={gid}', filepath, quiet=False)


cuisine_df = pd.read_json('./data/ingredient_to_culture.json')
cuisine_df = cuisine_df.dropna()
cuisine_df = cuisine_df.drop(columns=['id'])

#turn the ingredients into a string
cuisine_df['ingredients'] = cuisine_df['ingredients'].apply(' '.join)

print(cuisine_df.shape)
print(cuisine_df['cuisine'].value_counts())
cuisine_df.head()

(39774, 2)
cuisine
italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: count, dtype: int64


Unnamed: 0,cuisine,ingredients
0,greek,romaine lettuce black olives grape tomatoes ga...
1,southern_us,plain flour ground pepper salt tomatoes ground...
2,filipino,eggs pepper salt mayonaise cooking oil green c...
3,indian,water vegetable oil wheat salt
4,indian,black pepper shallots cornflour cayenne pepper...


In [100]:
X = cuisine_df['ingredients']
y = cuisine_df['cuisine']

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)

X = vectorizer.fit_transform(X)

print(X[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (1, 1000)>
  Coords	Values
  (0, 742)	0.3426671291173114
  (0, 491)	0.2663871237012894
  (0, 72)	0.1398962004921347
  (0, 584)	0.26100924108701357
  (0, 385)	0.35031170238526027
  (0, 920)	0.15183517837377775
  (0, 363)	0.10531073154596084
  (0, 633)	0.10208411357610164
  (0, 694)	0.23913220198081458
  (0, 586)	0.1645493089953018
  (0, 782)	0.23007896012035983
  (0, 361)	0.38853112215987895
  (0, 57)	0.20748802168948122
  (0, 315)	0.3040361765035925
  (0, 151)	0.14568369866765699
  (0, 238)	0.3343204746101372


In [102]:
# Standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)

X = scaler.fit_transform(X)

print(X[0])


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (1, 1000)>
  Coords	Values
  (0, 742)	13.143820106532235
  (0, 491)	6.628604662909429
  (0, 72)	2.175009523740244
  (0, 584)	5.890218529194871
  (0, 385)	12.649892384273961
  (0, 920)	2.3434229248419163
  (0, 363)	1.8707886422650988
  (0, 633)	1.3266520049232076
  (0, 694)	4.9165686882179065
  (0, 586)	2.7337198734698163
  (0, 782)	4.229989753485227
  (0, 361)	18.133277737627218
  (0, 57)	3.4803446379414287
  (0, 315)	8.169227793095388
  (0, 151)	1.7475559340561924
  (0, 238)	11.055045759535092


In [103]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import torch

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(torch.tensor(X_train.toarray(), dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test.toarray(), dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

print(len(train_dataset))
print(len(test_dataset))


train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)



31819
7955


In [104]:
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

#include regularization


model = nn.Sequential(
    nn.Linear(1000, 512),
    nn.ReLU(),
    nn.Linear(512, 1024),
    nn.ReLU(),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Linear(512, 20)
)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

#train model
for epoch in range(100):
    model.train()
    for X_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1} Loss: {loss.item()}')



100%|██████████| 125/125 [00:02<00:00, 55.89it/s]


Epoch 1 Loss: 0.9561951756477356


100%|██████████| 125/125 [00:02<00:00, 52.34it/s]


Epoch 2 Loss: 0.9994404911994934


100%|██████████| 125/125 [00:02<00:00, 56.03it/s]


Epoch 3 Loss: 0.5533958077430725


  6%|▌         | 7/125 [00:00<00:09, 12.06it/s]


KeyboardInterrupt: 

In [None]:
#evaluate on training data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in tqdm(train_loader):
        y_pred = model(X_batch)
        _, predicted = torch.max(y_pred, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f'Train Accuracy: {correct / total}')
        

#evaluate model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in tqdm(test_loader):
        y_pred = model(X_batch)
        _, predicted = torch.max(y_pred, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f'Test Accuracy: {correct / total}')






100%|██████████| 125/125 [00:01<00:00, 122.29it/s]


Train Accuracy: 0.974323517395267


100%|██████████| 32/32 [00:00<00:00, 104.90it/s]

Test Accuracy: 0.7461973601508485



