In [18]:
# Install dependencies as needed:

import pandas as pd
import numpy as np
import kagglehub
import os
import shutil
import torch
import gdown

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [20]:
# https://drive.google.com/file/d/1zBjApk015tgVWISIRxQdgIDV4kuxSeeC/view?usp=drive_link
filepath = './data/ingredient_to_culture.json'
gid = '1zBjApk015tgVWISIRxQdgIDV4kuxSeeC'

if not os.path.exists(filepath):
    gdown.download(f'https://drive.google.com/uc?id={gid}', filepath, quiet=False)


cuisine_df = pd.read_json('./data/ingredient_to_culture.json')
cuisine_df = cuisine_df.dropna()
cuisine_df = cuisine_df.drop(columns=['id'])

#turn the ingredients into a string
cuisine_df['ingredients'] = cuisine_df['ingredients'].apply(' '.join)

print(cuisine_df.shape)
print(cuisine_df['cuisine'].value_counts())
cuisine_df.head()

(39774, 2)
cuisine
italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: count, dtype: int64


Unnamed: 0,cuisine,ingredients
0,greek,romaine lettuce black olives grape tomatoes ga...
1,southern_us,plain flour ground pepper salt tomatoes ground...
2,filipino,eggs pepper salt mayonaise cooking oil green c...
3,indian,water vegetable oil wheat salt
4,indian,black pepper shallots cornflour cayenne pepper...


In [21]:
X = cuisine_df['ingredients']
y = cuisine_df['cuisine']

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)

X = vectorizer.fit_transform(X)

print(X[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (1, 1000)>
  Coords	Values
  (0, 742)	0.3426671291173114
  (0, 491)	0.2663871237012894
  (0, 72)	0.1398962004921347
  (0, 584)	0.26100924108701357
  (0, 385)	0.35031170238526027
  (0, 920)	0.15183517837377775
  (0, 363)	0.10531073154596084
  (0, 633)	0.10208411357610164
  (0, 694)	0.23913220198081458
  (0, 586)	0.1645493089953018
  (0, 782)	0.23007896012035983
  (0, 361)	0.38853112215987895
  (0, 57)	0.20748802168948122
  (0, 315)	0.3040361765035925
  (0, 151)	0.14568369866765699
  (0, 238)	0.3343204746101372


In [23]:
# Standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)

X = scaler.fit_transform(X)

print(X[0])


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (1, 1000)>
  Coords	Values
  (0, 742)	13.143820106532235
  (0, 491)	6.628604662909429
  (0, 72)	2.175009523740244
  (0, 584)	5.890218529194871
  (0, 385)	12.649892384273961
  (0, 920)	2.3434229248419163
  (0, 363)	1.8707886422650988
  (0, 633)	1.3266520049232076
  (0, 694)	4.9165686882179065
  (0, 586)	2.7337198734698163
  (0, 782)	4.229989753485227
  (0, 361)	18.133277737627218
  (0, 57)	3.4803446379414287
  (0, 315)	8.169227793095388
  (0, 151)	1.7475559340561924
  (0, 238)	11.055045759535092


In [24]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import torch

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(torch.tensor(X_train.toarray(), dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test.toarray(), dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

print(len(train_dataset))
print(len(test_dataset))


train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)



31819
7955


In [None]:
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

#include regularization


model = nn.Sequential(
    nn.Linear(1000, 512),
    nn.ReLU(),
    nn.Linear(512, 1024),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Linear(512, 20)
)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

#train model
for epoch in range(25):
    model.train()
    model.to(device)
    for X_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1} Loss: {loss.item()}')



100%|██████████| 32/32 [00:00<00:00, 83.39it/s]


Epoch 1 Loss: 2.0754776000976562


100%|██████████| 32/32 [00:00<00:00, 99.32it/s]


Epoch 2 Loss: 1.1514091491699219


100%|██████████| 32/32 [00:00<00:00, 100.48it/s]


Epoch 3 Loss: 1.0194541215896606


100%|██████████| 32/32 [00:00<00:00, 98.56it/s]


Epoch 4 Loss: 0.7559131979942322


100%|██████████| 32/32 [00:00<00:00, 62.60it/s]


Epoch 5 Loss: 0.9639334082603455


100%|██████████| 32/32 [00:00<00:00, 82.41it/s]


Epoch 6 Loss: 0.3957095146179199


100%|██████████| 32/32 [00:00<00:00, 95.46it/s]


Epoch 7 Loss: 0.6777928471565247


100%|██████████| 32/32 [00:00<00:00, 101.86it/s]


Epoch 8 Loss: 0.19024671614170074


100%|██████████| 32/32 [00:00<00:00, 63.92it/s] 


Epoch 9 Loss: 0.26361727714538574


100%|██████████| 32/32 [00:00<00:00, 96.36it/s]


Epoch 10 Loss: 0.15790903568267822


100%|██████████| 32/32 [00:00<00:00, 103.40it/s]


Epoch 11 Loss: 0.11428079009056091


100%|██████████| 32/32 [00:00<00:00, 106.36it/s]


Epoch 12 Loss: 0.1857510656118393


100%|██████████| 32/32 [00:00<00:00, 65.73it/s]


Epoch 13 Loss: 0.20608703792095184


100%|██████████| 32/32 [00:00<00:00, 103.10it/s]


Epoch 14 Loss: 0.030621934682130814


100%|██████████| 32/32 [00:00<00:00, 103.75it/s]


Epoch 15 Loss: 0.010946160182356834


100%|██████████| 32/32 [00:00<00:00, 101.33it/s]


Epoch 16 Loss: 0.11400569975376129


100%|██████████| 32/32 [00:00<00:00, 65.57it/s]


Epoch 17 Loss: 0.04336802288889885


100%|██████████| 32/32 [00:00<00:00, 96.29it/s] 


Epoch 18 Loss: 0.012048439122736454


100%|██████████| 32/32 [00:00<00:00, 102.72it/s]


Epoch 19 Loss: 0.047763075679540634


100%|██████████| 32/32 [00:00<00:00, 94.87it/s]


Epoch 20 Loss: 0.010094141587615013


100%|██████████| 32/32 [00:00<00:00, 62.03it/s]


Epoch 21 Loss: 0.019161511212587357


100%|██████████| 32/32 [00:00<00:00, 100.12it/s]


Epoch 22 Loss: 0.03860984370112419


100%|██████████| 32/32 [00:00<00:00, 98.58it/s]


Epoch 23 Loss: 0.007565674837678671


100%|██████████| 32/32 [00:00<00:00, 96.89it/s]


Epoch 24 Loss: 0.02070079930126667


100%|██████████| 32/32 [00:00<00:00, 65.58it/s]


Epoch 25 Loss: 0.04178166016936302


100%|██████████| 32/32 [00:00<00:00, 103.29it/s]


Epoch 26 Loss: 0.10102719813585281


100%|██████████| 32/32 [00:00<00:00, 104.34it/s]


Epoch 27 Loss: 0.024828489869832993


100%|██████████| 32/32 [00:00<00:00, 104.53it/s]


Epoch 28 Loss: 0.020776592195034027


100%|██████████| 32/32 [00:00<00:00, 68.19it/s] 


Epoch 29 Loss: 0.013687736354768276


100%|██████████| 32/32 [00:00<00:00, 93.76it/s] 


Epoch 30 Loss: 0.006315405946224928


100%|██████████| 32/32 [00:00<00:00, 100.79it/s]


Epoch 31 Loss: 0.007152023259550333


100%|██████████| 32/32 [00:00<00:00, 98.97it/s] 


Epoch 32 Loss: 0.009094387292861938


100%|██████████| 32/32 [00:00<00:00, 61.24it/s]


Epoch 33 Loss: 0.007621609140187502


100%|██████████| 32/32 [00:00<00:00, 104.07it/s]


Epoch 34 Loss: 0.010758527554571629


100%|██████████| 32/32 [00:00<00:00, 96.08it/s]


Epoch 35 Loss: 0.0016682130517438054


100%|██████████| 32/32 [00:00<00:00, 99.97it/s]


Epoch 36 Loss: 0.020096123218536377


100%|██████████| 32/32 [00:00<00:00, 60.39it/s]


Epoch 37 Loss: 0.006736814044415951


100%|██████████| 32/32 [00:00<00:00, 97.18it/s]


Epoch 38 Loss: 0.002129302127286792


100%|██████████| 32/32 [00:00<00:00, 95.90it/s]


Epoch 39 Loss: 0.010025973431766033


100%|██████████| 32/32 [00:00<00:00, 94.73it/s]


Epoch 40 Loss: 0.04435793310403824


100%|██████████| 32/32 [00:00<00:00, 61.93it/s]


Epoch 41 Loss: 0.0675201416015625


100%|██████████| 32/32 [00:00<00:00, 103.04it/s]


Epoch 42 Loss: 0.012028060853481293


100%|██████████| 32/32 [00:00<00:00, 105.92it/s]


Epoch 43 Loss: 0.02685832418501377


100%|██████████| 32/32 [00:00<00:00, 103.70it/s]


Epoch 44 Loss: 0.031929899007081985


100%|██████████| 32/32 [00:00<00:00, 68.12it/s] 


Epoch 45 Loss: 0.0028202743269503117


100%|██████████| 32/32 [00:00<00:00, 101.40it/s]


Epoch 46 Loss: 0.007781645283102989


100%|██████████| 32/32 [00:00<00:00, 94.07it/s]


Epoch 47 Loss: 0.023562656715512276


100%|██████████| 32/32 [00:00<00:00, 99.26it/s] 


Epoch 48 Loss: 0.0775643140077591


100%|██████████| 32/32 [00:00<00:00, 64.83it/s] 


Epoch 49 Loss: 0.0070254080928862095


100%|██████████| 32/32 [00:00<00:00, 98.60it/s]


Epoch 50 Loss: 0.005762496031820774


100%|██████████| 32/32 [00:00<00:00, 99.51it/s] 


Epoch 51 Loss: 0.018719268962740898


100%|██████████| 32/32 [00:00<00:00, 100.32it/s]


Epoch 52 Loss: 0.004628264345228672


100%|██████████| 32/32 [00:00<00:00, 65.26it/s] 


Epoch 53 Loss: 0.004492870066314936


100%|██████████| 32/32 [00:00<00:00, 93.42it/s]


Epoch 54 Loss: 0.0006671471637673676


100%|██████████| 32/32 [00:00<00:00, 101.69it/s]


Epoch 55 Loss: 0.004851638339459896


100%|██████████| 32/32 [00:00<00:00, 104.96it/s]


Epoch 56 Loss: 0.0029773120768368244


100%|██████████| 32/32 [00:00<00:00, 103.28it/s]


Epoch 57 Loss: 0.002114004921168089


100%|██████████| 32/32 [00:00<00:00, 61.19it/s]


Epoch 58 Loss: 0.0822737067937851


100%|██████████| 32/32 [00:00<00:00, 97.20it/s]


Epoch 59 Loss: 0.0010275898966938257


100%|██████████| 32/32 [00:00<00:00, 95.56it/s]


Epoch 60 Loss: 0.01648911088705063


100%|██████████| 32/32 [00:00<00:00, 89.43it/s]


Epoch 61 Loss: 0.025325508788228035


100%|██████████| 32/32 [00:00<00:00, 63.86it/s]


Epoch 62 Loss: 0.030409330502152443


100%|██████████| 32/32 [00:00<00:00, 103.62it/s]


Epoch 63 Loss: 0.0024369603488594294


100%|██████████| 32/32 [00:00<00:00, 103.20it/s]


Epoch 64 Loss: 0.017598485574126244


100%|██████████| 32/32 [00:00<00:00, 97.51it/s]


Epoch 65 Loss: 0.01801016740500927


100%|██████████| 32/32 [00:00<00:00, 66.31it/s]


Epoch 66 Loss: 0.011930195614695549


100%|██████████| 32/32 [00:00<00:00, 83.68it/s]


Epoch 67 Loss: 0.013286560773849487


100%|██████████| 32/32 [00:00<00:00, 104.45it/s]


Epoch 68 Loss: 0.053672581911087036


100%|██████████| 32/32 [00:00<00:00, 94.71it/s]


Epoch 69 Loss: 0.013333640061318874


100%|██████████| 32/32 [00:00<00:00, 68.01it/s]


Epoch 70 Loss: 0.021861137822270393


100%|██████████| 32/32 [00:00<00:00, 104.58it/s]


Epoch 71 Loss: 0.019312582910060883


100%|██████████| 32/32 [00:00<00:00, 103.31it/s]


Epoch 72 Loss: 0.03244667872786522


100%|██████████| 32/32 [00:00<00:00, 104.90it/s]


Epoch 73 Loss: 0.08984451740980148


100%|██████████| 32/32 [00:00<00:00, 67.44it/s] 


Epoch 74 Loss: 0.012748894281685352


100%|██████████| 32/32 [00:00<00:00, 105.42it/s]


Epoch 75 Loss: 0.008834858424961567


100%|██████████| 32/32 [00:00<00:00, 98.14it/s] 


Epoch 76 Loss: 0.11287613958120346


100%|██████████| 32/32 [00:00<00:00, 105.24it/s]


Epoch 77 Loss: 0.02215055748820305


100%|██████████| 32/32 [00:00<00:00, 67.93it/s]


Epoch 78 Loss: 0.0015588041860610247


100%|██████████| 32/32 [00:00<00:00, 102.70it/s]


Epoch 79 Loss: 0.0036832201294600964


100%|██████████| 32/32 [00:00<00:00, 104.11it/s]


Epoch 80 Loss: 0.0024518489371985197


100%|██████████| 32/32 [00:00<00:00, 103.22it/s]


Epoch 81 Loss: 0.04488261789083481


100%|██████████| 32/32 [00:00<00:00, 67.07it/s]


Epoch 82 Loss: 0.007720374967902899


100%|██████████| 32/32 [00:00<00:00, 88.54it/s]


Epoch 83 Loss: 0.010923025198280811


100%|██████████| 32/32 [00:00<00:00, 94.57it/s]


Epoch 84 Loss: 0.0010111709125339985


100%|██████████| 32/32 [00:00<00:00, 91.92it/s]


Epoch 85 Loss: 0.0003112891281489283


100%|██████████| 32/32 [00:00<00:00, 64.81it/s]


Epoch 86 Loss: 0.0021864112932235003


100%|██████████| 32/32 [00:00<00:00, 104.49it/s]


Epoch 87 Loss: 0.02480100467801094


100%|██████████| 32/32 [00:00<00:00, 104.11it/s]


Epoch 88 Loss: 0.0015425586607307196


100%|██████████| 32/32 [00:00<00:00, 103.75it/s]


Epoch 89 Loss: 0.002315273741260171


100%|██████████| 32/32 [00:00<00:00, 68.16it/s] 


Epoch 90 Loss: 0.016334550455212593


100%|██████████| 32/32 [00:00<00:00, 103.25it/s]


Epoch 91 Loss: 0.0024306243285536766


100%|██████████| 32/32 [00:00<00:00, 103.72it/s]


Epoch 92 Loss: 0.0022508837282657623


100%|██████████| 32/32 [00:00<00:00, 104.66it/s]


Epoch 93 Loss: 0.0012695983750745654


100%|██████████| 32/32 [00:00<00:00, 64.88it/s]


Epoch 94 Loss: 0.0037705660797655582


100%|██████████| 32/32 [00:00<00:00, 91.22it/s]


Epoch 95 Loss: 0.009041406214237213


100%|██████████| 32/32 [00:00<00:00, 102.54it/s]


Epoch 96 Loss: 0.011244933120906353


100%|██████████| 32/32 [00:00<00:00, 100.14it/s]


Epoch 97 Loss: 0.03616483509540558


100%|██████████| 32/32 [00:00<00:00, 64.13it/s]


Epoch 98 Loss: 0.0024233332369476557


100%|██████████| 32/32 [00:00<00:00, 95.44it/s]


Epoch 99 Loss: 0.002994848182424903


100%|██████████| 32/32 [00:00<00:00, 102.00it/s]

Epoch 100 Loss: 0.007744675502181053





In [29]:
#evaluate on training data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in tqdm(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        _, predicted = torch.max(y_pred, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f'Train Accuracy: {correct / total}')
        

#evaluate model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in tqdm(test_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        _, predicted = torch.max(y_pred, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f'Test Accuracy: {correct / total}')






100%|██████████| 32/32 [00:00<00:00, 72.42it/s]


Train Accuracy: 0.9983343285458374


100%|██████████| 8/8 [00:00<00:00, 112.17it/s]

Test Accuracy: 0.7497171590194845



