In [1]:
# 73 accuracy%
import numpy as np # linear algebra
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Loading and Preprocessing Data

In [2]:
recipe = pd.read_csv('train2.csv')

In [3]:
recipe.head()

Unnamed: 0.1,Unnamed: 0,id,cuisine,ingredients
0,0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom..."
1,1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']"
4,4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay..."


In [4]:
column_to_delete = 'Unnamed: 0'

if column_to_delete in recipe.columns:
    # Delete the specified column
    recipe = recipe.drop(column_to_delete, axis=1)

In [5]:
recipe.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom..."
1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']"
4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay..."


In [6]:
print('Shape:',recipe.shape) 
print('Columns:',recipe.columns)

print('Whether Null exists:\n',recipe.isnull().sum())

Shape: (39774, 3)
Columns: Index(['id', 'cuisine', 'ingredients'], dtype='object')
Whether Null exists:
 id             0
cuisine        0
ingredients    0
dtype: int64


In [7]:
recipe['cuisine'].nunique()

20

In [8]:
recipe['cuisine'].unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

In [9]:
recipe['ingredients'][0]

"['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']"

In [10]:
recipe['ingredients'][6]

"['olive oil', 'salt', 'medium shrimp', 'pepper', 'garlic', 'chopped cilantro', 'jalapeno chilies', 'flat leaf parsley', 'skirt steak', 'white vinegar', 'sea salt', 'bay leaf', 'chorizo sausage']"

In [11]:
recipe['cuisine'].value_counts()

cuisine
italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: count, dtype: int64

In [12]:
# bar plot for count of entries for each cuisine
x = recipe['cuisine'].value_counts().index
y = recipe['cuisine'].value_counts().values

df = pd.DataFrame({
    'Cuisine':x,
    'These many entries':y
})
#fig = sns.countplot(recipe['cuisine'])
fig = px.bar(df,
             x='Cuisine',
             y='These many entries',
             color='Cuisine')
fig.show()

# Look at all ingredients

In [14]:
all_ingredients = []  # list to store all ingredients

for indiv_ingredient_list_str in recipe['ingredients'].values:
    # Convert the string representation of a list to an actual list
    indiv_ingredient_list = eval(indiv_ingredient_list_str)
    
    for ingredient in indiv_ingredient_list:
        all_ingredients.append(ingredient)

#print(all_ingredients[:30])


In [15]:
# Convert that list in a Pandas DataFrame so that we can apply value_counts
ingredients_together = pd.DataFrame(all_ingredients)

In [16]:
ingredients_together

Unnamed: 0,0
0,romaine lettuce
1,black olives
2,grape tomatoes
3,garlic
4,pepper
...,...
428270,garlic
428271,white sugar
428272,roma tomatoes
428273,celery


In [17]:
ingredients_together.value_counts()[0:30] # for first 30

salt                      18049
olive oil                  7972
onions                     7972
water                      7457
garlic                     7380
sugar                      6434
garlic cloves              6237
butter                     4848
ground black pepper        4785
all-purpose flour          4632
pepper                     4438
vegetable oil              4385
eggs                       3388
soy sauce                  3296
kosher salt                3113
green onions               3078
tomatoes                   3058
large eggs                 2948
carrots                    2814
unsalted butter            2782
extra-virgin olive oil     2747
ground cumin               2747
black pepper               2627
milk                       2263
chili powder               2036
oil                        1970
red bell pepper            1939
purple onion               1896
scallions                  1891
grated parmesan cheese     1886
Name: count, dtype: int64

In [18]:
# bar plot for count of entries for each cuisine
x = ingredients_together.value_counts()[0:30].index.tolist()
y = ingredients_together.value_counts()[0:30].values

df = pd.DataFrame({
    'Ingredient':x,
    'These many entries':y
})
#fig = sns.countplot(recipe['cuisine'])
fig = px.pie(df,
             names='Ingredient',
             values='These many entries',
             color='Ingredient')
fig.show()

In [19]:
recipe['cuisine'].value_counts()

cuisine
italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: count, dtype: int64

In [20]:
recipe['cuisine']

0              greek
1        southern_us
2           filipino
3             indian
4             indian
            ...     
39769          irish
39770        italian
39771          irish
39772        chinese
39773        mexican
Name: cuisine, Length: 39774, dtype: object

## Replacing cuisine names with numbers

In [21]:
recipe['cuisine'] = recipe['cuisine'].str.strip().map({'italian':0,
                       'mexican':1,
                       'southern_us':2,
                       'indian':3,
                       'chinese':4,
                       'french':5,
                       'cajun_creole':6,
                       'thai':7,
                       'japanese':8,
                       'greek':9,
                       'spanish':10,
                       'korean':11,
                       'vietnamese':12,
                       'moroccan':13,
                       'british':14,
                       'filipino':15,
                       'irish':16,
                       'jamaican':17,
                       'russian':18,
                       'brazilian':19
})

In [22]:
recipe['cuisine']

0         9
1         2
2        15
3         3
4         3
         ..
39769    16
39770     0
39771    16
39772     4
39773     1
Name: cuisine, Length: 39774, dtype: int64

In [23]:
X = recipe.iloc[:,-1]
y = recipe['cuisine']
X

0        ['romaine lettuce', 'black olives', 'grape tom...
1        ['plain flour', 'ground pepper', 'salt', 'toma...
2        ['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...
3              ['water', 'vegetable oil', 'wheat', 'salt']
4        ['black pepper', 'shallots', 'cornflour', 'cay...
                               ...                        
39769    ['light brown sugar', 'granulated sugar', 'but...
39770    ['KRAFT Zesty Italian Dressing', 'purple onion...
39771    ['eggs', 'citrus fruit', 'raisins', 'sourdough...
39772    ['boneless chicken skinless thigh', 'minced ga...
39773    ['green chile', 'jalapeno chilies', 'onions', ...
Name: ingredients, Length: 39774, dtype: object

In [24]:
import ast

# Assuming X is the "ingredients" column of your DataFrame
X_strings = recipe['ingredients']

In [25]:
X_lists = X_strings.apply(ast.literal_eval)

In [26]:
X_lists

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
                               ...                        
39769    [light brown sugar, granulated sugar, butter, ...
39770    [KRAFT Zesty Italian Dressing, purple onion, b...
39771    [eggs, citrus fruit, raisins, sourdough starte...
39772    [boneless chicken skinless thigh, minced garli...
39773    [green chile, jalapeno chilies, onions, ground...
Name: ingredients, Length: 39774, dtype: object

In [27]:
X = X_lists

## Test Train split

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [31]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [32]:
X_train

Unnamed: 0,ingredients
31255,"[yellow onion, hot water, ground turmeric, fre..."
32399,"[lime juice, zucchini, chili powder, salt, dri..."
34614,"[fresh ginger, cauliflower florets, lemon juic..."
22301,"[fish sauce, pepper, garlic, calamansi, broth,..."
4688,"[water, salt, celery, onions, chicken stock, v..."
...,...
7813,"[orange juice concentrate, garlic cloves, fres..."
32511,"[grated parmesan cheese, fresh basil leaves, e..."
5192,"[basil leaves, olive oil, extra-virgin olive o..."
12172,"[powdered sugar, large egg yolks, salt, heavy ..."


## Join the data t one string

In [33]:
X_train['ingredients'] = X_train['ingredients'].apply(lambda x:  ' '.join(x))
X_test['ingredients'] = X_test['ingredients'].apply(lambda x:  ' '.join(x))

In [34]:
X_train

Unnamed: 0,ingredients
31255,yellow onion hot water ground turmeric fresh g...
32399,lime juice zucchini chili powder salt dried or...
34614,fresh ginger cauliflower florets lemon juice f...
22301,fish sauce pepper garlic calamansi broth gluti...
4688,water salt celery onions chicken stock vegetab...
...,...
7813,orange juice concentrate garlic cloves fresh c...
32511,grated parmesan cheese fresh basil leaves eggp...
5192,basil leaves olive oil extra-virgin olive oil ...
12172,powdered sugar large egg yolks salt heavy whip...


In [35]:
y_train

31255     3
32399     1
34614     3
22301    15
4688      2
         ..
7813      5
32511     0
5192      0
12172    18
33003     4
Name: cuisine, Length: 31819, dtype: int64

In [36]:
y_test

25674     3
3666      2
27481     0
36750     4
31926     1
         ..
23536     0
30857     6
36586     9
17770     8
36850    14
Name: cuisine, Length: 7955, dtype: int64

In [37]:
X_test

Unnamed: 0,ingredients
25674,water yoghurt peanut oil ground cumin ground c...
3666,lemon zest whipping cream yellow corn meal bak...
27481,salad dressing chuck roast garlic ground black...
36750,sesame seeds worcestershire sauce cucumber pep...
31926,bay leaves cayenne pepper ground cloves vegeta...
...,...
23536,shredded cheddar cheese ricotta cheese eggs Al...
30857,cajun seasoning onions water hot Italian sausa...
36586,eggplant large garlic cloves veal shanks large...
17770,ground pepper cilantro scallions seedless cucu...


# Creating LSTM Model

In [38]:
#RNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['ingredients'])

# Convert text to sequences and pad sequences to a fixed length
X_train_seq = tokenizer.texts_to_sequences(X_train['ingredients'])
max_sequence_length = 50
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post', truncating='post')






# Training LSTM Model

In [53]:
# Create a simple RNN model
embedding_dim = 50  
num_classes = 20
num_epochs = 30
batch_size = 32   #32
#y_train = y_train - 1
#y_test = y_test - 1
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=50))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train, epochs=num_epochs, batch_size=batch_size, validation_split=0.2)


X_test_seq = tokenizer.texts_to_sequences(X_test['ingredients'])
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post', truncating='post')

# Make predictions on the test set
y_pred_proba = model.predict(X_test_pad)
y_pred = y_pred_proba.argmax(axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy: 0.7267127592708988
Confusion Matrix:
[[1335   20   54    4    5   85   21    0    6   32   13    0    0    5
    10    0    1    0   12    4]
 [  31 1161   31   15    8    8    7    1    2    7   17    0    0    4
     7    5    2    6    2    8]
 [  24   27  610    9   13   24   48    1    6    4    3    0    0    3
    21    6   17    8    6    6]
 [   4   10    9  516    3    2    0    7    5    7    1    0    0   21
     1    2    2    5    3    3]
 [   7    1    4    0  438    6    1   26   21    0    1    9    7    0
     1   13    0    0    1    0]
 [  93    6   54    1    6  254   10    0    4    7    6    0    0    6
    1

## Accuracy 73 %

In [54]:
#predictions for test data file
test_data = pd.read_csv('test2.csv')

column_to_delete = 'Unnamed: 0'

if column_to_delete in test_data.columns:
    # Delete the specified column
    test_data = test_data.drop(column_to_delete, axis=1)



test_X = test_data['ingredients']

test_seq = tokenizer.texts_to_sequences(test_X)
test_pad = pad_sequences(test_seq, maxlen=max_sequence_length, padding='post', truncating='post')


# Checking on Test Data

In [55]:
test_data

Unnamed: 0,id,ingredients
0,18009,"['baking powder', 'eggs', 'all-purpose flour',..."
1,28583,"['sugar', 'egg yolks', 'corn starch', 'cream o..."
2,41580,"['sausage links', 'fennel bulb', 'fronds', 'ol..."
3,29752,"['meat cuts', 'file powder', 'smoked sausage',..."
4,35687,"['ground black pepper', 'salt', 'sausage casin..."
...,...,...
9939,30246,"['large egg yolks', 'fresh lemon juice', 'suga..."
9940,36028,"['hot sauce', 'butter', 'sweet potatoes', 'ado..."
9941,22339,"['black pepper', 'salt', 'parmigiano reggiano ..."
9942,42525,"['cheddar cheese', 'cayenne', 'paprika', 'plum..."


In [56]:
y_pred_proba = model.predict(test_pad)
y_pred = y_pred_proba.argmax(axis=1)




In [57]:
# Create a DataFrame with the results
results_df = pd.DataFrame({'id': test_data['id'], 'predicted_class': y_pred})

# Save the results to a CSV file
results_df.to_csv('predictions.csv', index=False)


In [58]:
results = pd.read_csv('predictions.csv')


Unnamed: 0,id,predicted_class
0,18009,0
1,28583,0
2,41580,2
3,29752,0
4,35687,0
...,...,...
9939,30246,5
9940,36028,2
9941,22339,0
9942,42525,2


In [44]:
results.head(50)

Unnamed: 0,id,predicted_class
0,18009,0
1,28583,0
2,41580,0
3,29752,0
4,35687,0
5,38527,0
6,19666,0
7,41217,0
8,28753,1
9,22659,14
