# **PREPROCESSING STEPS**

In [1]:
# Importing the required libraries

import pandas as pd
import numpy as np
import string
import nltk
import re

In [2]:
# Importing the dataset

data = pd.read_excel('/content/drive/MyDrive/MY PROJECTS/Indian Food Diet Chart chatbots/6000+ Indian Food Recipes Dataset/IndianFoodDatasetXLS.xlsx')

In [3]:
data = data.drop(columns = {'Srno', 'Ingredients', 'RecipeName'})

In [4]:
data.info()   # Basic information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   TranslatedRecipeName    6871 non-null   object
 1   TranslatedIngredients   6865 non-null   object
 2   PrepTimeInMins          6871 non-null   int64 
 3   CookTimeInMins          6871 non-null   int64 
 4   TotalTimeInMins         6871 non-null   int64 
 5   Servings                6871 non-null   int64 
 6   Cuisine                 6871 non-null   object
 7   Course                  6871 non-null   object
 8   Diet                    6871 non-null   object
 9   TranslatedInstructions  6871 non-null   object
 10  URL                     6871 non-null   object
dtypes: int64(4), object(7)
memory usage: 590.6+ KB


In [5]:
data.describe()     # Basic statistics

Unnamed: 0,PrepTimeInMins,CookTimeInMins,TotalTimeInMins,Servings
count,6871.0,6871.0,6871.0,6871.0
mean,28.585213,30.832339,59.417552,5.61141
std,81.042007,34.019694,88.69994,26.221807
min,0.0,0.0,0.0,1.0
25%,10.0,20.0,30.0,4.0
50%,15.0,30.0,40.0,4.0
75%,20.0,35.0,55.0,4.0
max,2880.0,900.0,2925.0,1000.0


In [6]:
data.dtypes

TranslatedRecipeName      object
TranslatedIngredients     object
PrepTimeInMins             int64
CookTimeInMins             int64
TotalTimeInMins            int64
Servings                   int64
Cuisine                   object
Course                    object
Diet                      object
TranslatedInstructions    object
URL                       object
dtype: object

In [7]:
data.shape

(6871, 11)

In [8]:
# Checking whether null values present or not

data.isnull().sum()                # Null values present

TranslatedRecipeName      0
TranslatedIngredients     6
PrepTimeInMins            0
CookTimeInMins            0
TotalTimeInMins           0
Servings                  0
Cuisine                   0
Course                    0
Diet                      0
TranslatedInstructions    0
URL                       0
dtype: int64

In [9]:
data = data.dropna()     # Drop the null values

In [10]:
data.isnull().sum()      # Again checking null values

TranslatedRecipeName      0
TranslatedIngredients     0
PrepTimeInMins            0
CookTimeInMins            0
TotalTimeInMins           0
Servings                  0
Cuisine                   0
Course                    0
Diet                      0
TranslatedInstructions    0
URL                       0
dtype: int64

In [11]:
# Checking whether duplicates present or not

data.duplicated().sum()                 # No duplicates present

0

In [12]:
# Define the preprocess step

def preprocess_text(text):
  text = text.lower()
  text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
  return text

In [13]:
data['TranslatedRecipeName'] = data['TranslatedRecipeName'].apply(preprocess_text)
data['Cuisine'] = data['Cuisine'].apply(preprocess_text)
data['Course'] = data['Course'].apply(preprocess_text)
data['Diet'] = data['Diet'].apply(preprocess_text)

In [14]:
data.head()

Unnamed: 0,TranslatedRecipeName,TranslatedIngredients,PrepTimeInMins,CookTimeInMins,TotalTimeInMins,Servings,Cuisine,Course,Diet,TranslatedInstructions,URL
0,masala karela recipe,"6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S...",15,30,45,6,indian,side dish,diabetic friendly,"To begin making the Masala Karela Recipe,de-se...",https://www.archanaskitchen.com/masala-karela-...
1,spicy tomato rice recipe,"2-1 / 2 cups rice - cooked, 3 tomatoes, 3 teas...",5,10,15,3,south indian recipes,main course,vegetarian,"To make tomato puliogere, first cut the tomato...",http://www.archanaskitchen.com/spicy-tomato-ri...
2,ragi semiya upma recipe ragi millet vermicel...,"1-1/2 cups Rice Vermicelli Noodles (Thin),1 On...",20,30,50,4,south indian recipes,south indian breakfast,high protein vegetarian,"To begin making the Ragi Vermicelli Recipe, fi...",http://www.archanaskitchen.com/ragi-vermicelli...
3,gongura chicken curry recipe andhra style go...,"500 grams Chicken,2 Onion - chopped,1 Tomato -...",15,30,45,4,andhra,lunch,non vegeterian,To begin making Gongura Chicken Curry Recipe f...,http://www.archanaskitchen.com/gongura-chicken...
4,andhra style alam pachadi recipe adrak chutn...,"1 tablespoon chana dal, 1 tablespoon white ura...",10,20,30,4,andhra,south indian breakfast,vegetarian,"To make Andhra Style Alam Pachadi, first heat ...",https://www.archanaskitchen.com/andhra-style-a...


In [15]:
data['Diet'].value_counts()

Diet
vegetarian                      4706
high protein vegetarian          705
non vegeterian                   427
eggetarian                       344
diabetic friendly                260
high protein non vegetarian      225
no onion no garlic  sattvic       73
vegan                             61
gluten free                       50
sugar free diet                   14
Name: count, dtype: int64

In [16]:
data['Cuisine'].value_counts()

Cuisine
indian                  1157
continental             1020
north indian recipes     936
south indian recipes     681
italian recipes          235
                        ... 
jewish                     1
dessert                    1
side dish                  1
shandong                   1
lunch                      1
Name: count, Length: 82, dtype: int64

In [17]:
data['Course'].value_counts()

Course
lunch                           1763
side dish                        992
snack                            876
dinner                           781
dessert                          659
appetizer                        637
main course                      315
south indian breakfast           260
world breakfast                  260
north indian breakfast           122
indian breakfast                 101
vegetarian                        47
one pot dish                      33
high protein vegetarian            7
brunch                             4
vegan                              3
non vegeterian                     2
eggetarian                         1
no onion no garlic  sattvic        1
sugar free diet                    1
Name: count, dtype: int64

In [18]:
data.to_excel('/content/drive/MyDrive/MY PROJECTS/Indian Food Diet Chart chatbots/preprocessed data.xlsx')

In [26]:
# Load the DataFrame
df = pd.read_excel('/content/drive/MyDrive/MY PROJECTS/Indian Food Diet Chart chatbots/preprocessed data.xlsx')

# Define a function to check if a string contains Hindi characters
def contains_hindi(text):
    hindi_pattern = re.compile(r'[\u0900-\u097F]+')  # Hindi Unicode range
    return bool(hindi_pattern.search(text))

# Filter out rows containing Hindi text in the "TranslatedIngredients" column
df = df[~df['TranslatedIngredients'].apply(contains_hindi)]
df = df[~df['TranslatedInstructions'].apply(contains_hindi)]

# Drop rows with NaN values in the "TranslatedIngredients" column
df = df.dropna(subset=['TranslatedIngredients'])

# Reset index
df.reset_index(drop=True, inplace=True)

# Save the cleaned DataFrame
df.to_excel('/content/drive/MyDrive/MY PROJECTS/Indian Food Diet Chart chatbots/cleaned_data.xlsx', index=False)
