In [8]:
# Indian food soooo yummmy

# Importing visualization and data handling libraries

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Notebook display settings: show up to 100 rows and up to 100 characters per cell.
# The exact registered option names are used; pandas treats the option string as a
# pattern, so a near-miss such as 'display.max.rows' only resolves by accident and
# can break if a similarly named option is added later.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 100)

First things first, always take a quick look at the data so I can have a better idea of what I am working with.
And going from there I will decide what needs to be changed before I proceed on with the data analysis portion.

In [9]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv('../data/indian_food.csv')
# Fixed seed so the preview shows the same 15 rows on every Restart & Run All.
df.sample(15, random_state=42)

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
22,Chhena poda,"Sugar, chenna cheese",vegetarian,10,45,sweet,dessert,Odisha,East
179,Dahi vada,"Urad dal, bhuna chana, garam masala, dates, tamarind",vegetarian,30,30,-1,snack,Maharashtra,West
50,Pootharekulu,"Rice flour, powdered sugar, ghee",vegetarian,10,60,sweet,dessert,Andhra Pradesh,South
160,Theeyal,"Coconut, whole red beans, masala, sesame oil, tamarind",vegetarian,15,20,-1,main course,Kerala,South
217,Khaman,"Yogurt, fresh coconut, sesame seeds, semolina, gram flour",vegetarian,10,20,spicy,snack,Gujarat,West
239,Koldil Chicken,"Banana flower, chicken, green chili, mustard oil, lemon juice",non vegetarian,-1,-1,spicy,main course,Assam,North East
177,Daal Dhokli,"Whole wheat flour, dal, kokum, gur, bengal gram flour",vegetarian,20,30,spicy,main course,Gujarat,West
86,Dal tadka,"Pigeon peas, garam masala, ginger, red onion, kasuri methi",vegetarian,10,30,spicy,main course,Punjab,North
72,Aloo shimla mirch,"Potato, shimla mirch, garam masala, amchur powder, salt",vegetarian,10,40,spicy,main course,Punjab,North
224,Luchi,"Maida, vegetable oil",vegetarian,20,30,-1,main course,West Bengal,East


Before going further I want to make the text columns `ingredients`, `name`, `state` and `region` lower case.

In [10]:
# Text columns to normalise to lower case so later name/state comparisons are
# case-insensitive.
lower_columns = ['ingredients', 'name', 'state', 'region']
# Work column by column; str() is applied first so every cell is lower-cased as text.
df[lower_columns] = df[lower_columns].apply(
    lambda column: column.map(lambda cell: str(cell).lower())
)
df[lower_columns]

Unnamed: 0,ingredients,name,state,region
0,"maida flour, yogurt, oil, sugar",balu shahi,west bengal,east
1,"gram flour, ghee, sugar",boondi,rajasthan,west
2,"carrots, milk, sugar, ghee, cashews, raisins",gajar ka halwa,punjab,north
3,"flour, ghee, kewra, milk, clarified butter, sugar, almonds, pistachio, saffron, green cardamom",ghevar,rajasthan,west
4,"milk powder, plain flour, baking powder, ghee, milk, sugar, water, rose water",gulab jamun,west bengal,east
...,...,...,...,...
250,"glutinous rice, black sesame seeds, gur",til pitha,assam,north east
251,"coconut milk, egg yolks, clarified butter, all purpose flour",bebinca,goa,west
252,"cottage cheese, dry dates, dried rose petals, pistachio, badam",shufta,jammu & kashmir,north
253,"milk powder, dry fruits, arrowroot powder, all purpose flour",mawa bati,madhya pradesh,central


#### Assessing -1 values in data set
There are 5 different columns that possess values that do not make sense.
Aside from the possibility that the data is merely missing I have some ideas as to why those values are like that.
* For `prep_time` and `cook_time`, maybe this is indicative of things that take very little time to cook, but I doubt this.
* For `state` and `region` I think that it's possible that these dishes may be common in a wider area of India and don't have
as much of a home as others.
* As for `flavor_profile`, maybe the flavor for these dishes is more complicated than spicy, sweet, bitter and sour, but I doubt this.

In [11]:
# Per-column count of cells holding the -1 placeholder, whether it appears as
# the integer -1 or as the string '-1'.
df.map(lambda cell: cell in (-1, '-1')).sum()

name               0
ingredients        0
diet               0
prep_time         30
cook_time         28
flavor_profile    29
course             0
state             24
region            13
dtype: int64

#### Looking at categorical data
Below I want to be able to see how many of each value is in most of the columns.
By doing so I will have a much better idea for what kind of categories lie within the data,
which will give me more ideas on what I can do with my imaginary restaurant menu.
I did not include certain columns such as 'name' and 'ingredients' as they are far too unique.
I also did not include 'prep_time' and 'cook_time' as they are better represented with something like a bar chart.

#### Key points from the categorical data
* Only 29 `non vegetarian` options; as someone who is sensitive to plants, I find this troubling for my stomach.
* I live in Medellín, Colombia, where people do not eat spicy food and peppers are not easy to find. That doesn't really matter for the dishes themselves, because you can always take the spice out, but it directly relates to this project in terms of the demographics of the people who would be eating this food.
* Mostly main courses. Only 2 `starters`, I will probably combine this with snack. Snack and starter are conceptually similar so it makes sense to combine them, especially considering there are only 2 `starters`, and having a category that small doesn't make much sense.
* There are many states; many of them have only a few dishes, but some have many. `-1` in this category makes up 24 dishes.
* The western region of India seems to have the most dishes. `-1` in this category makes up 13 dishes.

In [12]:
# Print a frequency table for each categorical column.
for col in df.columns:
    # Skip the near-unique text columns and the two numeric time columns.
    if col in ('name', 'ingredients', 'prep_time', 'cook_time'):
        continue
    print(df[col].value_counts(), '\n\n')

diet
vegetarian        226
non vegetarian     29
Name: count, dtype: int64 


flavor_profile
spicy     133
sweet      88
-1         29
bitter      4
sour        1
Name: count, dtype: int64 


course
main course    129
dessert         85
snack           39
starter          2
Name: count, dtype: int64 


state
gujarat            35
punjab             32
maharashtra        30
west bengal        24
-1                 24
assam              21
tamil nadu         20
andhra pradesh     10
uttar pradesh       9
kerala              8
odisha              7
karnataka           6
rajasthan           6
telangana           5
bihar               3
goa                 3
manipur             2
jammu & kashmir     2
madhya pradesh      2
uttarakhand         1
tripura             1
nagaland            1
nct of delhi        1
chhattisgarh        1
haryana             1
Name: count, dtype: int64 


region
west          74
south         59
north         49
east          31
north east    25
-1            13
ce

## `-1` data
Now I would like to take a closer look at the `-1` data.

#### `flavor_profile`
It looks like many of the items contain rice, or flour, which could mean that many of these items are
base foods, which is something that kind of serves as a foundation for a meal. Like rice dishes and bread.
I looked up online all the dishes here, and yes they are pretty much entirely rice and bread dishes, they all look wonderful.

I did some research online and found more appropriate categories for everything that is `-1`.

In [13]:
# All rows whose flavor_profile is the '-1' placeholder.
df.loc[df['flavor_profile'].eq('-1')]

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
78,chapati,"whole wheat flour, olive oil, hot water, all purpose flour",vegetarian,10,10,-1,main course,maharashtra,west
104,naan,"whole wheat flour, honey, butter, garlic",vegetarian,60,30,-1,main course,punjab,north
116,rongi,"garam masala powder, tomato, kasuri methi, cinnamon, mustard oil",vegetarian,10,30,-1,main course,punjab,north
131,kanji,"carrot, yellow mustard, red chilli, black salt",vegetarian,10,45,-1,snack,kerala,south
145,pachadi,"coconut oil, cucumber, curd, curry leaves, mustard seeds",vegetarian,10,25,-1,main course,-1,south
146,paniyaram,"yogurt, ginger, curry leaves, baking soda, green chilli",vegetarian,10,20,-1,main course,tamil nadu,south
150,paruppu sadam,"arhar dal, sambar powder, tomato, curry leaves, fennel seeds",vegetarian,10,20,-1,main course,tamil nadu,south
153,puli sadam,"urad dal, lemon, tamarind, cooked rice, curry leaves",vegetarian,10,20,-1,main course,tamil nadu,south
155,puttu,"brown rice flour, sugar, grated coconut",vegetarian,495,40,-1,main course,kerala,south
157,sandige,"thin rice flakes, black sesame seeds, curry leaves",vegetarian,120,60,-1,main course,karnataka,south


In [20]:
# For the rows with a '-1' flavor_profile, split the comma-separated
# ingredient string into a list of individual ingredients.
flavor_sub1 = df.loc[df['flavor_profile'] == '-1', 'ingredients'].map(
    lambda text: str(text).split(', ')
)
flavor_sub1

78                           [whole wheat flour, olive oil, hot water, all purpose flour]
104                                            [whole wheat flour, honey, butter, garlic]
116                    [garam masala powder, tomato, kasuri methi, cinnamon, mustard oil]
131                                      [carrot, yellow mustard, red chilli, black salt]
145                            [coconut oil, cucumber, curd, curry leaves, mustard seeds]
146                             [yogurt, ginger, curry leaves, baking soda, green chilli]
150                        [arhar dal, sambar powder, tomato, curry leaves, fennel seeds]
153                                [urad dal, lemon, tamarind, cooked rice, curry leaves]
155                                             [brown rice flour, sugar, grated coconut]
157                                  [thin rice flakes, black sesame seeds, curry leaves]
158                                                      [sevai, parboiled rice, steamer]
159       

In [25]:
# Names of the dishes whose flavor_profile is the '-1' placeholder.
df.loc[df['flavor_profile'] == '-1', 'name']

78            chapati
104              naan
116             rongi
131             kanji
145           pachadi
146         paniyaram
150     paruppu sadam
153        puli sadam
155             puttu
157           sandige
158             sevai
159      thayir sadam
160           theeyal
171            bhakri
176        copra paak
179         dahi vada
180          dalithoy
189            kansar
216        farsi puri
222              khar
224             luchi
227    bengena pitika
228       bilahi maas
229        black rice
231        brown rice
236     chingri bhape
244           pakhala
245        pani pitha
248          red rice
Name: name, dtype: object

In [None]:
# Hand-assigned flavours for dishes whose flavor_profile is the '-1' placeholder.
sweet_dishes = ['copra paak', 'kansar', 'dahi vada', 'puttu']
spicy_dishes = ['rongi', 'theeyal', 'khar', 'bengena pitika', 'bilahi maas', 'chingri bhape', 'paniyaram']
sour_dishes = ['kanji', 'puli sadam', 'pachadi', 'pakhala']

# Every remaining unlabelled dish that is not in one of the three lists above.
# This has to be computed before the assignments below, while those rows can
# still be identified. Rows already set to 'other' by a previous run of this
# cell are included so re-running the cell yields the same list.
labelled_dishes = set(sweet_dishes + spicy_dishes + sour_dishes)
others = [
    dish
    for dish in df.loc[df['flavor_profile'].isin(['-1', 'other']), 'name']
    if dish not in labelled_dishes
]

# Updating the 'flavor_profile' column based on the dish names
df.loc[df['name'].isin(sweet_dishes), 'flavor_profile'] = 'sweet'
df.loc[df['name'].isin(spicy_dishes), 'flavor_profile'] = 'spicy'
df.loc[df['name'].isin(others), 'flavor_profile'] = 'other'
df.loc[df['name'].isin(sour_dishes), 'flavor_profile'] = 'sour'

In [None]:
# Summary statistics for the two time columns. The -1 entries are placeholders
# rather than real durations, so they are treated as missing here; otherwise
# they would distort the count, mean and min.
(
    df['cook_time'].replace(-1, float('nan')).describe(),
    df['prep_time'].replace(-1, float('nan')).describe(),
)

In [36]:
# Unique ingredients across all dishes, in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids a
# linear `not in list` scan for every ingredient.
all_ingredients = list(dict.fromkeys(
    ingredient
    for item in df['ingredients'].tolist()
    for ingredient in item.split(', ')
))
all_ingredients, len(all_ingredients)

(['maida flour',
  'yogurt',
  'oil',
  'sugar',
  'gram flour',
  'ghee',
  'carrots',
  'milk',
  'cashews',
  'raisins',
  'flour',
  'kewra',
  'clarified butter',
  'almonds',
  'pistachio',
  'saffron',
  'green cardamom',
  'milk powder',
  'plain flour',
  'baking powder',
  'water',
  'rose water',
  'sugar syrup',
  'lentil flour',
  'maida',
  'corn flour',
  'baking soda',
  'vinegar',
  'curd',
  'turmeric',
  'cardamom',
  'cottage cheese',
  'rice',
  'dried fruits',
  'nuts',
  'refined flour',
  'besan',
  'powdered sugar',
  'yoghurt',
  'firm white pumpkin',
  'kitchen lime',
  'alum powder',
  'condensed milk',
  'spices',
  'semolina',
  'khoa',
  'coconut',
  'molu leaf',
  'dry fruits',
  'chhena',
  'chenna cheese',
  'cream',
  'lemon juice',
  'coconut flakes',
  'chenna',
  'fried milk power',
  'fennel seeds',
  'besan flour',
  'jaggery',
  'rice flour',
  'wheat flour',
  'sweetened milk',
  'reduced milk',
  'vegetable oil',
  'elachi',
  'cardamom powder