# Introduction

This Project is based on a problem about a Restaurant Market place called Zomato, this company is an intermediate between customers and restaurants.

As a study case, we were hired to help the new CEO to understand how Zomato numbers are going, He sent to us a list of question that must be answered as soon as possible in an online dashboard to show a visual knowledge from the data for a better understanding.


## Imports

In [1]:
from IPython.core.display   import HTML
from forex_python.converter import CurrencyRates
from datetime               import datetime
from folium.plugins         import MarkerCluster, ScrollZoomToggler
from folium.map             import LayerControl

import numpy                as np
import pandas               as pd
import seaborn              as sns
import matplotlib.pyplot    as plt
import plotly.express       as px
import plotly.graph_objects as go
import inflection
import folium


## Useful Functions

In [2]:
# This Function create a better visual set to Jupyter

def jupyter_settings():
    %matplotlib inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

jupyter_settings()

# Fulfill the the country names with the matching ID

countries = {
    1:'India',
    14:'Australia',
    30:'Brazil',
    37:'Canada',
    94:'Indonesia',
    148:'New Zeland',
    162:'Philippines',
    166:'Qatar',
    184:'Singapure',
    189:'South Africa',
    191:'Sri Lanka',
    208:'Turkey',
    214:'United Arab Emirates',
    215:'England',
    216:'United States of America'
}

def country_name( country_id ):
    return countries[country_id]

# Create food type

def price_type( price_range ):
    if price_range == 1:
        return 'cheap'
    elif price_range == 2:
        return 'normal'
    elif price_range == 3:
        return 'expensive'
    else:
        return 'gourmet'

# Colors

colors = {
    '3F7E00': 'darkgreen',
    '5BA829': 'green',
    '9ACD32': 'lightgreen',
    'CDD614': 'orange',
    'FFBA00': 'red',
    'CBCBC8': 'darkred',
    'FF7800': 'darkred'
}

def color_name( color_code ):
    return colors[color_code]

def rename_columns( dataframe ):
    df = dataframe.copy()
    title = lambda x: inflection.titleize( x )
    snakecase = lambda x: inflection.underscore( x )
    spaces = lambda x: x.replace( ' ', '' )
    cols_old = list( df.columns )
    cols_old = list( map( title, cols_old ) )
    cols_old = list( map( spaces, cols_old ) )
    cols_new = list( map( snakecase, cols_old ) )
    df.columns = cols_new
    
    return df

# Your supervisor asked you to initially to consider all the restaurants only by one kind of cuisine

# df["cuisines"] = df.loc[:, "cuisines"].apply(lambda x: x.split(",")[0])

# I've decide to convert the values to USD dollar, and add a new column with the value converted

currencies = {
    'Philippines':'PHP',
    'Brazil':'BRL',
    'Australia':'AUD',
    'United States of America':'USD',
    'Canada':'CAD',
    'Singapure':'SGD',
    'United Arab Emirates':'AED',
    'India':'INR',
    'Indonesia':'IDR',
    'New Zeland':'NZD',
    'England':'GBP',
    'Qatar':'QAR',
    'South Africa':'ZAR',
    'Sri Lanka':'LKR',
    'Turkey':'TRY'
}

def define_currency( currency ):    
    return currencies[currency]

# Create a CurrencyRates object

c = CurrencyRates()

# Example: Convert 100 USD to EUR

usd_amount = 1000
eur_amount = c.convert('EUR', 'USD', usd_amount)
print(f"{usd_amount} USD is equal to {eur_amount} EUR")


1000 USD is equal to 1076.8 EUR


# Checking Data

In [3]:
df = pd.read_csv( 'zomato.csv' )
df.head()


Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6310675,Mama Lou's Italian Kitchen,162,Las Piñas City,"Block 1, Lot 36, Tropical Avenue Corner Tropic...",BF International,"BF International, Las Piñas City",121.009787,14.447615,Italian,1100,Botswana Pula(P),1,0,0,0,3,4.6,3F7E00,Excellent,619
1,6310675,Mama Lou's Italian Kitchen,162,Las Piñas City,"Block 1, Lot 36, Tropical Avenue Corner Tropic...",BF International,"BF International, Las Piñas City",121.009787,14.447615,Italian,1100,Botswana Pula(P),1,0,0,0,3,4.6,3F7E00,Excellent,619
2,6314542,Blackbird,162,Makati City,"Nielson Tower, Ayala Triangle Gardens, Salcedo...","Ayala Triangle Gardens, Salcedo Village, Makat...","Ayala Triangle Gardens, Salcedo Village, Makat...",121.024562,14.556042,"European, Asian",3100,Botswana Pula(P),0,0,0,0,4,4.7,3F7E00,Excellent,469
3,6301293,Banapple,162,Makati City,"Ayala Triangle Gardens, Salcedo Village, Makat...","Ayala Triangle Gardens, Salcedo Village, Makat...","Ayala Triangle Gardens, Salcedo Village, Makat...",121.023171,14.556196,"Filipino, American, Italian, Bakery",800,Botswana Pula(P),0,0,0,0,3,4.4,5BA829,Very Good,867
4,6315689,Bad Bird,162,Makati City,"Hole In The Wall, Floor 4, Century City Mall, ...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027708,14.565899,American,700,Botswana Pula(P),0,0,0,0,3,4.4,5BA829,Very Good,858


In [4]:
# Checking the data type

df.dtypes


Restaurant ID             int64
Restaurant Name          object
Country Code              int64
City                     object
Address                  object
Locality                 object
Locality Verbose         object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Currency                 object
Has Table booking         int64
Has Online delivery       int64
Is delivering now         int64
Switch to order menu      int64
Price range               int64
Aggregate rating        float64
Rating color             object
Rating text              object
Votes                     int64
dtype: object

In [5]:
# Look for NaNs

df.isna().sum()


Restaurant ID            0
Restaurant Name          0
Country Code             0
City                     0
Address                  0
Locality                 0
Locality Verbose         0
Longitude                0
Latitude                 0
Cuisines                15
Average Cost for two     0
Currency                 0
Has Table booking        0
Has Online delivery      0
Is delivering now        0
Switch to order menu     0
Price range              0
Aggregate rating         0
Rating color             0
Rating text              0
Votes                    0
dtype: int64

In [6]:
# Check the datset size, and check if we have duplicated restaurants

print( df.shape ) 
print( df['Restaurant ID'].nunique() )


(7527, 21)
6942


# Cleaning and modeling the data

In [7]:
# Using the functions to prepare the columns below:

# We could notice that there are duplicated Restaurants ID

df = df.drop_duplicates()

# Match the countries' name with their code

df['Country Code'] = df['Country Code'].apply( country_name )

# Standardize the currencies

df['Currency'] = df['Country Code'].apply( define_currency )

# Create a column with USD value converted

df['Dollar USD Value'] = df.apply( lambda x: c.convert( x['Currency'], 'USD', x['Average Cost for two'] ), axis=1 )
df['Dollar USD Value'] = df['Dollar USD Value'].round( 2 )

# Create the price range

df['Price range'] = df['Price range'].apply( price_type )

# Match the colors with thir codes

df['Rating color'] = df['Rating color'].apply( color_name )

# Select just the first cuisine tha shows up in the line

df['Cuisines'] = df.loc[:, 'Cuisines'].apply( lambda x: str(x).split( ',' )[0] )

# Rename the columns

df = rename_columns( df )
df.head()


Unnamed: 0,restaurant_id,restaurant_name,country_code,city,address,locality,locality_verbose,longitude,latitude,cuisines,average_cost_for_two,currency,has_table_booking,has_online_delivery,is_delivering_now,switch_to_order_menu,price_range,aggregate_rating,rating_color,rating_text,votes,dollar_usd_value
0,6310675,Mama Lou's Italian Kitchen,Philippines,Las Piñas City,"Block 1, Lot 36, Tropical Avenue Corner Tropic...",BF International,"BF International, Las Piñas City",121.009787,14.447615,Italian,1100,PHP,1,0,0,0,expensive,4.6,darkgreen,Excellent,619,19.66
2,6314542,Blackbird,Philippines,Makati City,"Nielson Tower, Ayala Triangle Gardens, Salcedo...","Ayala Triangle Gardens, Salcedo Village, Makat...","Ayala Triangle Gardens, Salcedo Village, Makat...",121.024562,14.556042,European,3100,PHP,0,0,0,0,gourmet,4.7,darkgreen,Excellent,469,55.4
3,6301293,Banapple,Philippines,Makati City,"Ayala Triangle Gardens, Salcedo Village, Makat...","Ayala Triangle Gardens, Salcedo Village, Makat...","Ayala Triangle Gardens, Salcedo Village, Makat...",121.023171,14.556196,Filipino,800,PHP,0,0,0,0,expensive,4.4,green,Very Good,867,14.3
4,6315689,Bad Bird,Philippines,Makati City,"Hole In The Wall, Floor 4, Century City Mall, ...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027708,14.565899,American,700,PHP,0,0,0,0,expensive,4.4,green,Very Good,858,12.51
5,6304833,Manam,Philippines,Makati City,"Level 1, Greenbelt 2, Ayala Center, Greenbelt,...","Greenbelt 2, San Lorenzo, Makati City","Greenbelt 2, San Lorenzo, Makati City, Makati ...",121.02038,14.552351,Filipino,700,PHP,0,0,0,0,expensive,4.7,darkgreen,Excellent,930,12.51


In [8]:
print( df.shape )
print( df.isna().sum() )
print( df.dtypes )


(6942, 22)
restaurant_id           0
restaurant_name         0
country_code            0
city                    0
address                 0
locality                0
locality_verbose        0
longitude               0
latitude                0
cuisines                0
average_cost_for_two    0
currency                0
has_table_booking       0
has_online_delivery     0
is_delivering_now       0
switch_to_order_menu    0
price_range             0
aggregate_rating        0
rating_color            0
rating_text             0
votes                   0
dollar_usd_value        0
dtype: int64
restaurant_id             int64
restaurant_name          object
country_code             object
city                     object
address                  object
locality                 object
locality_verbose         object
longitude               float64
latitude                float64
cuisines                 object
average_cost_for_two      int64
currency                 object
has_table_booking   

## Descriptive Statistics

In [9]:
# We are going to divide the columns into two datasets with different kind of variables ( Numeral and categorical )

num_attributes = df.select_dtypes( include=['float64', 'int64'] )
cat_attributes = df.select_dtypes( exclude=['float64', 'int64', 'datetime64[ns]'] )


### Numerical Attributes

In [10]:
# Central tendency - mean and median

ct1 = pd.DataFrame( num_attributes.apply( np.mean ) ).T
ct2 = pd.DataFrame( num_attributes.apply( np.median ) ).T

# Dispersion - std, min, max, range, skew, kurtosis

d1 = pd.DataFrame( num_attributes.apply( np.std ) ).T
d2 = pd.DataFrame( num_attributes.apply( min ) ).T
d3 = pd.DataFrame( num_attributes.apply( max ) ).T
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min() ) ).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew() ) ).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() ) ).T

# Concatenate

m = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
m_rounded = m.round(2)
m_rounded


Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,restaurant_id,549.0,19040277.0,19039728.0,10761928.71,7900186.5,7037181.43,-0.14,-1.71
1,longitude,-122.7,175.31,298.01,31.44,73.76,79.05,-0.61,-0.73
2,latitude,-41.33,55.98,97.31,19.43,25.27,23.5,-1.08,0.55
3,average_cost_for_two,0.0,25000017.0,25000017.0,7506.58,250.0,302229.72,81.5,6737.98
4,has_table_booking,0.0,1.0,1.0,0.06,0.0,0.24,3.69,11.6
5,has_online_delivery,0.0,1.0,1.0,0.35,0.0,0.48,0.63,-1.6
6,is_delivering_now,0.0,1.0,1.0,0.17,0.0,0.38,1.71,0.94
7,switch_to_order_menu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,aggregate_rating,0.0,4.9,4.9,4.13,4.2,0.67,-3.94,21.16
9,votes,0.0,41333.0,41333.0,604.38,328.0,1115.66,11.38,291.56


In [13]:
# Extract unique values from the 'dollar_usd_value' column

range_values = df['dollar_usd_value'].unique()

# Sort the list in descending order

range_values_sorted = sorted( range_values, reverse=True )

range_values_sorted


[16298370.35,
 7699.68,
 6416.4,
 6095.58,
 5774.76,
 5133.12,
 4491.48,
 3849.84,
 3208.2,
 3079.87,
 2566.56,
 2309.9,
 1924.92,
 1668.26,
 1283.28,
 1026.62,
 781.63,
 756.42,
 693.38,
 650.0,
 641.64,
 600.0,
 579.92,
 567.31,
 560.68,
 523.19,
 516.89,
 500.0,
 485.37,
 479.07,
 472.76,
 466.46,
 453.85,
 441.24,
 440.0,
 428.64,
 397.12,
 384.51,
 378.21,
 371.31,
 365.6,
 363.88,
 359.3,
 353.0,
 350.0,
 346.69,
 340.39,
 330.0,
 327.78,
 315.17,
 314.47,
 302.57,
 300.0,
 289.96,
 289.31,
 283.66,
 277.35,
 275.0,
 271.05,
 264.75,
 252.49,
 252.14,
 250.0,
 245.84,
 239.53,
 233.93,
 233.23,
 226.93,
 226.42,
 222.79,
 220.62,
 215.36,
 214.32,
 210.0,
 208.02,
 204.22,
 201.71,
 201.26,
 200.51,
 200.0,
 195.41,
 189.1,
 185.66,
 185.42,
 182.8,
 180.0,
 176.5,
 175.0,
 170.19,
 170.0,
 169.96,
 163.89,
 163.52,
 163.38,
 162.98,
 159.66,
 157.59,
 151.28,
 150.94,
 150.0,
 148.52,
 148.25,
 144.98,
 140.0,
 138.68,
 138.37,
 133.67,
 132.37,
 130.39,
 130.0,
 129.96,
 126.07

The list before showed that we have only one outlier with the value, which is already converted to USD dollar, 15932993.75, 
so I decided to breake this line value down to 1593.3, it is probably a typing error, but in any case it will be demonstrade to the CEO, after we got a return from
the restaurant if the price is really that

In [21]:
df.loc[df['dollar_usd_value'] == 16298370.35, 'dollar_usd_value'] = 1630.00
df.loc[df['dollar_usd_value'] == 1630.00]


Unnamed: 0,restaurant_id,restaurant_name,country_code,city,address,locality,locality_verbose,longitude,latitude,cuisines,average_cost_for_two,currency,has_table_booking,has_online_delivery,is_delivering_now,switch_to_order_menu,price_range,aggregate_rating,rating_color,rating_text,votes,dollar_usd_value
385,16608070,d'Arry's Verandah Restaurant,Australia,Adelaide,"Osborn Rd, McLaren Vale",McLaren Vale,McLaren Vale,138.545242,-35.198372,Modern Australian,25000017,AUD,1,0,0,0,cheap,4.7,darkgreen,Excellent,203,1630.0


### Categorical Attributes

In [22]:
# Check the range of the variables

cat_attributes.apply( lambda x: x.unique().shape[0] )


restaurant_name     5914
country_code          15
city                 125
address             6760
locality            2272
locality_verbose    2357
cuisines             166
currency              15
price_range            4
rating_color           6
rating_text           28
dtype: int64

In [28]:
%pwd

'/home/gabri7sc/ds_in_progress/DS_2024/ftc_python_eda_zomato/jupyter_analyze'

# CEO's Questions

In [29]:
# Save a copy of df as CSV and start work with a checkpoint df1

# Get today's date

today_date = datetime.now().strftime("%d-%m-%Y")

# Define the filename

filename = f"zomato_treated_{today_date}.csv"

# Define the directory path

directory_path = '/home/gabri7sc/ds_in_progress/DS_2024/ftc_python_eda_zomato/dataset'

# Construct the full file path

file_path = f'{directory_path}/{filename}'

# Save DataFrame to CSV

df.to_csv( file_path, index=False )

print(f"DataFrame saved to: {file_path}")

# Checkpoint

df1 = df.copy()


DataFrame saved to: /home/gabri7sc/ds_in_progress/DS_2024/ftc_python_eda_zomato/dataset/zomato_treated_15-04-2024.csv


## General View

In [30]:
# How many unique Restaurants are registered?

print( f'The number of unique restaurants is {df1["restaurant_id"].nunique()}.' )

# How many unique Countries are registered?

print( f'The number of unique Countries is {df1["country_code"].nunique()}.' )

# How many unique Cities are registered?

print( f'The number of unique Cities is {df1["city"].nunique()}.' )
      
# What is the total of reviews?

print( f'The total of reviews is {df1["votes"].sum()}.' )

# What is the total of Cuisines?

print( f'The total of cuisines is {df1["cuisines"].nunique()}.' )
      

The number of unique restaurants is 6942.
The number of unique Countries is 15.
The number of unique Cities is 125.
The total of reviews is 4195634.
The total of cuisines is 166.


## Countries' View

In [31]:
# Which Country has more registered cities?

q1 =  df1[['country_code', 'city']].groupby( 'country_code' ).nunique().sort_values( 'city', ascending=False ).reset_index()
q1


Unnamed: 0,country_code,city
0,India,49
1,United States of America,22
2,Philippines,12
3,South Africa,12
4,England,5
5,New Zeland,4
6,United Arab Emirates,4
7,Australia,3
8,Brazil,3
9,Canada,3


In [32]:
# Which Country has more restaurants?

q2 =  df1[['country_code', 'restaurant_id']].groupby( 'country_code' ).count().sort_values( 'restaurant_id', ascending=False ).reset_index()
q2


Unnamed: 0,country_code,restaurant_id
0,India,3120
1,United States of America,1378
2,England,400
3,South Africa,346
4,United Arab Emirates,300
5,Brazil,240
6,New Zeland,239
7,Australia,180
8,Canada,180
9,Turkey,159


In [33]:
# Which is the country with more restaurants where price is equal to 4?

q3 = ( df1.loc[df1['price_range'] == 'gourmet', :][['country_code','restaurant_id']]
                                                  .groupby( 'country_code' ).count().sort_values( 'restaurant_id', ascending=False ).reset_index() )
q3


Unnamed: 0,country_code,restaurant_id
0,United States of America,417
1,South Africa,221
2,India,195
3,Brazil,148
4,United Arab Emirates,102
5,England,79
6,New Zeland,79
7,Turkey,59
8,Singapure,58
9,Canada,54


In [34]:
# Which country has more kind of cuisines?

q4 = df1[['country_code', 'cuisines']].groupby( 'country_code' ).nunique().sort_values( 'cuisines', ascending=False ).reset_index()
q4


Unnamed: 0,country_code,cuisines
0,India,78
1,United States of America,75
2,England,52
3,United Arab Emirates,46
4,Australia,43
5,Brazil,43
6,Canada,42
7,South Africa,42
8,New Zeland,37
9,Qatar,31


In [35]:
# Which country has more reviews?

q5 = df1[['country_code', 'votes']].groupby( 'country_code' ).sum().sort_values( 'votes', ascending=False ).reset_index()
q5


Unnamed: 0,country_code,votes
0,India,2800199
1,United States of America,523414
2,United Arab Emirates,177564
3,Australia,130357
4,Canada,105018
5,Turkey,100193
6,Indonesia,89026
7,South Africa,81939
8,New Zeland,52532
9,Philippines,48398


In [36]:
# Which country has more delivery food service?

q6 = ( df1.loc[df1['is_delivering_now'] == 1, :][['country_code', 'is_delivering_now']]
                                                .groupby( 'country_code' ).count().sort_values( 'is_delivering_now', ascending=False ).reset_index() )
q6


Unnamed: 0,country_code,is_delivering_now
0,India,1150
1,United Arab Emirates,52
2,Philippines,7
3,Qatar,3


In [37]:
# Which country has more quantity of table booking?

q7 = ( df1.loc[df1['has_table_booking'] == 1, :][['country_code', 'has_table_booking']]
                                                .groupby( 'country_code' ).count().sort_values( 'has_table_booking', ascending=False ).reset_index() )
q7


Unnamed: 0,country_code,has_table_booking
0,India,256
1,England,55
2,Australia,29
3,Philippines,22
4,New Zeland,19
5,Indonesia,14
6,United Arab Emirates,10
7,Qatar,4
8,Turkey,4
9,South Africa,3


In [38]:
# Which country has more review in average?

q8 = df1[['country_code', 'votes']].groupby( 'country_code' ).mean().sort_values( 'votes', ascending=False ).reset_index()
q8.round( 2 )


Unnamed: 0,country_code,votes
0,Indonesia,1112.82
1,India,897.5
2,Australia,724.21
3,Turkey,630.14
4,Philippines,604.98
5,United Arab Emirates,591.88
6,Canada,583.43
7,United States of America,379.84
8,Qatar,376.32
9,South Africa,236.82


In [39]:
# Which country has the highest average review?

q9 = df1[['country_code', 'aggregate_rating']].groupby( 'country_code' ).mean().sort_values( 'aggregate_rating', ascending=False ).reset_index()
q9.round( 2 )


Unnamed: 0,country_code,aggregate_rating
0,Indonesia,4.6
1,Philippines,4.46
2,Singapure,4.44
3,United States of America,4.4
4,Australia,4.37
5,Canada,4.32
6,Turkey,4.31
7,Qatar,4.24
8,New Zeland,4.16
9,South Africa,4.06


In [40]:
# Which country has the lowest average review?

q10 = df1[['country_code', 'aggregate_rating']].groupby( 'country_code' ).mean().sort_values( 'aggregate_rating', ascending=True ).reset_index()
q10.round( 2 )


Unnamed: 0,country_code,aggregate_rating
0,Brazil,3.32
1,United Arab Emirates,4.02
2,India,4.03
3,England,4.04
4,Sri Lanka,4.06
5,South Africa,4.06
6,New Zeland,4.16
7,Qatar,4.24
8,Turkey,4.31
9,Canada,4.32


In [41]:
# Which is the average price for two dishes?

q11 = df1[['country_code', 'average_cost_for_two']].groupby( 'country_code' ).mean().sort_values( 'average_cost_for_two', ascending=True ).reset_index()
q11.round( 2 )

# OBs: The cost is in their local currency, to make a real comparison it must be onverted to an unique one, like dollar for example


Unnamed: 0,country_code,average_cost_for_two
0,Canada,41.86
1,England,43.51
2,United States of America,55.02
3,New Zeland,62.15
4,Turkey,128.58
5,Brazil,138.81
6,Singapure,141.44
7,United Arab Emirates,153.72
8,Qatar,174.0
9,South Africa,339.23


In [42]:
# Now considering USD dollar as pattern

q11 = df1[['country_code', 'dollar_usd_value']].groupby( 'country_code' ).mean().sort_values( 'dollar_usd_value', ascending=True ).reset_index()
q11.round( 2 )


Unnamed: 0,country_code,dollar_usd_value
0,Turkey,4.17
1,India,8.48
2,South Africa,17.96
3,Indonesia,19.38
4,Philippines,21.94
5,Qatar,25.41
6,Brazil,27.91
7,Canada,31.05
8,New Zeland,37.92
9,England,54.73


## Cities' View

In [43]:
# Which city has more restaurants?

q12 = df1[['city', 'restaurant_id']].groupby( 'city' ).count().sort_values( 'restaurant_id', ascending=False ).reset_index()
q12.head( 20 )


Unnamed: 0,city,restaurant_id
0,Abu Dhabi,80
1,Kanpur,80
2,Mangalore,80
3,Manchester,80
4,Lucknow,80
5,London,80
6,Kolkata,80
7,Kochi,80
8,Jaipur,80
9,Edinburgh,80


In [44]:
# Which city has more restaurant with an average review higher than 4?

q13 = ( df1.loc[df1['aggregate_rating'] > 4, :][['city', 'restaurant_id', 'aggregate_rating']].groupby( 'city' )
       .agg( {'restaurant_id': 'count', 'aggregate_rating': 'mean'} ).sort_values( 'restaurant_id', ascending=False ).reset_index() )
q13.columns = ['city', 'restaurant_id', 'aggregate_mean_rating']
q13.round( 2 ).head( 20 )


Unnamed: 0,city,restaurant_id,aggregate_mean_rating
0,Bangalore,79,4.5
1,London,78,4.63
2,Houston,75,4.5
3,Jakarta,74,4.63
4,Chennai,72,4.48
5,Auckland,72,4.42
6,Kolkata,70,4.42
7,İstanbul,69,4.46
8,Pune,69,4.45
9,Mumbai,69,4.46


In [45]:
q13 = df1.loc[df1['aggregate_rating'] > 4, :][['city', 'country_code', 'restaurant_id']]
q13 = q13.groupby( ['city', 'country_code'] ).count().reset_index()
q13

Unnamed: 0,city,country_code,restaurant_id
0,Abu Dhabi,United Arab Emirates,58
1,Adelaide,Australia,51
2,Agra,India,41
3,Ahmedabad,India,58
4,Allahabad,India,33
5,Amritsar,India,39
6,Ankara,Turkey,62
7,Atlanta,United States of America,59
8,Auckland,New Zeland,72
9,Aurangabad,India,41


In [46]:
# Which city has more restaurants with an average review lower than 2.5?

q14 = ( df1.loc[ ( df1['aggregate_rating'] < 2.5) & ( df1['votes'] >= 1 ), :][['city', 'restaurant_id', 'aggregate_rating', 'votes']].groupby( 'city' )
       .agg( {'restaurant_id':'count', 'aggregate_rating':'mean', 'votes':'first'} ).sort_values( 'restaurant_id', ascending=False ).reset_index() )
q14.columns = ['city', 'restaurant_id', 'aggregate_mean_rating', 'votes']
q14.head( 10 ).round( 2 )


Unnamed: 0,city,restaurant_id,aggregate_mean_rating,votes
0,Gangtok,39,0.0,2
1,Ooty,17,0.0,2
2,Brasília,13,0.0,1
3,Rio de Janeiro,11,0.43,3
4,São Paulo,10,0.0,1
5,Shimla,4,0.0,1
6,Manchester,4,0.0,3
7,Clarens,4,0.0,2
8,Hamilton,2,2.35,71
9,Edinburgh,2,0.0,3


In [47]:
# Which city has the most expensive average dish for two?

q15 = df1[['city', 'average_cost_for_two']].groupby( 'city' ).mean().sort_values( 'average_cost_for_two', ascending=False ).reset_index()
q15.round( 2 )

# again, when it comes to price, we'll need to convert to a pattern currency to be able to fairly analyze it


Unnamed: 0,city,average_cost_for_two
0,Adelaide,416734.13
1,Jakarta,309605.26
2,Tangerang,180000.0
3,Bogor,175000.0
4,Pasay City,4000.0
5,Colombo,2579.38
6,Noida,1407.14
7,Gurgaon,1375.0
8,Thane,1375.0
9,Mumbai,1372.37


In [48]:
# Consider dollar value

q15 = df1[['city', 'dollar_usd_value']].groupby( 'city' ).mean().sort_values( 'dollar_usd_value', ascending=False ).reset_index()
q15.round( 2 )


Unnamed: 0,city,dollar_usd_value
0,Colombo,3310.06
1,Dubai,243.31
2,Abu Dhabi,200.21
3,Fujairah,196.35
4,Sharjah,135.92
5,Singapore,105.03
6,New York City,101.5
7,Miami,92.0
8,Chicago,83.75
9,Washington DC,80.38


In [49]:
# Which city has more types of cuisines?

q16 = df1[['city', 'cuisines']].groupby( 'city' ).nunique().sort_values( 'cuisines', ascending=False ).reset_index()
q16.head( 10 )


Unnamed: 0,city,cuisines
0,Birmingham,32
1,Doha,31
2,Montreal,30
3,São Paulo,30
4,Manchester,30
5,Houston,30
6,Perth,29
7,Philadelphia,29
8,Portland,28
9,Calgary,28


In [50]:
# Which city has more booking table service?

q17 = df1.loc[df1['has_table_booking'] == 1, :][['city', 'restaurant_id']].groupby( 'city' ).count().sort_values( 'restaurant_id', ascending=False ).reset_index()
q17.head( 10 )


Unnamed: 0,city,restaurant_id
0,Bangalore,42
1,Chennai,28
2,Pune,25
3,Mumbai,18
4,Brisbane,17
5,Auckland,17
6,New Delhi,16
7,London,15
8,Edinburgh,13
9,Jakarta,13


In [51]:
# Which city has more delivery service?

q18 = df1.loc[df1['is_delivering_now'] == 1, :][['city', 'restaurant_id']].groupby( 'city' ).count().sort_values( 'restaurant_id', ascending=False ).reset_index()
q18.head( 10 )


Unnamed: 0,city,restaurant_id
0,Vadodara,48
1,Amritsar,48
2,Aurangabad,47
3,Ludhiana,46
4,Bhopal,46
5,Dehradun,45
6,Ranchi,42
7,Jaipur,42
8,Ahmedabad,41
9,Varanasi,40


In [52]:
# Which city has more online ordering service?

q19 = df1.loc[df1['has_online_delivery'] == 1, :][['city', 'restaurant_id']].groupby( 'city' ).count().sort_values( 'restaurant_id', ascending=False ).reset_index()
q19.head( 10 )


Unnamed: 0,city,restaurant_id
0,Bhopal,75
1,Vadodara,74
2,Abu Dhabi,71
3,Sharjah,71
4,Nagpur,69
5,Aurangabad,69
6,Patna,69
7,Ranchi,69
8,Coimbatore,68
9,Allahabad,68


## Restaurants' View

In [53]:
# Which Restaurant has more reviews?

q20 = df1[['restaurant_id', 'restaurant_name', 'votes']].groupby( ['restaurant_id'] ).sum().sort_values( 'votes', ascending=False ).reset_index()
q20.head( 10 )


Unnamed: 0,restaurant_id,restaurant_name,votes
0,90896,Bawarchi,41333
1,58882,Byg Brewski Brewing Company,17394
2,51705,Toit,15270
3,51040,Truffles,14984
4,308322,Hauz Khas Social,13627
5,56618,AB's - Absolute Barbecues,12443
6,91662,Paradise,11910
7,18207447,Shah Ghouse Hotel & Restaurant,11836
8,20404,Peter Cat,11476
9,54162,The Black Pearl,10892


In [54]:
# Which restaurant has the highest average review?

q21 = df1[['restaurant_name', 'restaurant_id', 'aggregate_rating']].groupby( 'restaurant_name' ).agg( {'restaurant_id': 'first', 'aggregate_rating': 'mean'} )
q21 = q21.sort_values(by=['aggregate_rating', 'restaurant_id'], ascending=[False, True]).reset_index()
q21.head(10)


Unnamed: 0,restaurant_name,restaurant_id,aggregate_rating
0,Indian Grill Room,7528,4.9
1,Restaurant Mosaic @ The Orient,75989,4.9
2,Tapri Central,101212,4.9
3,Pousada By The Beach,130664,4.9
4,The Huddle Sports Bar and Grill - Citymax Hote...,203518,4.9
5,Mint Leaf of London,207770,4.9
6,Sharma Ji Ki Chai,800466,4.9
7,Kebabsville - Sayaji Hotel,1400177,4.9
8,The Sidewalk,1600500,4.9
9,The Public House Café n Restaurant,2600702,4.9


In [55]:
# Which restaurant has the most expensive dish for two?

q22 = df1[['restaurant_name', 'restaurant_id', 'average_cost_for_two']].sort_values( 'average_cost_for_two', ascending=False )
q22.head( 10 )


Unnamed: 0,restaurant_name,restaurant_id,average_cost_for_two
385,d'Arry's Verandah Restaurant,16608070,25000017
6049,The Café - Hotel Mulia,7403667,1200000
6086,GIA Restaurant & Bar,7423164,700000
6056,Shabu-Shabu Shaburi,7425027,600000
6044,SKYE,7402935,600000
6072,Union,7403002,550000
6091,Gioi,18407143,550000
6046,Union Deli,7422751,550000
6074,Sushi Masa,7420899,500000
6092,Holy Smokes,17977526,500000


In [56]:
# Considering USD dollar

q22 = df1[['restaurant_name', 'restaurant_id', 'dollar_usd_value']].sort_values( 'dollar_usd_value', ascending=False )
q22.head( 10 )


Unnamed: 0,restaurant_name,restaurant_id,dollar_usd_value
7322,The Gallery Cafe,5800240,7699.68
7296,Taprobane - Cinnamon Grand,5800161,6416.4
7355,Harbour Court - The Kingsbury,5800678,6416.4
7348,ON14 - Ozo,5801074,6095.58
7336,The Manhattan Fish Market,5800710,5774.76
7299,Chutneys - Cinnamon Grand,5800205,5133.12
7306,T.G.I. Friday's,5800746,5133.12
7354,Latitude - Taj Samudra,5800355,5133.12
7321,The Bavarian,5800011,5133.12
7344,Loon Tao,5800260,5133.12


In [57]:
# Which brazilian cuisine restaurant has the lowest average review?

q23 = ( df1.loc[( df1['cuisines'] == 'Brazilian') & ( df1['votes'] >= 1 ), :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[True, True] ).reset_index() )
q23.head( 20 )


Unnamed: 0,index,restaurant_id,restaurant_name,aggregate_rating
0,126,6600100,Loca Como tu Madre,0.0
1,105,6600119,Café Savana,0.0
2,125,6600124,Inácia Poulet Rôti,0.0
3,151,6600456,Galeteria Beira Lago,0.0
4,129,6601251,Severina,0.0
5,167,6601525,Restaurante dos Amigos,0.0
6,166,6601894,NAVE MÃE FAST FOOD GOURMET,0.0
7,318,6713117,Bar do Luiz Fernandes,0.0
8,186,7304791,Rancho Inn,0.0
9,220,18177599,Quitéria,0.0


In [58]:
# Now which brazilian restaurant that has brazilian cuisine is the best reviewed?

q24 = ( df1.loc[( df1['country_code'] == 'Brazil' ) & ( df1['cuisines'] == 'Brazilian' ),:][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[False, True] ).reset_index() )
q24.head( 10 )


Unnamed: 0,index,restaurant_id,restaurant_name,aggregate_rating
0,207,7300955,Braseiro da Gávea,4.9
1,254,7302898,Aprazível,4.9
2,346,6709740,Mocotó,4.8
3,215,7300515,Garota de Ipanema,4.8
4,179,7301525,Fogo de Chão,4.8
5,217,7300483,Zazá Bistrô Tropical,4.7
6,261,7302859,Aconchego Carioca,4.7
7,347,6703176,Veloso,4.6
8,213,7300521,Balada Mix,4.6
9,345,6709580,Esquina Mocotó,4.5


In [59]:
# q25 The restaurants that has online ordering have in average more votes?

print( f"The average votes of restaurants with online service is {df1.loc[df1['has_online_delivery'] == 1, :]['votes'].mean():.2f}." )
print( f" The average votes of restaurants without online service is {df1.loc[df1['has_online_delivery'] == 0, :]['votes'].mean():.2f}." )


The average votes of restaurants with online service is 838.82.
 The average votes of restaurants without online service is 478.28.


In [60]:
# q26 The restaurants which has booking service, are also the ones with more expensive dishe for two?

print( f"The average price for two dishes in restaurants with booking service is {df1.loc[df1['has_table_booking'] == 1, :]['dollar_usd_value'].mean():.2f}" )
print( f"The average price for two dishes in restaurants without booking service is {df1.loc[df1['has_table_booking'] == 0, :]['dollar_usd_value'].mean():.2f}" )


The average price for two dishes in restaurants with booking service is 46.13
The average price for two dishes in restaurants without booking service is 73.54


In [61]:
# q27 The Japnese cuisene restaurants in USA has a mean price for two dishes higher than the steakhouses?

print( f"The average price for two dishes in Japanese restaurants in USA is {df1.loc[df1['cuisines'] == 'Japanese', :]['dollar_usd_value'].mean():.2f}" )
print( f"The average price for two dishes in BBQ restaurants in USA is {df1.loc[df1['cuisines'] == 'BBQ', :]['dollar_usd_value'].mean():.2f}" )


The average price for two dishes in Japanese restaurants in USA is 76.78
The average price for two dishes in BBQ restaurants in USA is 34.34


## Cuisines' View

In [62]:
# Among the restaurants with italian cuisine, which one has the highest aggregate rating?

q28 = ( df1.loc[df1['cuisines'] == 'Italian', :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[False, True] ).reset_index( drop=True ) )
q28.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,10021,Darshan,4.9
1,6501298,Cafe Del Sol Classico,4.9
2,7100171,Ombra,4.9
3,7700796,Celino's,4.9
4,16587684,Andre's Cucina & Polenta Bar,4.9
5,16663419,Di Rienzo Grocery & Deli,4.9
6,16733182,Chicago Pizza & Oven Grinder Company,4.9
7,16797555,Regina Pizzeria,4.9
8,16831684,Bottega Louie,4.9
9,16923457,Perricone's Marketplace & Café,4.9


In [63]:
# Among the restaurants with italian cuisine, which one has the lowest aggregate rating?

q29 = ( df1.loc[( df1['cuisines'] == 'Italian' ) & ( df1['votes'] >= 1 ), :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[True, True] ).reset_index( drop=True ) )
q29.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,6714499,Ristorantino,0.0
1,6715707,Più,0.0
2,6800666,Le Delicatezze Di Bruno,0.0
3,7304910,Bene - Sheraton Rio Hotel,0.0
4,18334183,La Bocca Bar e Trattoria,0.0
5,16596386,Siena's,2.3
6,18373064,RPM Italian,2.6
7,3700020,Satsanga,2.8
8,7700300,Jamie's Italian,2.9
9,7900354,Cock & Bull,3.0


In [64]:
# Among the restaurants with American cuisine, which one has the highest aggregate rating?

q30 = ( df1.loc[df1['cuisines'] == 'American', :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[False, True] ).reset_index( drop=True ) )
q30.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,6102616,Burger & Lobster,4.9
1,6801374,Solita,4.9
2,16631515,OEB Breakfast Co.,4.9
3,16737455,Portillo's Hot Dogs,4.9
4,16776778,S'MAC,4.9
5,16777384,Shake Shack,4.9
6,16782050,5 Napkin Burger,4.9
7,16783153,Shake Shack,4.9
8,16785398,Shake Shack,4.9
9,16799018,Fat Cat,4.9


In [65]:
# Among the restaurants with American cuisine, which one has the lowest aggregate rating?

q31 = ( df1.loc[( df1['cuisines'] == 'American' ) & ( df1['votes'] >= 1 ), :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[True, True] ).reset_index( drop=True ) )
q31.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,18445965,Guy Fieri's Kitchen & Bar,2.2
1,7303748,Bob's,2.6
2,6502573,Hard Rock Cafe,2.8
3,7100971,Thunderbird Cafe,3.2
4,16541324,Chili's,3.3
5,18245065,O'Learys,3.3
6,18692865,Lava Lounge,3.3
7,6601283,Capital Steakhouse,3.4
8,18253075,Yard & Coop,3.4
9,6600162,Outback Steakhouse,3.5


In [66]:
# Among the restaurants with Arab cuisine, which one has the highest aggregate rating?

q32 = ( df1.loc[df1['cuisines'] == 'Arabian', :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[False, True] ).reset_index( drop=True ) )
q32.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,18535007,Mandi@36,4.7
1,5600517,Madfoon Al Khaimah,4.5
2,6200166,Wok of Fame,4.5
3,18770281,Three Kings,4.5
4,201824,Aroos Damascus,4.4
5,5601340,Aroos Damascus,4.4
6,209723,Azkadenya,4.3
7,18427358,Azkadenya,4.3
8,207265,Awani,4.2
9,902109,Sheba Kuzhimandhi,4.2


In [67]:
# Among the restaurants with Arab cuisine, which one has the lowest aggregate rating?

q33 = ( df1.loc[( df1['cuisines'] == 'Arabian' ) & ( df1['votes'] >= 1 ), :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[True, True] ).reset_index( drop=True ) )
q33.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,6706513,Raful,0.0
1,6601535,Empório Árabe,2.7
2,6600547,Salim Sou Eu,3.1
3,18698592,V Empire,3.2
4,18698769,Abad Hot Chicken,3.2
5,16540460,Murjan - Nour Arjaan by Rotana,3.3
6,16541428,Bon Appetit,3.3
7,6600939,Libanus,3.4
8,16540483,Breeze - Radisson Blu Resort,3.4
9,16540485,Al Nokhada - Radisson Blu Resort,3.4


In [68]:
# Among the restaurants with Japanese cuisine, which one has the highest aggregate rating?

q34 = ( df1.loc[df1['cuisines'] == 'Japanese', :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[False, True] ).reset_index( drop=True ) )
q34.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,6107336,Sushi Samba,4.9
1,6116563,Chotto Matte,4.9
2,6309831,Ramen Yushoken,4.9
3,6316125,Mendokoro Ramenba,4.9
4,6318506,Ooma,4.9
5,7300004,Sushi Leblon,4.9
6,16598168,Nobu Perth,4.9
7,16924138,Samurai,4.9
8,16989399,Noble Fish,4.9
9,18266425,Gyu - Kaku Japanese BBQ,4.9


In [69]:
# Among the restaurants with Japanese cuisine, which one has the lowest aggregate rating?

q35 = ( df1.loc[( df1['cuisines'] == 'Japanese' ) & ( df1['votes'] >= 1 ), :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[True, True] ).reset_index( drop=True ) )
q35.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,6600203,Banzai Sushi,0.0
1,7300930,Orienthai,2.4
2,7304077,Hachiko,2.5
3,6600314,Haná,2.7
4,6600379,Sushi Loko,2.9
5,6600319,Nazo Sushi Bar,3.2
6,6600968,Kojma,3.4
7,6601143,Soho,3.6
8,6601606,Mormaii Surf Bar,3.6
9,6601623,Kojima,3.6


In [70]:
# Among the restaurants with Home-made cuisine, which one has the highest aggregate rating?

q36 = ( df1.loc[df1['cuisines'] == 'Home-made', :][['restaurant_id', 'restaurant_name', 'aggregate_rating']]
       .sort_values( by=['aggregate_rating', 'restaurant_id'], ascending=[False, True] ).reset_index( drop=True ) )
q36.head( 20 )


Unnamed: 0,restaurant_id,restaurant_name,aggregate_rating
0,5914190,Kanaat Lokantası,4.0
1,6007184,GurMekan Restaurant,3.7


In [71]:
# Which cuisine has the highest price for a dish for two people?

q37 = df1[['cuisines', 'dollar_usd_value']].groupby( 'cuisines' ).mean().round( 2 ).sort_values( 'dollar_usd_value', ascending=False ).reset_index()
q37.head( 20 )


Unnamed: 0,cuisines,dollar_usd_value
0,Sri Lankan,2910.27
1,Finger Food,385.72
2,German,375.66
3,Juices,246.62
4,British,235.47
5,Irish,201.6
6,New Mexican,180.0
7,Middle Eastern,173.28
8,Modern Australian,163.57
9,Vietnamese,151.1


In [72]:
# Which cuisine has the highest aggregate rating?

q38 = df1[['cuisines', 'aggregate_rating']].groupby( 'cuisines' ).mean().round( 2 ).sort_values( 'aggregate_rating', ascending=False ).reset_index()
q38.head( 20 )


Unnamed: 0,cuisines,aggregate_rating
0,Others,4.9
1,Ottoman,4.8
2,Egyptian,4.8
3,Ramen,4.8
4,Sunda,4.75
5,Fresh Fish,4.75
6,Author,4.7
7,Polish,4.7
8,Burmese,4.65
9,Crepes,4.6


In [73]:
# Which cuisine accept more online service?

q39 = ( df1.loc[df1['has_online_delivery'] == 1, :][['cuisines', 'has_online_delivery']]
       .groupby( 'cuisines' ).count().sort_values( 'has_online_delivery', ascending=False ).reset_index() )
q39


Unnamed: 0,cuisines,has_online_delivery
0,North Indian,647
1,Cafe,211
2,Fast Food,141
3,Chinese,141
4,Biryani,117
5,South Indian,108
6,Pizza,97
7,Burger,96
8,Continental,91
9,Indian,80
