In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # statistical visualisation

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory
# (top-down walk order), then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Zomato dataset into a DataFrame.
# NOTE(review): the path assumes a Kaggle dataset mounted at
# /kaggle/input/zomato - confirm it is attached before running.
df = pd.read_csv("/kaggle/input/zomato/zomato.csv")
# Display the first few rows as a quick sanity check of the columns
df.head()

You have to perform the following tasks:
Data Cleaning:

Deleting redundant columns.
Renaming the columns.
Dropping duplicates.
Cleaning individual columns.
Remove the NaN values from the dataset
Check for some more Transformations
Data Visualization:

Restaurants delivering Online or not
Restaurants allowing table booking or not
Table booking Rate vs Rate
Best Location
Relation between Location and Rating
Restaurant Type
Gaussian Rest type and Rating
Types of Services
Relation between Type and Rating
Cost of Restaurant
No. of restaurants in a Location
Restaurant type
Most famous restaurant chains in Bengaluru
Regression Analysis:

Linear Regression
Decision Tree Regression
Random Forest Regression

In [None]:
# Let's check the contents first: column names, dtypes and non-null counts
df.info()

Deleting redundant columns.


In [None]:
# Columns treated as redundant for this analysis.
columns_to_drop = ['url', 'address', 'phone', 'menu_item', 'dish_liked']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

Dropping duplicates

In [None]:
# Preview the duplicate rows. duplicated() compares entire rows and flags every
# repeat after the first occurrence.
df.loc[df.duplicated()]

In [None]:
# Remove fully duplicated rows (the first occurrence is kept), then show the
# resulting (rows, columns) shape.
df = df.drop_duplicates()
df.shape

Cleaning columns

In [None]:
# Peek at the last five rows (tail() defaults to n=5)
df.tail()

In [None]:
# Cleaning the 'rate' column: keep the part before '/' and convert it to float.
# NOTE: a later cell defines another function with this same name for the cost
# column; after that cell runs, `clean_column` refers to the cost version.

def clean_column(x):
    """Parse a rating such as '4.1/5' into a float.

    Parameters
    ----------
    x : str or number or None
        Raw cell value. For strings, only the text before the first '/' is
        parsed. Numeric values are passed through unchanged.

    Returns
    -------
    float
        The parsed rating, or ``np.nan`` when the value is missing or is not
        numeric text (for example 'NEW' or '-').
    """
    if isinstance(x, str):
        x = x.split('/')[0]
    try:
        return float(x)
    # Catch only conversion failures; a bare `except:` would also swallow
    # KeyboardInterrupt and hide genuine programming errors.
    except (TypeError, ValueError):
        return np.nan

# Element-wise conversion of the raw 'rate' text into a numeric column
df['cleaned_rate'] = df['rate'].map(clean_column)

In [None]:
# The raw text column is superseded by 'cleaned_rate'
df = df.drop(columns='rate')

In [None]:
# Confirm 'rate' is gone and 'cleaned_rate' is present with a float dtype
df.info()

In [None]:
# Cleaning the cost column: strip thousands separators and convert to float.
# NOTE: this reuses the name `clean_column` from the rate-cleaning cell above
# and therefore shadows that earlier definition from here on.

def clean_column(x):
    """Parse a cost such as '1,200' into a float.

    Parameters
    ----------
    x : str or number or None
        Raw cell value. Commas are removed from strings before parsing.
        Numeric values are passed through unchanged, so applying this function
        to an already-cleaned column does not turn it into NaN.

    Returns
    -------
    float
        The parsed cost, or ``np.nan`` when the value is missing or is not
        numeric text.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        return float(x)
    # Catch only conversion failures; a bare `except:` would also swallow
    # KeyboardInterrupt and hide genuine programming errors.
    except (TypeError, ValueError):
        return np.nan

# Element-wise conversion of the cost text into floats, overwriting the column
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].map(clean_column)

In [None]:
# Confirm the cost column now has a float dtype, and see how many values are non-null
df.info()

Replacing NaN values in the cost and rate columns (mean imputation)

In [None]:
# Impute missing ratings with the mean of the observed ratings
# (mean() skips NaN by default).
rate_mean = df['cleaned_rate'].mean()
df = df.fillna({'cleaned_rate': rate_mean})

In [None]:
# Check that no nulls remain in the rate column: the non-null count of
# 'cleaned_rate' should now equal the total number of rows.
df.info() 

In [None]:
# Impute missing costs with the mean of the observed costs
# (mean() skips NaN by default).
cost_mean = df['approx_cost(for two people)'].mean()
df = df.fillna({'approx_cost(for two people)': cost_mean})

In [None]:
# Check that no nulls remain in the cost column: its non-null count should now
# equal the total number of rows.
df.info()

In [None]:
# Rows whose 'cuisines' value is missing
df.loc[df['cuisines'].isna()]

In [None]:
# Missing cuisines become the placeholder string 'n/a'
df = df.fillna({'cuisines': 'n/a'})

In [None]:
# Check for remaining nulls: compare each column's non-null count with the row total
df.info()

In [None]:
# Missing restaurant types become the placeholder string 'n/a'
df = df.fillna({'rest_type': 'n/a'})

In [None]:
# Check for remaining nulls: compare each column's non-null count with the row total
df.info()

The dataset is now almost clean. One last step remains before we move on to the next section, Visualizations: filling the missing values in the location column.

In [None]:
# Missing locations become the placeholder string 'n/a'
df = df.fillna({'location': 'n/a'})

In [None]:
# Final null check: compare each column's non-null count with the row total
df.info()

Visualisations

In [None]:
# Count of restaurants allowing table booking

# Print the raw counts behind the chart. A bare expression in the middle of a
# cell is evaluated and discarded, so it must be printed to be seen.
print(df['book_table'].value_counts())

# Create a figure and axes
fig, axes = plt.subplots()

# Draw onto the axes created above. `data` is passed by keyword because older
# seaborn releases do not accept it positionally.
sns.countplot(data=df, x='book_table', ax=axes)

# Set labels and title
axes.set_xlabel('Booking Allowed')
axes.set_ylabel('Count')
axes.set_title('Count of Bookings')

# Show the plot
plt.show()

In [None]:
# Count of restaurants allowing online orders

# Print the raw counts behind the chart. A bare expression in the middle of a
# cell is evaluated and discarded, so it must be printed to be seen.
print(df['online_order'].value_counts())

# Create a figure and axes
fig, axes = plt.subplots()

# Draw onto the axes created above. `data` is passed by keyword because older
# seaborn releases do not accept it positionally.
sns.countplot(data=df, x='online_order', ax=axes)

# Set labels and title
axes.set_xlabel('Online Order Allowed')
axes.set_ylabel('Count')
axes.set_title('Count of Online Orders Allowed')

# Show the plot
plt.show()

In [None]:
# Relationship between online orders and rating

# Create a figure and axes
fig, axes = plt.subplots()

# One box of ratings per online_order value, drawn on the axes created above.
# `data` is passed by keyword because older seaborn releases do not accept it
# positionally.
sns.boxplot(data=df, x='online_order', y='cleaned_rate', ax=axes)

# Set labels and title
axes.set_xlabel('Online Order Allowed')
axes.set_ylabel('Rating of Restaurant')
axes.set_title('Rating vs Online Order')

# Show the plot
plt.show()

In [None]:
# Locations of the 100 highest-rated restaurants

# Many restaurants share the same rating, so use a stable sort: ties keep their
# existing row order and the top-100 selection is reproducible between runs.
df_sorted = df.sort_values(by='cleaned_rate', ascending=False, kind='stable').head(100)

# Create figure and axes
fig, axes = plt.subplots()

# Count restaurants per location on the axes created above, with bars ordered
# from the most to the least frequent location.
sns.countplot(
    data=df_sorted,
    x='location',
    order=df_sorted['location'].value_counts().index,
    ax=axes,
)

# Rotate x-axis labels and set font size so location names stay readable
axes.tick_params(axis='x', labelrotation=25, labelsize=10)

# Set labels and title
axes.set_xlabel('Location')
axes.set_ylabel('Number of Restaurant')
axes.set_title('Top Locations for Zomato Restaurants')

# Adjust layout to prevent clipping of the rotated labels
fig.tight_layout()

plt.show()

In [None]:
# Preview the first rows of the top-rated subset built in the previous cell
df_sorted.head()