In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_regression

Dataset: https://www.kaggle.com/datasets/ahsan81/food-ordering-and-delivery-app-dataset/data

In [2]:
# Load the raw order records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/food_order.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'data/food_order.csv'

In [None]:
# Display the loaded orders table as the cell's output.
df

In [None]:
# Count distinct customer ids (missing ids are ignored).
n_customers = df["customer_id"].nunique()
print(f"Number of customers are {n_customers}")

In [None]:
# Count distinct order ids (missing ids are ignored).
n_orders = df["order_id"].nunique()
print(f"Number of orders are {n_orders}")

In [None]:
# Count distinct restaurant names (missing names are ignored).
n_restaurants = df["restaurant_name"].nunique()
print(f"Number of restaurants are {n_restaurants}")

### Objective

The food aggregator company has stored the data of the different orders made by the registered customers in their online portal. They want to analyze the data to get a fair idea about the demand of different restaurants which will help them in enhancing their customer experience. Suppose you are hired as a Data Scientist in this company and the Data Science team has shared some of the key questions that need to be answered. Perform the data analysis to find answers to these questions that will help the company to improve the business.

### Questions
1. Which restaurant receives the highest orders? And what is the demand across other restaurants?
2. What is the average cost of orders made for each restaurant? And does cost have effect on patronage?
3. What is the standard deviation of food preparation time for each restaurant and does this have effect on patronage?
4. Which customers have made the most orders? And what is the average/variance delivery time to this customer?
5. What is the best restaurant according to rating?
6. What is the best cuisine according to rating?
7. What is the distribution of the cost of orders, food preparation time and delivery time?
8. Are there missing data, duplicates or outliers in the data?

### 1.  Which restaurant receives the highest orders? And what is the demand across other restaurants?

In [None]:
# Map each restaurant name to the number of order rows recorded for it.
restaurant_counts = {
    restaurant: len(orders["order_id"])
    for restaurant, orders in df.groupby("restaurant_name")
}


In [None]:
# One dict entry per restaurant, so the dict size is the restaurant count.
total_restaurants = len(restaurant_counts)
print(f"There are {total_restaurants} restaurants in total")

In [None]:
# Bar chart of order counts for the 50 busiest restaurants.
# Only the top 50 are drawn: with more bars the tick labels become unreadable.
ranked_by_orders = sorted(restaurant_counts.items(), key=lambda kv: kv[1], reverse=True)
labels, values = zip(*ranked_by_orders[:50])

fig, ax = plt.subplots(figsize=(30, 20))
positions = list(range(len(labels)))
ax.bar(positions, values)

# Annotate each bar with its order count, just above the bar top.
for pos, count in enumerate(values):
    ax.text(pos, count, f"{count}", ha="center", va="bottom")

ax.set_xticks(positions)
ax.set_xticklabels(labels, rotation=30, ha="right")
ax.set_xlabel("Restaurant")
# Name the plotted quantity instead of the placeholder "Value" label.
ax.set_ylabel("Number of orders")
ax.set_title("The top 50 restaurants by demand")
fig.tight_layout()
plt.show()

Shake Shack receives the most orders: 219, which is 11.54% of all orders.

### 2. What is the average cost of orders made for each restaurant? And does cost have effect on patronage?

In [None]:
# Map each restaurant name to its mean order cost, rounded to 2 decimals.
restaurant_costs = {
    restaurant: round(orders["cost_of_the_order"].mean(), 2)
    for restaurant, orders in df.groupby("restaurant_name")
}

In [None]:
# Bar chart of the 50 restaurants with the highest average order cost.
# Only the top 50 are drawn: with more bars the tick labels become unreadable.
ranked_by_cost = sorted(restaurant_costs.items(), key=lambda kv: kv[1], reverse=True)
labels, values = zip(*ranked_by_cost[:50])

fig, ax = plt.subplots(figsize=(30, 20))
positions = list(range(len(labels)))
ax.bar(positions, values)

# Annotate each bar with its average cost, just above the bar top.
for pos, avg_cost in enumerate(values):
    ax.text(pos, avg_cost, f"{avg_cost}", ha="center", va="bottom")

ax.set_xticks(positions)
ax.set_xticklabels(labels, rotation=30, ha="right")
ax.set_xlabel("Restaurant")
# Name the plotted quantity instead of the placeholder "Value" label.
ax.set_ylabel("Average order cost ($)")
# The bars are per-restaurant means, so the title says so.
ax.set_title("The top 50 restaurants by average cost of order")
fig.tight_layout()
plt.show()

In [None]:
# One record per restaurant: distinct order count and mean order cost (2 d.p.).
cost_records = [
    {
        "restaurant_name": restaurant,
        "order_count": orders["order_id"].nunique(),
        "avg_cost": round(orders["cost_of_the_order"].mean(), 2),
    }
    for restaurant, orders in df.groupby("restaurant_name")
]

# Explicit column list keeps the layout stable even if no records were produced.
per_rest = pd.DataFrame(cost_records,
                        columns=["restaurant_name", "order_count", "avg_cost"])


        

In [None]:
# Display the per-restaurant table of order counts and average order cost.
per_rest

In [None]:
# Pearson correlation between a restaurant's order count and its average order cost.
order_counts = per_rest["order_count"]
avg_costs = per_rest["avg_cost"]
r, p = pearsonr(order_counts, avg_costs)
print(f"r = {r:.3f}, p = {p:.3g}")

Therefore, there is no statistically significant relationship between restaurant cost and number of orders.

### 3. What is the standard deviation of food preparation time for each restaurant and does this have effect on patronage?

In [None]:
# Map each restaurant name to the standard deviation of its food preparation
# time (2 d.p.). Restaurants whose std is NaN are left out.
restaurant_stds = {}
for restaurant, orders in df.groupby("restaurant_name"):
    prep_std = orders["food_preparation_time"].std()
    if pd.isna(prep_std):
        continue
    restaurant_stds[restaurant] = round(prep_std, 2)
    

In [None]:
# Bare reference to the dict; the trailing semicolon suppresses the cell output.
restaurant_stds;

In [None]:
# Bar chart of the 50 restaurants with the most variable food preparation time.
# Only the top 50 are drawn: with more bars the tick labels become unreadable.
ranked_by_std = sorted(restaurant_stds.items(), key=lambda kv: kv[1], reverse=True)
labels, values = zip(*ranked_by_std[:50])

fig, ax = plt.subplots(figsize=(30, 20))
positions = list(range(len(labels)))
ax.bar(positions, values)

# Annotate each bar with its standard deviation, just above the bar top.
for pos, prep_std in enumerate(values):
    ax.text(pos, prep_std, f"{prep_std}", ha="center", va="bottom")

ax.set_xticks(positions)
ax.set_xticklabels(labels, rotation=30, ha="right")
ax.set_xlabel("Restaurant")
# The bars are standard deviations, not raw preparation times; label them as such.
ax.set_ylabel("Standard deviation of food preparation time (minutes)")
ax.set_title("The top 50 restaurants by standard deviation of food preparation time")
fig.tight_layout()
plt.show()

In [None]:
# One record per restaurant: distinct order count and the standard deviation of
# its food preparation time (2 d.p.). Restaurants with a NaN std are skipped.
std_records = []
for restaurant, orders in df.groupby("restaurant_name"):
    prep_std = orders["food_preparation_time"].std()
    if pd.isna(prep_std):
        continue
    std_records.append({
        "restaurant_name": restaurant,
        "order_count": orders["order_id"].nunique(),
        "preparation_time_std": round(prep_std, 2),
    })

per_rest = pd.DataFrame(std_records,
                        columns=["restaurant_name", "order_count", "preparation_time_std"])


In [None]:
# Display the per-restaurant table of order counts and preparation-time std.
per_rest

In [None]:
# Pearson correlation between order count and the spread of preparation time.
order_counts = per_rest["order_count"]
prep_time_stds = per_rest["preparation_time_std"]
r, p = pearsonr(order_counts, prep_time_stds)
print(f"r = {r:.3f}, p = {p:.3g}")

There is a slight positive correlation between the standard deviation of the food preparation time and the number of orders received. Probing further, I check whether the average preparation time has an effect on the number of orders received.

In [None]:
# One record per restaurant: distinct order count and mean food preparation
# time (2 d.p.). Restaurants with a NaN mean are skipped.
mean_records = []
for restaurant, orders in df.groupby("restaurant_name"):
    prep_mean = orders["food_preparation_time"].mean()
    if pd.isna(prep_mean):
        continue
    mean_records.append({
        "restaurant_name": restaurant,
        "order_count": orders["order_id"].nunique(),
        "preparation_time_mean": round(prep_mean, 2),
    })

per_rest = pd.DataFrame(mean_records,
                        columns=["restaurant_name", "order_count", "preparation_time_mean"])

In [None]:
# Display the per-restaurant table of order counts and mean preparation time.
per_rest

In [None]:
# Pearson correlation between order count and mean preparation time.
order_counts = per_rest["order_count"]
prep_time_means = per_rest["preparation_time_mean"]
r, p = pearsonr(order_counts, prep_time_means)
print(f"r = {r:.3f}, p = {p:.3g}")

Again, the average preparation time of each restaurant does not appear to have any effect on the number of orders it receives. This is surprising, since a longer preparation time might be expected to put off customers who do not want to wait. However, the analysis suggests that customers stick to their preferred cuisines irrespective of preparation time. This could be related to the national background of the customers. It is also possible that the customers are situated close to the restaurants, resulting in lower delivery times.

### 4. Which customers have made the most orders? And what is the average/variance delivery time to this customer?

In [None]:
# One record per customer: distinct order count and mean delivery time (2 d.p.).
customer_records = [
    {
        "customer_id": customer,
        "order_count": orders["order_id"].nunique(),
        "avg_delivery": round(orders["delivery_time"].mean(), 2),
    }
    for customer, orders in df.groupby("customer_id")
]

per_customer = pd.DataFrame(customer_records,
                            columns=["customer_id", "order_count", "avg_delivery"])

In [None]:
# Display the per-customer table of order counts and average delivery time.
per_customer

In [None]:
# Largest number of orders placed by any single customer.
max_order = per_customer["order_count"].max()

In [None]:
# Display the maximum per-customer order count.
max_order

In [None]:
# Tally how many customers placed each number of orders, to check whether several
# customers share the maximum. The author's run showed a single customer with 13 orders.
per_customer["order_count"].value_counts()

In [None]:
# Pick the first customer whose order count equals the maximum.
is_top_customer = per_customer["order_count"] == max_order
max_customer = per_customer.loc[is_top_customer, "customer_id"].iloc[0]

In [None]:
# Report the id of the customer selected above as having the most orders.
print(f"The customer with the max order is customer with id {max_customer}")

In [None]:
# Pearson correlation between a customer's order count and their mean delivery time.
customer_order_counts = per_customer["order_count"]
customer_avg_delivery = per_customer["avg_delivery"]
r, p = pearsonr(customer_order_counts, customer_avg_delivery)
print(f"r = {r:.3f}, p = {p:.3g}")

From the correlation and p-value, we also see that there is no statistically significant correlation between the number of orders made by customers and the average delivery time to those customers.

### 5. What is the best restaurant according to rating?

In [None]:
# Coerce ratings to numbers in place; values that cannot be parsed become NaN.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Map each restaurant name to its mean rating (2 d.p.). Restaurants whose mean
# is NaN (no numeric rating) are left out.
restaurant_ratings = {}
for restaurant, orders in df.groupby("restaurant_name"):
    mean_rating = orders["rating"].mean()
    if pd.isna(mean_rating):
        continue
    restaurant_ratings[restaurant] = round(mean_rating, 2)
    

In [None]:
# Bare reference to the dict; the trailing semicolon suppresses the cell output.
restaurant_ratings;

In [None]:
# Bar chart of the 50 restaurants with the highest average rating.
# Only the top 50 are drawn: with more bars the tick labels become unreadable.
ranked_by_rating = sorted(restaurant_ratings.items(), key=lambda kv: kv[1], reverse=True)
labels, values = zip(*ranked_by_rating[:50])

fig, ax = plt.subplots(figsize=(30, 20))
positions = list(range(len(labels)))
ax.bar(positions, values)

# Annotate each bar with its average rating, just above the bar top.
for pos, avg_rating in enumerate(values):
    ax.text(pos, avg_rating, f"{avg_rating}", ha="center", va="bottom")

ax.set_xticks(positions)
ax.set_xticklabels(labels, rotation=30, ha="right")
ax.set_xlabel("Restaurant")
ax.set_ylabel("Average rating")
# The previous title was copied from the preparation-time plot; this chart shows ratings.
ax.set_title("The top 50 restaurants by average rating")
fig.tight_layout()
plt.show()

In [None]:
# One record per restaurant: distinct order count and mean rating (2 d.p.).
# Restaurants whose mean rating is NaN (no numeric rating) are skipped.
#
# The records are kept under their own name rather than reassigning
# `restaurant_ratings`: the plotting cell above reads that dict as
# name -> float, and overwriting it with name -> [count, rating] would make
# that cell fail when re-run.
rating_records = []
for restaurant, orders in df.groupby("restaurant_name"):
    mean_rating = orders["rating"].mean()
    if pd.isna(mean_rating):
        continue
    rating_records.append({
        "restaurant": restaurant,
        "order_count": orders["order_id"].nunique(),
        "ratings": round(mean_rating, 2),
    })

per_rest = pd.DataFrame(rating_records,
                        columns=["restaurant", "order_count", "ratings"])

In [None]:
# Display the per-restaurant table of order counts and average rating.
per_rest

In [None]:
# Pearson correlation between a restaurant's order count and its average rating.
order_counts = per_rest["order_count"]
avg_ratings = per_rest["ratings"]
r, p = pearsonr(order_counts, avg_ratings)
print(f"r = {r:.3f}, p = {p:.3g}")

There is no statistically significant correlation between ratings and the number of orders made

### 6. What is the best cuisine according to rating? 

In [None]:
# One record per cuisine type: distinct order count and mean rating (2 d.p.).
# Cuisines whose mean rating is NaN (no numeric rating) are skipped.
#
# The records are kept under their own name rather than reassigning
# `restaurant_ratings`: that name holds per-restaurant data used by an earlier
# plotting cell, and filling it with per-cuisine data would break a re-run of
# that cell.
cuisine_records = []
for cuisine, orders in df.groupby("cuisine_type"):
    mean_rating = orders["rating"].mean()
    if pd.isna(mean_rating):
        continue
    cuisine_records.append({
        "cuisine_type": cuisine,
        "order_count": orders["order_id"].nunique(),
        "ratings": round(mean_rating, 2),
    })

per_cuisine = pd.DataFrame(cuisine_records,
                           columns=["cuisine_type", "order_count", "ratings"])

In [None]:
# Display the per-cuisine table of order counts and average rating.
per_cuisine

As we can see from the data, the American cuisine type has the highest order count, with a rating of 4.30. Considering the large number of orders, this is a very good rating. The Spanish cuisine, which has the highest rating, has just 12 orders, so its average rests on far fewer ratings. This might indicate that Spanish cuisine is less popular among these customers, possibly because the restaurants are located in America.

In [None]:
# Pearson correlation between a cuisine's order count and its average rating.
cuisine_order_counts = per_cuisine["order_count"]
cuisine_avg_ratings = per_cuisine["ratings"]
r, p = pearsonr(cuisine_order_counts, cuisine_avg_ratings)
print(f"r = {r:.3f}, p = {p:.3g}")

It is evident that there is no statistically significant correlation between a cuisine type's average rating and the number of orders made.

### 7. What is the distribution of the cost of orders, food preparation time and delivery time?

In [None]:
# Histogram of order cost across all orders (50 equal-width bins).
fig, ax = plt.subplots()
ax.hist(df["cost_of_the_order"], bins=50, edgecolor="black")
ax.set_title("Distribution of Order Cost")
ax.set_xlabel("Cost ($)")
ax.set_ylabel("Frequency")
plt.show()

- The distribution is highly right-skewed (positively skewed) — there’s a strong concentration of orders around \$10–15, and the frequency drops off sharply as cost increases.
- You can also see some secondary small peaks near 20 dollars and 30 dollars, but those are much less frequent.
- The most common order cost (mode) lies around \$10–12, where the tallest bar (≈250+ frequency) appears.
- The mean is likely slightly higher (maybe around \$13–15) because the right tail (more expensive orders) pulls the average up.
- The median will be lower than the mean (another sign of right-skew).
- Order costs range roughly from 5 dollars to 35 dollars, giving a range of ~\$30.
- The majority of orders fall under \$20, with a few high-cost outliers.

In [None]:
# Histogram of food preparation time across all orders (50 equal-width bins).
fig, ax = plt.subplots()
ax.hist(df["food_preparation_time"], bins=50, edgecolor="black")
ax.set_title("Distribution of Food Preparation Time")
ax.set_xlabel("Time (minutes)")
ax.set_ylabel("Frequency")
plt.show()

- The histogram is fairly flat and uniform, with frequencies across most time intervals (≈20–35 minutes) being roughly similar — around 110–130 counts each.
- There’s no sharp peak or clear mode, which means food preparation times are evenly distributed within that range.
- Food preparation times span from about 20 minutes to 35 minutes, showing a moderate range (~15 minutes).
- There’s no strong clustering around any single time window — prep times vary substantially across orders or restaurants.
- A nearly uniform distribution suggests that there’s no dominant or typical preparation time — restaurants likely vary widely in how long food takes to prepare.

In [None]:
# Histogram of delivery time across all orders (50 equal-width bins).
fig, ax = plt.subplots()
ax.hist(df["delivery_time"], bins=50, edgecolor="black")
ax.set_title("Distribution of Delivery Time")
ax.set_xlabel("Time (minutes)")
ax.set_ylabel("Frequency")
plt.show()

- The histogram appears roughly unimodal (one main peak) centered around 25–28 minutes.
- The counts drop at both ends (below 18 minutes and above 30 minutes).
- So, it’s somewhat bell-shaped, but not perfectly symmetric — maybe slightly right-skewed (a few longer delivery times).
- The most common (modal) delivery times are around 25–28 minutes, where frequency peaks (≈150–160).
- That suggests the average delivery time likely falls near that same range.
- Delivery times range roughly from 15 to 33 minutes, so there’s a range of about 18 minutes.
- That’s a moderate spread — most deliveries are within about 10 minutes of the mean.
- There’s a small tail above 30 minutes — those are the longest deliveries, possibly due to traffic, distance, or order complexity.
- Few deliveries occur under 18 minutes, so ultra-fast deliveries are rare.

### 8. Are there missing data, duplicates or outliers in the data?

In [None]:
# Summary statistics of the table's columns, shown as the cell's output.
df.describe()

This shows the min, max, median (50%) and counts across the different columns.

In [None]:
# Print the column dtypes and non-null counts of the table.
df.info()

In [None]:
# Count missing values per column. Note that `rating` was coerced to numeric
# earlier in the notebook, so any unparseable rating is counted as missing here.
df.isnull().sum()

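##### We can see that only the rating column has missing values (these include the entries that could not be converted to numbers when the column was coerced with `pd.to_numeric` earlier). The number is substantial, so we will consider dropping this column in modelling.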

In [None]:
# Correlation between rating and food preparation time (Series.corr, default method).
df["rating"].corr(df["food_preparation_time"])

In [None]:
# List the distinct rating values to confirm what the column contains, including any NaN.
df["rating"].unique()

##### We also see there is little correlation between the rating and food preparation time. Now, we check the mutual information between the two columns

In [None]:
# Keep only rows where both rating and preparation time are numeric and present.
rating_prep = df[["rating", "food_preparation_time"]]
rating_prep = rating_prep.apply(pd.to_numeric, errors="coerce")
sub = rating_prep.dropna()

# Single-feature design matrix (rating) and target (preparation time).
X = sub[["rating"]]
y = sub["food_preparation_time"]

# Fixed random_state so the estimate is reproducible across runs.
mi = mutual_info_regression(X, y, random_state=0)
print(f"MI(rating to food_preparation_time) = {mi[0]:.4f}")

##### This suggests that the two columns are independent: given the rating, we cannot say anything about the food preparation time. The verdict is therefore to drop this column.

#####  To check duplicates

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
df.duplicated().any()

There are no duplicates in the data. We check for duplicates so that a model is not biased or overfitted towards repeated observations.