# Generating data for Pizza sell

In [89]:
# Import required library
import pandas as pd
from bs4 import BeautifulSoup
import random
from datetime import datetime, timedelta
from selenium import webdriver
from faker import Faker

In [90]:
# Define the URL to extract data
url='https://pizzaonline.dominos.co.in/menu?categoryId=1'

In [91]:
# Initialize the Chrome driver
driver=webdriver.Chrome()

# Open the URL
driver.get(url)

In [92]:
# Parse the HTML content
soup=BeautifulSoup(driver.page_source,'html.parser')

# Generating Data for Pizza Type Table

In [93]:
# Find all the name elements with class 'itm-dsc__nm'
name_elements=soup.find_all('span',class_='itm-dsc__nm')

In [96]:
# Extract all the names from name elements

# Generate a list to hold the data
name=[]
for name_element in name_elements:
    name.append(name_element.text)
print(name)

['Blazing Onion & Paprika', 'Fiery Sausage & Paprika', 'Peppy Paneer', 'Veg Extravaganza', 'Veggie Paradise', 'Cheese n Corn', 'Pepper Barbecue Chicken', 'Chicken Sausage', 'Match Day Veg Combo', 'Paneer Spice Supreme', 'Fiery Jalapeno & Paprika', 'Blazing Onion & Paprika', 'Fiery Jalapeno & Paprika', 'Blazing Chicken & Paprika', 'Fiery Sausage & Paprika', 'Paneer Spice Supreme', 'Fiery Jalapeno & Paprika', 'Peppy Paneer', 'Corn n Cheese Paratha Pizza', 'Blazing Onion & Paprika', 'Veg Extravaganza', 'Indi Tandoori Paneer', 'Veggie Paradise', 'Cheese n Corn', 'Cheese Overload', 'Mexican Green Wave', 'Double Cheese Margherita', 'Fresh Veggie', 'Achari Do Pyaza', 'Pepper Barbecue Chicken', 'Chicken Dominator', 'Fiery Sausage & Paprika', 'Blazing Chicken & Paprika', 'The 5 Chicken Feast Pizza', 'Chicken Keema Paratha Pizza', 'Non Veg Supreme', 'Chicken Sausage', 'Spiced Double Chicken', 'Chicken Golden Delight', 'Indi Chicken Tikka', 'Chicken Pepperoni', 'Chicken Fiesta', 'Pepsi 475ml', 'P

In [97]:
# Find all the description elements with class 'itm-dsc__dscrptn'
desc_elements=soup.find_all('span',class_='itm-dsc__dscrptn')

# Generate a list to hold the data
description=[]

# Extract all the descriptions from desc elements
for desc_element in desc_elements:
    description.append(desc_element.text)
print(description)

["Hot & spicy pizza with onion & red paprika toppings and a new spicy peri peri sauce on a Domino's cheesy base.", 'Spiciest non veg pizza with spicy & herby chicken sausage and red paprika toppings on a new spicy peri peri sauce base.', 'Flavorful trio of juicy paneer, crisp capsicum with spicy red paprika', 'Black olives, capsicum, onion, grilled mushroom, corn, tomato, jalapeno & extra cheese', 'The awesome foursome! Golden corn, black olives, capsicum, red paprika', 'A delectable combination of sweet & juicy golden corn', 'Pepper barbecue chicken for that extra zing', 'American classic! Spicy, herbed chicken sausage on pizza', 'Regular Margherita +Regular  Farmhouse + Garlic Bread + Pepsi', 'Loaded with Paneer, Red Paprika, Olives and Jalapeno; The Best a No Onion No Garlic Pizza can get!', 'Spiciest veg pizza with jalapeno & red paprika toppings and a new spicy peri peri sauce.', "Hot & spicy pizza with onion & red paprika toppings and a new spicy peri peri sauce on a Domino's che

In [98]:
# Generating data for Pizza type table
category=['Veg','Nonveg']  # Categorize data according to Veg and Nonveg

# Generate a list to hold the data
combinations=[]
i=1
for n, d in zip(name, description):
    for c in category:
        combinations.append({
            "Pizza_type_id":i,
            "Name": n,
            "Category": c,
            "Description": d
        })
        i+=1
print(combinations)

[{'Pizza_type_id': 1, 'Name': 'Blazing Onion & Paprika', 'Category': 'Veg', 'Description': "Hot & spicy pizza with onion & red paprika toppings and a new spicy peri peri sauce on a Domino's cheesy base."}, {'Pizza_type_id': 2, 'Name': 'Blazing Onion & Paprika', 'Category': 'Nonveg', 'Description': "Hot & spicy pizza with onion & red paprika toppings and a new spicy peri peri sauce on a Domino's cheesy base."}, {'Pizza_type_id': 3, 'Name': 'Fiery Sausage & Paprika', 'Category': 'Veg', 'Description': 'Spiciest non veg pizza with spicy & herby chicken sausage and red paprika toppings on a new spicy peri peri sauce base.'}, {'Pizza_type_id': 4, 'Name': 'Fiery Sausage & Paprika', 'Category': 'Nonveg', 'Description': 'Spiciest non veg pizza with spicy & herby chicken sausage and red paprika toppings on a new spicy peri peri sauce base.'}, {'Pizza_type_id': 5, 'Name': 'Peppy Paneer', 'Category': 'Veg', 'Description': 'Flavorful trio of juicy paneer, crisp capsicum with spicy red paprika'}, {'

In [99]:
# Convert the generated data into data frame
p_type=pd.DataFrame(combinations)

In [100]:
# Check top five rows of the dataset
p_type.head()

Unnamed: 0,Pizza_type_id,Name,Category,Description
0,1,Blazing Onion & Paprika,Veg,Hot & spicy pizza with onion & red paprika top...
1,2,Blazing Onion & Paprika,Nonveg,Hot & spicy pizza with onion & red paprika top...
2,3,Fiery Sausage & Paprika,Veg,Spiciest non veg pizza with spicy & herby chic...
3,4,Fiery Sausage & Paprika,Nonveg,Spiciest non veg pizza with spicy & herby chic...
4,5,Peppy Paneer,Veg,"Flavorful trio of juicy paneer, crisp capsicum..."


# Generating Data for Pizza Table

In [101]:
# Generating the data for Pizza table
i=1
s=['Small','Medium','Large'] # Categorize data according to various sizes

# Generate a list to hold the data
p_data=[]
for ptid in combinations:
    for s1 in s:      
        p_data.append({
                    "Pizza_id":i,
                    "Pizza_type_id":ptid["Pizza_type_id"],
                    "Size": s1,
                    "Price": random.randint(150,600)
        })
        i+=1
print(p_data)

[{'Pizza_id': 1, 'Pizza_type_id': 1, 'Size': 'Small', 'Price': 497}, {'Pizza_id': 2, 'Pizza_type_id': 1, 'Size': 'Medium', 'Price': 480}, {'Pizza_id': 3, 'Pizza_type_id': 1, 'Size': 'Large', 'Price': 274}, {'Pizza_id': 4, 'Pizza_type_id': 2, 'Size': 'Small', 'Price': 358}, {'Pizza_id': 5, 'Pizza_type_id': 2, 'Size': 'Medium', 'Price': 171}, {'Pizza_id': 6, 'Pizza_type_id': 2, 'Size': 'Large', 'Price': 461}, {'Pizza_id': 7, 'Pizza_type_id': 3, 'Size': 'Small', 'Price': 384}, {'Pizza_id': 8, 'Pizza_type_id': 3, 'Size': 'Medium', 'Price': 274}, {'Pizza_id': 9, 'Pizza_type_id': 3, 'Size': 'Large', 'Price': 500}, {'Pizza_id': 10, 'Pizza_type_id': 4, 'Size': 'Small', 'Price': 563}, {'Pizza_id': 11, 'Pizza_type_id': 4, 'Size': 'Medium', 'Price': 345}, {'Pizza_id': 12, 'Pizza_type_id': 4, 'Size': 'Large', 'Price': 412}, {'Pizza_id': 13, 'Pizza_type_id': 5, 'Size': 'Small', 'Price': 205}, {'Pizza_id': 14, 'Pizza_type_id': 5, 'Size': 'Medium', 'Price': 150}, {'Pizza_id': 15, 'Pizza_type_id': 5, 

In [102]:
# Convert the generated data into data frame
pizza=pd.DataFrame(p_data)

In [103]:
# Check top five rows of the dataset
pizza.head()

Unnamed: 0,Pizza_id,Pizza_type_id,Size,Price
0,1,1,Small,497
1,2,1,Medium,480
2,3,1,Large,274
3,4,2,Small,358
4,5,2,Medium,171


# Generating data for Orders Table

In [105]:
# Initialize a variable
fake=Faker()

In [107]:
# Define the start and end dates
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

# Generate a list to hold the data
data = []

# Generate the data
current_date = start_date
while current_date <= end_date:
    # Generate random time within the current date between 10 AM and 10 PM
    for _ in range(12):  # Generate 12 time entries per day
        time = fake.date_time_between_dates(
            datetime_start=current_date.replace(hour=10, minute=0, second=0),
            datetime_end=current_date.replace(hour=22, minute=0, second=0)
        ).strftime("%H:%M:%S")
        data.append([current_date.strftime("%d-%m-%Y"), time])
    current_date += timedelta(days=1)
print(data)

[['01-01-2023', '10:32:05'], ['01-01-2023', '15:46:00'], ['01-01-2023', '10:57:05'], ['01-01-2023', '11:41:37'], ['01-01-2023', '10:50:42'], ['01-01-2023', '16:29:37'], ['01-01-2023', '15:38:12'], ['01-01-2023', '21:12:48'], ['01-01-2023', '14:21:24'], ['01-01-2023', '15:23:55'], ['01-01-2023', '17:15:31'], ['01-01-2023', '10:27:14'], ['02-01-2023', '19:43:25'], ['02-01-2023', '19:41:33'], ['02-01-2023', '17:11:48'], ['02-01-2023', '16:46:33'], ['02-01-2023', '15:58:33'], ['02-01-2023', '12:25:11'], ['02-01-2023', '11:38:33'], ['02-01-2023', '14:52:38'], ['02-01-2023', '11:01:49'], ['02-01-2023', '20:10:02'], ['02-01-2023', '17:44:42'], ['02-01-2023', '20:25:14'], ['03-01-2023', '20:38:07'], ['03-01-2023', '13:44:34'], ['03-01-2023', '13:58:34'], ['03-01-2023', '14:36:43'], ['03-01-2023', '16:33:22'], ['03-01-2023', '14:00:57'], ['03-01-2023', '10:41:17'], ['03-01-2023', '15:21:56'], ['03-01-2023', '18:07:43'], ['03-01-2023', '10:45:33'], ['03-01-2023', '11:25:20'], ['03-01-2023', '20:

In [108]:
# Create a dictionary to store times for each date
time_by_date = {}
for date, time in data:
    if date not in time_by_date:
        time_by_date[date] = []
    time_by_date[date].append(time)

# Sort times for each date
for date in time_by_date:
    time_by_date[date].sort()

# Reconstruct the ordered list of dates and times
ordered_data = []
for date in sorted(time_by_date.keys(), key=lambda d: datetime.strptime(d, "%d-%m-%Y")):
    for time in time_by_date[date]:
        ordered_data.append([date, time])
print(ordered_data)

[['01-01-2023', '10:27:14'], ['01-01-2023', '10:32:05'], ['01-01-2023', '10:50:42'], ['01-01-2023', '10:57:05'], ['01-01-2023', '11:41:37'], ['01-01-2023', '14:21:24'], ['01-01-2023', '15:23:55'], ['01-01-2023', '15:38:12'], ['01-01-2023', '15:46:00'], ['01-01-2023', '16:29:37'], ['01-01-2023', '17:15:31'], ['01-01-2023', '21:12:48'], ['02-01-2023', '11:01:49'], ['02-01-2023', '11:38:33'], ['02-01-2023', '12:25:11'], ['02-01-2023', '14:52:38'], ['02-01-2023', '15:58:33'], ['02-01-2023', '16:46:33'], ['02-01-2023', '17:11:48'], ['02-01-2023', '17:44:42'], ['02-01-2023', '19:41:33'], ['02-01-2023', '19:43:25'], ['02-01-2023', '20:10:02'], ['02-01-2023', '20:25:14'], ['03-01-2023', '10:41:17'], ['03-01-2023', '10:45:33'], ['03-01-2023', '11:25:20'], ['03-01-2023', '13:44:34'], ['03-01-2023', '13:58:34'], ['03-01-2023', '14:00:57'], ['03-01-2023', '14:36:43'], ['03-01-2023', '15:21:56'], ['03-01-2023', '16:33:22'], ['03-01-2023', '18:07:43'], ['03-01-2023', '20:33:55'], ['03-01-2023', '20:

In [109]:
# Extract date and time and store in different variables
o_date=[]
o_time=[]
for ext in ordered_data:
    o_date.append(ext[0])
    o_time.append(ext[1])

In [160]:
# Generate data for Orders table
o_data=[] # Generate a list to hold the data
i=1
for date, time in zip(o_date,o_time):
    o_data.append({
        "Order_id":i,
        "Date":date,
        "Time":time
    })
    i+=1
print(o_data)

[{'Order_id': 1, 'Date': '01-01-2023', 'Time': '10:27:14'}, {'Order_id': 2, 'Date': '01-01-2023', 'Time': '10:32:05'}, {'Order_id': 3, 'Date': '01-01-2023', 'Time': '10:50:42'}, {'Order_id': 4, 'Date': '01-01-2023', 'Time': '10:57:05'}, {'Order_id': 5, 'Date': '01-01-2023', 'Time': '11:41:37'}, {'Order_id': 6, 'Date': '01-01-2023', 'Time': '14:21:24'}, {'Order_id': 7, 'Date': '01-01-2023', 'Time': '15:23:55'}, {'Order_id': 8, 'Date': '01-01-2023', 'Time': '15:38:12'}, {'Order_id': 9, 'Date': '01-01-2023', 'Time': '15:46:00'}, {'Order_id': 10, 'Date': '01-01-2023', 'Time': '16:29:37'}, {'Order_id': 11, 'Date': '01-01-2023', 'Time': '17:15:31'}, {'Order_id': 12, 'Date': '01-01-2023', 'Time': '21:12:48'}, {'Order_id': 13, 'Date': '02-01-2023', 'Time': '11:01:49'}, {'Order_id': 14, 'Date': '02-01-2023', 'Time': '11:38:33'}, {'Order_id': 15, 'Date': '02-01-2023', 'Time': '12:25:11'}, {'Order_id': 16, 'Date': '02-01-2023', 'Time': '14:52:38'}, {'Order_id': 17, 'Date': '02-01-2023', 'Time': '

In [161]:
# Convert the generated data into data frame
order_data=pd.DataFrame(o_data)

In [148]:
# Check top five rows of the dataset
order_data.head()

Unnamed: 0,Order_id,Date,Time
0,1,01-01-2023,10:27:14
1,2,01-01-2023,10:32:05
2,3,01-01-2023,10:50:42
3,4,01-01-2023,10:57:05
4,5,01-01-2023,11:41:37


# Generating data for Order details table

In [113]:
# Generate data for Order details table
orderdetails_data=[] # Generate a list to hold the data
j=1

for order_id,_ in enumerate(data,start=1):
    num_order_details=random.randint(1,20)
    for _ in range(num_order_details):
        piza=random.choice(p_data)

        orderdetails_data.append({
            "Orderdetails_id":j,
            "Order_id":order_id,
            "Pizza_id":piza["Pizza_id"],
            "Quantity":random.randint(1,5)
        })
        j+=1

In [116]:
orderdetails_data

[{'Orderdetails_id': 1, 'Order_id': 1, 'Pizza_id': 406, 'Quantity': 3},
 {'Orderdetails_id': 2, 'Order_id': 1, 'Pizza_id': 672, 'Quantity': 1},
 {'Orderdetails_id': 3, 'Order_id': 1, 'Pizza_id': 615, 'Quantity': 3},
 {'Orderdetails_id': 4, 'Order_id': 2, 'Pizza_id': 553, 'Quantity': 5},
 {'Orderdetails_id': 5, 'Order_id': 2, 'Pizza_id': 35, 'Quantity': 4},
 {'Orderdetails_id': 6, 'Order_id': 2, 'Pizza_id': 44, 'Quantity': 3},
 {'Orderdetails_id': 7, 'Order_id': 2, 'Pizza_id': 638, 'Quantity': 2},
 {'Orderdetails_id': 8, 'Order_id': 3, 'Pizza_id': 413, 'Quantity': 4},
 {'Orderdetails_id': 9, 'Order_id': 3, 'Pizza_id': 731, 'Quantity': 3},
 {'Orderdetails_id': 10, 'Order_id': 3, 'Pizza_id': 377, 'Quantity': 1},
 {'Orderdetails_id': 11, 'Order_id': 3, 'Pizza_id': 491, 'Quantity': 3},
 {'Orderdetails_id': 12, 'Order_id': 3, 'Pizza_id': 95, 'Quantity': 3},
 {'Orderdetails_id': 13, 'Order_id': 3, 'Pizza_id': 666, 'Quantity': 2},
 {'Orderdetails_id': 14, 'Order_id': 3, 'Pizza_id': 612, 'Quant

In [117]:
# Convert the generated data into data frame
od_data=pd.DataFrame(orderdetails_data)

In [118]:
# Check top five rows of the dataset
od_data.head()

Unnamed: 0,Orderdetails_id,Order_id,Pizza_id,Quantity
0,1,1,406,3
1,2,1,672,1
2,3,1,615,3
3,4,2,553,5
4,5,2,35,4


# Perform cleaning on generated datasets

In [120]:
# Cleaning the pizza type dataset
p_type.head() # Check for top five rows

Unnamed: 0,Pizza_type_id,Name,Category,Description
0,1,Blazing Onion & Paprika,Veg,Hot & spicy pizza with onion & red paprika top...
1,2,Blazing Onion & Paprika,Nonveg,Hot & spicy pizza with onion & red paprika top...
2,3,Fiery Sausage & Paprika,Veg,Spiciest non veg pizza with spicy & herby chic...
3,4,Fiery Sausage & Paprika,Nonveg,Spiciest non veg pizza with spicy & herby chic...
4,5,Peppy Paneer,Veg,"Flavorful trio of juicy paneer, crisp capsicum..."


In [121]:
# Check for data types
p_type.dtypes

Pizza_type_id     int64
Name             object
Category         object
Description      object
dtype: object

In [124]:
# Check missing values in the dataset
p_type.isnull().sum()

Pizza_type_id    0
Name             0
Category         0
Description      0
dtype: int64

In [132]:
# Check size of dataset (rows, columns)
p_type.shape

(244, 4)

In [173]:
# Convert the dataframe into csv file
p_type.to_csv('pizza_types.csv',index=False)

In [None]:
# Summarize Findings for Pizza type table:
# 1. The dataset contains information about Pizza types including Pizza type id, name, category and 
# description.
# 2. The dataset has 244 rows and 04 columns.
# 3. There are no missing values in dataset.
# 4. Checked data types of all the columns and observed that it is already as per the requirement.

In [133]:
# Cleaning the pizzas dataset
pizza.head() # Check for top five rows

Unnamed: 0,Pizza_id,Pizza_type_id,Size,Price
0,1,1,Small,497
1,2,1,Medium,480
2,3,1,Large,274
3,4,2,Small,358
4,5,2,Medium,171


In [134]:
# Check for data types
pizza.dtypes

Pizza_id          int64
Pizza_type_id     int64
Size             object
Price             int64
dtype: object

In [135]:
# Check missing values in the dataset
pizza.isnull().sum()

Pizza_id         0
Pizza_type_id    0
Size             0
Price            0
dtype: int64

In [136]:
# Check size of dataset (rows, columns)
pizza.shape

(732, 4)

In [174]:
# Convert the dataframe into csv file
pizza.to_csv("Pizzas.csv",index=False)

In [None]:
# Summarize Findings for Pizza table:
# 1. The dataset contains information about Pizzas including Pizza id, Pizza type id, size, and price.
# 2. The dataset has 732 rows and 04 columns.
# 3. There are no missing values in dataset.
# 4. Checked data types of all the columns and observed that it is already as per the requirement.

In [162]:
# Cleaning the pizza orders dataset
order_data.head() # Check top five rows of the dataset

Unnamed: 0,Order_id,Date,Time
0,1,01-01-2023,10:27:14
1,2,01-01-2023,10:32:05
2,3,01-01-2023,10:50:42
3,4,01-01-2023,10:57:05
4,5,01-01-2023,11:41:37


In [163]:
# Check for data types
order_data.dtypes

Order_id     int64
Date        object
Time        object
dtype: object

In [164]:
# Date column Data type conversion
order_data['Date']=pd.to_datetime(order_data['Date'],format='%d-%m-%Y',errors='coerce')
print(order_data['Date'])

0      2023-01-01
1      2023-01-01
2      2023-01-01
3      2023-01-01
4      2023-01-01
          ...    
4375   2023-12-31
4376   2023-12-31
4377   2023-12-31
4378   2023-12-31
4379   2023-12-31
Name: Date, Length: 4380, dtype: datetime64[ns]


In [166]:
# Check for data types after conversion
order_data.dtypes

Order_id             int64
Date        datetime64[ns]
Time                object
dtype: object

In [167]:
# Check missing values in the dataset
order_data.isnull().sum()

Order_id    0
Date        0
Time        0
dtype: int64

In [168]:
# Check size of dataset (rows, columns)
order_data.shape

(4380, 3)

In [175]:
# Convert the dataframe into csv file
order_data.to_csv("Orders.csv", index=False)

In [None]:
# Summarize Findings for Orders table:
# 1. The dataset contains information about Pizza orders including order id, date and time.
# 2. The dataset has 4380 rows and 03 columns.
# 3. There are no missing values in dataset.
# 4. Checked data types of all the columns and converted date column data type to datetime.

In [169]:
# Cleaning the order details dataset
od_data.head() # Check top five rows of the dataset

Unnamed: 0,Orderdetails_id,Order_id,Pizza_id,Quantity
0,1,1,406,3
1,2,1,672,1
2,3,1,615,3
3,4,2,553,5
4,5,2,35,4


In [170]:
# Check for data types
od_data.dtypes

Orderdetails_id    int64
Order_id           int64
Pizza_id           int64
Quantity           int64
dtype: object

In [171]:
# Check missing values in the dataset
od_data.isnull().sum()

Orderdetails_id    0
Order_id           0
Pizza_id           0
Quantity           0
dtype: int64

In [172]:
# Check size of dataset (rows, columns)
od_data.shape

(46325, 4)

In [176]:
# Convert the dataframe into csv file
od_data.to_csv("Order_details.csv",index=False)

In [None]:
# Summarize Findings for Order details table:
# 1. The dataset contains information about Pizza order details including orderdetails id, order id, 
# pizza id and Quanity.
# 2. The dataset has 46325 rows and 04 columns.
# 3. There are no missing values in dataset.
# 4. Checked data types of all the columns and observed that it is already as per the requirement.