# Database creation and Normalization of [Pizza Restaurant Sales @ kaggle.com](https://www.kaggle.com/datasets/shilongzhuang/pizza-sales)  
## Performed by [danicoder](twitter.com/chusk2)

In [1]:
import pandas as pd

In [2]:
df = pd.read_excel('pizza_sales.xlsx')

In [3]:
df.head()

Unnamed: 0,order_details_id,order_id,pizza_id,quantity,order_date,order_time,unit_price,total_price,pizza_size,pizza_category,pizza_ingredients,pizza_name
0,1,1,hawaiian_m,1,2015-01-01,11:38:36,13.25,13.25,M,Classic,"Sliced Ham, Pineapple, Mozzarella Cheese",The Hawaiian Pizza
1,2,2,classic_dlx_m,1,2015-01-01,11:57:40,16.0,16.0,M,Classic,"Pepperoni, Mushrooms, Red Onions, Red Peppers,...",The Classic Deluxe Pizza
2,3,2,five_cheese_l,1,2015-01-01,11:57:40,18.5,18.5,L,Veggie,"Mozzarella Cheese, Provolone Cheese, Smoked Go...",The Five Cheese Pizza
3,4,2,ital_supr_l,1,2015-01-01,11:57:40,20.75,20.75,L,Supreme,"Calabrese Salami, Capocollo, Tomatoes, Red Oni...",The Italian Supreme Pizza
4,5,2,mexicana_m,1,2015-01-01,11:57:40,16.0,16.0,M,Veggie,"Tomatoes, Red Peppers, Jalapeno Peppers, Red O...",The Mexicana Pizza


## Modify pizza names

In [4]:
df.pizza_name.unique()

array(['The Hawaiian Pizza', 'The Classic Deluxe Pizza',
       'The Five Cheese Pizza', 'The Italian Supreme Pizza',
       'The Mexicana Pizza', 'The Thai Chicken Pizza',
       'The Prosciutto and Arugula Pizza', 'The Barbecue Chicken Pizza',
       'The Greek Pizza', 'The Spinach Supreme Pizza',
       'The Green Garden Pizza', 'The Italian Capocollo Pizza',
       'The Spicy Italian Pizza', 'The Spinach Pesto Pizza',
       'The Vegetables + Vegetables Pizza', 'The Southwest Chicken Pizza',
       'The California Chicken Pizza', 'The Pepperoni Pizza',
       'The Chicken Pesto Pizza', 'The Big Meat Pizza',
       'The Soppressata Pizza', 'The Four Cheese Pizza',
       'The Napolitana Pizza', 'The Calabrese Pizza',
       'The Italian Vegetables Pizza', 'The Mediterranean Pizza',
       'The Pepper Salami Pizza', 'The Spinach and Feta Pizza',
       'The Sicilian Pizza', 'The Chicken Alfredo Pizza',
       'The Pepperoni, Mushroom, and Peppers Pizza',
       'The Brie Carre Pizza'

### Remove 'The ' and ' Pizza'

In [5]:
df.pizza_name = df.pizza_name.apply(lambda x : x[4:-6].lower() )

In [6]:
df.pizza_name.unique()

array(['hawaiian', 'classic deluxe', 'five cheese', 'italian supreme',
       'mexicana', 'thai chicken', 'prosciutto and arugula',
       'barbecue chicken', 'greek', 'spinach supreme', 'green garden',
       'italian capocollo', 'spicy italian', 'spinach pesto',
       'vegetables + vegetables', 'southwest chicken',
       'california chicken', 'pepperoni', 'chicken pesto', 'big meat',
       'soppressata', 'four cheese', 'napolitana', 'calabrese',
       'italian vegetables', 'mediterranean', 'pepper salami',
       'spinach and feta', 'sicilian', 'chicken alfredo',
       'pepperoni, mushroom, and peppers', 'brie carre'], dtype=object)

### Lowercase the pizza category

In [7]:
df.loc[:,'pizza_category'] = df.pizza_category.str.lower()

## pizza table

In [8]:
pizzas = df[['pizza_name', 'pizza_category']].drop_duplicates().reset_index(drop=True)

In [9]:
pizzas['pizza_id'] = [i + 1 for i in pizzas.index.to_list()]

In [10]:
pizzas.sample(3)

Unnamed: 0,pizza_name,pizza_category,pizza_id
28,sicilian,supreme,29
6,prosciutto and arugula,supreme,7
31,brie carre,supreme,32


### Rearrange the columns in the pizza table

In [11]:
pizzas.columns

Index(['pizza_name', 'pizza_category', 'pizza_id'], dtype='object')

In [12]:
pizzas = pizzas[['pizza_id', 'pizza_name', 'pizza_category']]

In [13]:
pizzas

Unnamed: 0,pizza_id,pizza_name,pizza_category
0,1,hawaiian,classic
1,2,classic deluxe,classic
2,3,five cheese,veggie
3,4,italian supreme,supreme
4,5,mexicana,veggie
5,6,thai chicken,chicken
6,7,prosciutto and arugula,supreme
7,8,barbecue chicken,chicken
8,9,greek,classic
9,10,spinach supreme,supreme


## pizza_ingredients relation table

In [14]:
pizza_ingredients = df[['pizza_name', 'pizza_ingredients']]

In [15]:
pizza_ingredients = pizza_ingredients.drop_duplicates().reset_index(drop=True)

In [16]:
pizza_ingredients.head()

Unnamed: 0,pizza_name,pizza_ingredients
0,hawaiian,"Sliced Ham, Pineapple, Mozzarella Cheese"
1,classic deluxe,"Pepperoni, Mushrooms, Red Onions, Red Peppers,..."
2,five cheese,"Mozzarella Cheese, Provolone Cheese, Smoked Go..."
3,italian supreme,"Calabrese Salami, Capocollo, Tomatoes, Red Oni..."
4,mexicana,"Tomatoes, Red Peppers, Jalapeno Peppers, Red O..."


### Assign every pizza_name to its pizza_id

In [17]:
pizza_ingredients['pizza_id'] = pizzas[pizzas.pizza_name == pizza_ingredients.pizza_name].pizza_id

### Drop the pizza_name column:

In [18]:
pizza_ingredients.drop('pizza_name', axis=1, inplace = True)

### Separate ingredients for each pizza name:

In [19]:
pizza_ingredients.pizza_ingredients = pizza_ingredients.pizza_ingredients.str.split(', ')

In [20]:
pizza_ingredients = pizza_ingredients.explode('pizza_ingredients')

In [21]:
pizza_ingredients['pizza_ingredients'] = pizza_ingredients.pizza_ingredients.str.lower()

### Rename columns:

In [22]:
pizza_ingredients.rename(columns = {'pizza_ingredients' : 'ingredient_name'}, inplace = True)

### Rearrange columns:

In [23]:
pizza_ingredients = pizza_ingredients[['pizza_id', 'ingredient_name']]

In [24]:
pizza_ingredients.head()

Unnamed: 0,pizza_id,ingredient_name
0,1,sliced ham
0,1,pineapple
0,1,mozzarella cheese
1,2,pepperoni
1,2,mushrooms


In [25]:
pizza_ingredients.sample(3)

Unnamed: 0,pizza_id,ingredient_name
19,20,chorizo sausage
21,22,gorgonzola piccante cheese
24,25,pesto sauce


## *ingredients* table

### Get the unique ingredients from the previous table

In [26]:
ingredients = pd.DataFrame({'ingredient_name' : pizza_ingredients.ingredient_name.unique() })

### Create an ingredient_id column

In [27]:
ingredients['ingredient_id'] = range(1, ingredients.shape[0] + 1)

### Rearrange columns

In [28]:
ingredients = ingredients[['ingredient_id', 'ingredient_name']]

In [29]:
ingredients.sample(3)

Unnamed: 0,ingredient_id,ingredient_name
38,39,pesto sauce
58,59,onions
28,29,kalamata olives


## pizza_ingredients.ingredient $\rightarrow$ pizza_ingredients.ingredient_id (using ingredients table)

In [30]:
pizza_ingredients['ingredient_id'] = [ ingredients[ingredients.ingredient_name == i].ingredient_id.values[0] for i in pizza_ingredients.ingredient_name]

### Remove the ingredient_name from the pizza_ingredients table

In [31]:
pizza_ingredients.drop('ingredient_name', axis=1, inplace = True)

## *pizza_prices* table

In [32]:
pizza_prices = df[['pizza_name', 'pizza_size', 'unit_price']].drop_duplicates().reset_index(drop=True)

In [33]:
pizza_prices['pizza_id'] = [pizzas[pizzas.pizza_name == i].pizza_id.values[0] for i in pizza_prices.pizza_name]

In [34]:
pizza_prices.sort_values('pizza_id')

Unnamed: 0,pizza_name,pizza_size,unit_price,pizza_id
0,hawaiian,M,13.25,1
51,hawaiian,S,10.50,1
60,hawaiian,L,16.50,1
1,classic deluxe,M,16.00,2
57,classic deluxe,L,20.50,2
...,...,...,...,...
53,chicken alfredo,M,16.75,30
50,"pepperoni, mushroom, and peppers",L,17.50,31
66,"pepperoni, mushroom, and peppers",M,14.50,31
61,"pepperoni, mushroom, and peppers",S,11.00,31


### Remove the *pizza_name* column and rearrange the columns

In [36]:
pizza_prices = pizza_prices[['pizza_id', 'pizza_size', 'unit_price']]

### Sort the *prices* values by *pizza_id* column

In [37]:
pizza_prices = pizza_prices.sort_values(['pizza_id', 'unit_price']).reset_index(drop=True)

In [38]:
pizza_prices

Unnamed: 0,pizza_id,pizza_size,unit_price
0,1,S,10.50
1,1,M,13.25
2,1,L,16.50
3,2,S,12.00
4,2,M,16.00
...,...,...,...
86,30,L,20.75
87,31,S,11.00
88,31,M,14.50
89,31,L,17.50


## *orders* table

In [39]:
orders = df[['order_details_id', 'order_id', 'pizza_id', 'quantity', 'order_date', 'order_time', 'pizza_name']]

In [40]:
orders.sample(3)

Unnamed: 0,order_details_id,order_id,pizza_id,quantity,order_date,order_time,pizza_name
22830,22831,10036,spicy_ital_m,1,2015-06-17,18:59:12,spicy italian
41157,41158,18129,five_cheese_l,1,2015-11-05,18:28:21,five cheese
17760,17761,7796,classic_dlx_m,1,2015-05-11,17:28:36,classic deluxe


### Get the size of the ordered pizza:

In [41]:
orders.loc[:, 'pizza_size'] = orders.pizza_id.apply(lambda x : x.split('_')[-1].upper() )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders.loc[:, 'pizza_size'] = orders.pizza_id.apply(lambda x : x.split('_')[-1].upper() )


### Change the string in *pizza_id* column by its *pizza_id* **numerical value**.

In [42]:
orders.loc[:, 'pizza_id'] = orders.pizza_name.apply(lambda x : pizzas[pizzas.pizza_name == x].pizza_id.values[0])

### Create timestamp column concatenating date and time:

In [43]:
orders['order_timestamp'] = orders.apply(lambda x : str(x['order_date'].date()) + ' ' + str(x['order_time']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders['order_timestamp'] = orders.apply(lambda x : str(x['order_date'].date()) + ' ' + str(x['order_time']), axis=1)


### Rearrange the columns and drop *order_date* and *order_time* columns:

In [44]:
orders = orders[['order_id', 'order_timestamp', 'pizza_id', 'pizza_size', 'quantity']]

In [45]:
orders.sample(3)

Unnamed: 0,order_id,order_timestamp,pizza_id,pizza_size,quantity
40030,17627,2015-10-28 11:45:05,2,M,1
8561,3749,2015-03-04 21:02:21,31,M,1
7207,3179,2015-02-23 12:41:52,25,M,1


In [46]:
orders.head()

Unnamed: 0,order_id,order_timestamp,pizza_id,pizza_size,quantity
0,1,2015-01-01 11:38:36,1,M,1
1,2,2015-01-01 11:57:40,2,M,1
2,2,2015-01-01 11:57:40,3,L,1
3,2,2015-01-01 11:57:40,4,L,1
4,2,2015-01-01 11:57:40,5,M,1


As we can see, there are repeating order_id and order_timestamp values. This is due to the fact that in the same order, several pizza categories and sizes were ordered. This fact invalidates both pizza_id and order_timestamp as primary keys.

## Export to csv files

In [47]:
pizzas.to_csv('pizzas.csv', index = False)

In [48]:
ingredients.to_csv('ingredients.csv', index = False)

In [49]:
pizza_ingredients.to_csv('pizza_ingredients.csv', index = False)

In [50]:
pizza_prices.to_csv('pizza_prices.csv', index = False)

In [51]:
orders.to_csv('orders.csv', index = False)

## Generate the copy commands to insert values from csv into tables using pslq

In [54]:
for t in ['pizzas', 'ingredients', 'pizza_ingredients', 'pizza_prices', 'orders'] :
    print(f"-- Insert values into table: {t} , from {t}.csv file\n")
    print(f"copy {t} from '/home/debian/data/pgdata/pizzeria/{t}.csv' with csv header ;\n") 

-- Insert values into table: pizzas , from pizzas.csv file

copy pizzas from '/home/debian/data/pgdata/pizzeria/pizzas.csv' with csv header ;

-- Insert values into table: ingredients , from ingredients.csv file

copy ingredients from '/home/debian/data/pgdata/pizzeria/ingredients.csv' with csv header ;

-- Insert values into table: pizza_ingredients , from pizza_ingredients.csv file

copy pizza_ingredients from '/home/debian/data/pgdata/pizzeria/pizza_ingredients.csv' with csv header ;

-- Insert values into table: pizza_prices , from pizza_prices.csv file

copy pizza_prices from '/home/debian/data/pgdata/pizzeria/pizza_prices.csv' with csv header ;

-- Insert values into table: orders , from orders.csv file

copy orders from '/home/debian/data/pgdata/pizzeria/orders.csv' with csv header ;

