In [1]:
import pandas as pd
import scipy.stats as st
import math as m
import statsmodels.stats.api as sm
import numpy as np

In [2]:
# Read the pizza-restaurant orders from the CSV in the working directory,
# then preview the first five rows to sanity-check the parse.
df = pd.read_csv(filepath_or_buffer="pizza_restaurant.csv")
df.head(n=5)

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4
2,Calzone,Cheese_and_Garlic,3,20,31.5,7
3,Margherita,Cheese,4,23,20.8,7
4,Calzone,Cheese_and_Garlic,4,19,27.7,8


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before
count,1000.0,1000.0,1000.0,1000.0
mean,3.965,19.342,25.0611,5.659
std,1.021185,3.345479,2.490397,2.459831
min,1.0,12.0,17.8,0.0
25%,3.0,17.0,23.3,4.0
50%,4.0,19.0,25.1,5.0
75%,5.0,21.0,26.7,7.0
max,7.0,33.0,32.4,15.0


In [4]:
# Print each column's dtype and non-null count (a quick check for missing
# values), along with the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Product Name                          1000 non-null   object 
 1   Crust                                 1000 non-null   object 
 2   Toppings                              1000 non-null   int64  
 3   Price                                 1000 non-null   int64  
 4   Delivery Time                         1000 non-null   float64
 5   # pizzas the customer ordered before  1000 non-null   int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 47.0+ KB


## Standard Error of the Sample Mean

In [5]:
# Standard error of the mean for the Price column, computed two ways.
# Both print statements should show the same number.
price = df["Price"]

# By hand: sample standard deviation divided by sqrt(n).
# pandas' .std() uses ddof=1 by default.
se_by_hand = price.std() / m.sqrt(price.count())
print(se_by_hand)

# With scipy: st.sem also defaults to ddof=1, so it agrees with the manual value.
se_scipy = st.sem(price)
print(se_scipy)

0.105793327900337
0.105793327900337


## Standardization and z-score

In [8]:
# z-score the 'Delivery Time' column by hand: z = (x - mean) / std.
# Series.std() defaults to the sample standard deviation (ddof=1).
delivery_time = df['Delivery Time']
centered = delivery_time - delivery_time.mean()
df['delivery_time_standardized'] = centered / delivery_time.std()
df['delivery_time_standardized'].head()

0    0.497471
1    1.099784
2    2.585491
3   -1.711012
4    1.059630
Name: delivery_time_standardized, dtype: float64

In [10]:
# z-score 'Delivery Time' again, this time with scikit-learn.
# preprocessing.scale divides by the population standard deviation (ddof=0),
# whereas pandas' .std() uses ddof=1, so these values are larger than those in
# 'delivery_time_standardized' by a factor of sqrt(n / (n - 1)).
from sklearn import preprocessing

sklearn_scaled = preprocessing.scale(df['Delivery Time'])
df['delivery_time_standardized2'] = sklearn_scaled
df['delivery_time_standardized2'].head()

0    0.497720
1    1.100335
2    2.586785
3   -1.711868
4    1.060160
Name: delivery_time_standardized2, dtype: float64

In [11]:
# Re-run the summary now that both standardized columns exist.
# describe() reports the sample std (ddof=1), which is why the sklearn-scaled
# column shows a std slightly above 1 rather than exactly 1.
summary_with_zscores = df.describe()
summary_with_zscores

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before,delivery_time_standardized,delivery_time_standardized2
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3.965,19.342,25.0611,5.659,1.385558e-16,1.421085e-16
std,1.021185,3.345479,2.490397,2.459831,1.0,1.0005
min,1.0,12.0,17.8,0.0,-2.915639,-2.917098
25%,3.0,17.0,23.3,4.0,-0.7071562,-0.7075101
50%,4.0,19.0,25.1,5.0,0.01562,0.01562781
75%,5.0,21.0,26.7,7.0,0.6580878,0.6584171
max,7.0,33.0,32.4,15.0,2.946879,2.948354
