In [18]:
import pandas as pd
import scipy.stats as st
import math as m
import statsmodels.stats.api as sm
import numpy as np

In [19]:
df = pd.read_csv("pizza_restaurant.csv")
df.head()

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4
2,Calzone,Cheese_and_Garlic,3,20,31.5,7
3,Margherita,Cheese,4,23,20.8,7
4,Calzone,Cheese_and_Garlic,4,19,27.7,8


In [20]:
df.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before
count,1000.0,1000.0,1000.0,1000.0
mean,3.965,19.342,25.0611,5.659
std,1.021185,3.345479,2.490397,2.459831
min,1.0,12.0,17.8,0.0
25%,3.0,17.0,23.3,4.0
50%,4.0,19.0,25.1,5.0
75%,5.0,21.0,26.7,7.0
max,7.0,33.0,32.4,15.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Product Name                          1000 non-null   object 
 1   Crust                                 1000 non-null   object 
 2   Toppings                              1000 non-null   int64  
 3   Price                                 1000 non-null   int64  
 4   Delivery Time                         1000 non-null   float64
 5   # pizzas the customer ordered before  1000 non-null   int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 47.0+ KB


## Standard Error of the Sample Mean

In [22]:
# Calculate the standard error of the Price column

# using the formula SD / sqrt(n)
print(df.Price.std() / m.sqrt(df.Price.count()))
# using scipy's standard error function
print(st.sem(df.Price))

0.105793327900337
0.105793327900337


## Standardization and z-score

In [23]:
# Standardize the 'Delivery Time' column
# calculated as (value - mean) / standard deviation
df['delivery_time_standardized'] = (df['Delivery Time'] - df['Delivery Time'].mean()) / df['Delivery Time'].std()
df['delivery_time_standardized'].head()

0    0.497471
1    1.099784
2    2.585491
3   -1.711012
4    1.059630
Name: delivery_time_standardized, dtype: float64

In [24]:
# Standardize the 'Delivery Time' column using Sklearn
from sklearn import preprocessing

df['delivery_time_standardized2'] = preprocessing.scale(df['Delivery Time'])
df['delivery_time_standardized2'].head()

0    0.497720
1    1.100335
2    2.586785
3   -1.711868
4    1.060160
Name: delivery_time_standardized2, dtype: float64

In [25]:
df.head()

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before,delivery_time_standardized,delivery_time_standardized2
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4,0.497471,0.49772
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4,1.099784,1.100335
2,Calzone,Cheese_and_Garlic,3,20,31.5,7,2.585491,2.586785
3,Margherita,Cheese,4,23,20.8,7,-1.711012,-1.711868
4,Calzone,Cheese_and_Garlic,4,19,27.7,8,1.05963,1.06016


## Confidence Level

In [26]:
# Confidence levels
cl = [0.005, 0.025, 0.05, 0.95, 0.975, 0.995]

# Print the Z-scores for the given confidence levels
for alpha in cl:
  print(f"The normal distribution value for {alpha} is {round(st.norm.ppf(alpha),2)}")

The normal distribution value for 0.005 is -2.58
The normal distribution value for 0.025 is -1.96
The normal distribution value for 0.05 is -1.64
The normal distribution value for 0.95 is 1.64
The normal distribution value for 0.975 is 1.96
The normal distribution value for 0.995 is 2.58


In [27]:
# Confidence Interval for the Price mean: mean +- z * SE

# Print the mean of the Price column
print(f"The mean is {df.Price.mean()}")
# Calculate and print the 95% confidence interval for the mean
print(st.norm.interval(confidence = 0.95,
                 loc = df.Price.mean(),
                 scale = st.sem(df.Price)))

The mean is 19.342
(np.float64(19.134648887510703), np.float64(19.549351112489294))


In [28]:
def calculate_confidence_intervals(df, confidence=0.95):
    intervals = {}

    # Iterate over each numerical column in the dataframe
    for column in df.select_dtypes(include=[np.number]).columns:
        data = df[column].dropna()
        # Check if the sample size is greater than 30
        if len(data) > 30:
            mean = np.mean(data)
            std_dev = np.std(data, ddof=1)  # use ddof=1 to match stats.sem() behavior
            # Compute the confidence interval for the mean
            interval = st.norm.interval(confidence, loc=mean, scale=std_dev/np.sqrt(len(data)))
            intervals[column] = interval

    return intervals # Return the dictionary with confidence intervals


# Use the function
print(calculate_confidence_intervals(df))

{'Toppings': (np.float64(3.9017074909279676), np.float64(4.028292509072032)), 'Price': (np.float64(19.134648887510703), np.float64(19.549351112489294)), 'Delivery Time': (np.float64(24.9067464105456), np.float64(25.2154535894544)), '# pizzas the customer ordered before': (np.float64(5.5065408812039385), np.float64(5.811459118796061)), 'delivery_time_standardized': (np.float64(-0.061979503230456014), np.float64(0.06197950323045629)), 'delivery_time_standardized2': (np.float64(-0.06201051624377051), np.float64(0.06201051624377079))}


In [30]:
# Take a sample from the data and look at the summary statistics
sample = df.sample(20)
sample.describe()

# Calculate the confidence interval for the mean of 'Price' using scipy.stats.t

# Print the mean of the 'Price' column
print(f"The mean is {sample.Price.mean()}")
# Compute the confidence interval using the t-distribution
st.t.interval(confidence = 0.95,
              df = len(sample) - 1,
              loc = sample.Price.mean(),
              scale = st.sem(sample.Price))

The mean is 19.0


(np.float64(17.926301314891813), np.float64(20.073698685108187))

In [31]:
# Exercise: adapt the function to calculate the Confidence Interval
# When the sample size is less than 30, using chatGPT
def calculate_confidence_intervals(df, confidence=0.95):
    intervals = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        data = df[column].dropna()
        mean = np.mean(data)
        std_dev = np.std(data, ddof=1)  # use ddof=1 to calculate sample std deviation
        if len(data) > 30:
            # For larger samples, use the normal distribution
            interval = st.norm.interval(confidence, loc=mean, scale=std_dev/np.sqrt(len(data)))
        else:
            # For smaller samples, use the t-distribution
            interval = st.t.interval(confidence, len(data)-1, loc=mean, scale=std_dev/np.sqrt(len(data)))
        intervals[column] = interval

    return intervals

# Usage: calculate confidence intervals with 90% confidence
print(calculate_confidence_intervals(df, confidence = 0.9))


{'Toppings': (np.float64(3.9118832519744378), np.float64(4.018116748025562)), 'Price': (np.float64(19.167985460895864), np.float64(19.516014539104134)), 'Delivery Time': (np.float64(24.931562384283747), np.float64(25.190637615716252)), '# pizzas the customer ordered before': (np.float64(5.531052274178713), np.float64(5.786947725821286)), 'delivery_time_standardized': (np.float64(-0.05201483878755562), np.float64(0.052014838787555875)), 'delivery_time_standardized2': (np.float64(-0.05204086572878282), np.float64(0.052040865728783074))}
