# Libraries and Data

In [1]:
# Switch the working directory so that the relative CSV path used when loading
# the data resolves against this folder.
# NOTE(review): hard-coded absolute path — presumably a Google Drive folder
# mounted in Colab; confirm and adjust when running anywhere else.
%cd /content/drive/MyDrive/Statistics with Python/Inferential Statistics/Confidence Intervals

/content/drive/MyDrive/Statistics with Python/Inferential Statistics/Confidence Intervals


In [23]:
# Libraries
import pandas as pd
import scipy.stats as st
import math as m
import statsmodels.stats.api as sm
import numpy as np

In [3]:
# Load the data
# The relative path is resolved against the current working directory,
# so the notebook must be run from the folder that contains the CSV.
df = pd.read_csv("pizza_restaurant.csv")
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4
2,Calzone,Cheese_and_Garlic,3,20,31.5,7
3,Margherita,Cheese,4,23,20.8,7
4,Calzone,Cheese_and_Garlic,4,19,27.7,8


In [4]:
# Summary Statistics
# Count, mean, standard deviation, min, quartiles and max; the output below
# lists only the numeric columns.
df.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before
count,1000.0,1000.0,1000.0,1000.0
mean,3.965,19.342,25.0611,5.659
std,1.021185,3.345479,2.490397,2.459831
min,1.0,12.0,17.8,0.0
25%,3.0,17.0,23.3,4.0
50%,4.0,19.0,25.1,5.0
75%,5.0,21.0,26.7,7.0
max,7.0,33.0,32.4,15.0


In [5]:
# DataFrame info: column names, non-null counts, dtypes and memory usage.
# Useful here to confirm there are no missing values before computing intervals.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Product Name                          1000 non-null   object 
 1   Crust                                 1000 non-null   object 
 2   Toppings                              1000 non-null   int64  
 3   Price                                 1000 non-null   int64  
 4   Delivery Time                         1000 non-null   float64
 5   # pizzas the customer ordered before  1000 non-null   int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 47.0+ KB


# Standard Error of the Sample Mean

In [6]:
# Standard error of the mean for Price, obtained two ways so the results can
# be compared side by side.
price_col = df["Price"]
# 1) By hand: standard deviation divided by sqrt(n)
print(price_col.std() / m.sqrt(price_col.count()))
# 2) With scipy's built-in helper
print(st.sem(price_col))

0.105793327900337
0.105793327900337


# Standardization and z-score

In [7]:
# Standardize Delivery Time by hand: z = (x - mean) / standard deviation,
# where the standard deviation is whatever pandas' .std() returns by default.
delivery_time = df['Delivery Time']
df['delivery_time_standardized'] = (
    delivery_time
    .sub(delivery_time.mean())
    .div(delivery_time.std())
)

In [8]:
# Using scikit-learn
# NOTE(review): this import sits mid-notebook instead of in the imports cell
# at the top, so the dependency is easy to miss on a fresh kernel.
from sklearn import preprocessing
# Standardize the same column with scikit-learn so it can be compared with the
# manual version.
# NOTE(review): scale() presumably divides by the population standard
# deviation (ddof=0) rather than the sample one — confirm; that would account
# for small differences between the two standardized columns.
df['delivery_time_standardized2'] = preprocessing.scale(df['Delivery Time'])

In [9]:
# Look at the data again: the two standardized columns should show a mean of
# about 0 and a standard deviation of about 1.
df.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before,delivery_time_standardized,delivery_time_standardized2
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3.965,19.342,25.0611,5.659,1.385558e-16,1.421085e-16
std,1.021185,3.345479,2.490397,2.459831,1.0,1.0005
min,1.0,12.0,17.8,0.0,-2.915639,-2.917098
25%,3.0,17.0,23.3,4.0,-0.7071562,-0.7075101
50%,4.0,19.0,25.1,5.0,0.01562,0.01562781
75%,5.0,21.0,26.7,7.0,0.6580878,0.6584171
max,7.0,33.0,32.4,15.0,2.946879,2.948354


In [10]:
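# The two standardized columns differ very slightly because the two approaches
# use different denominators for the standard deviation: pandas' .std() divides
# by n - 1 (sample standard deviation), while sklearn's preprocessing.scale()
# divides by n (population standard deviation).
# With n = 1000 the ratio is sqrt(1000 / 999) ≈ 1.0005, which is exactly the std
# reported for delivery_time_standardized2. The differences are irrelevant here.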

# Confidence Level

In [18]:
# Cumulative probabilities whose standard-normal quantiles are the critical
# z-values for two-sided 99%, 95% and 90% confidence levels
# (lower tails first, then the matching upper tails).
# Note: despite the loop variable's name, each value is passed to ppf() as a
# cumulative probability, i.e. ppf(p) returns z such that P(Z <= z) = p.
cl = [0.005, 0.025, 0.05, 0.95, 0.975, 0.995]
for alpha in cl:
  print(f"The normal distribution value for {alpha} is {round(st.norm.ppf(alpha),2)}")

The normal distribution value for 0.005 is -2.58
The normal distribution value for 0.025 is -1.96
The normal distribution value for 0.05 is -1.64
The normal distribution value for 0.95 is 1.64
The normal distribution value for 0.975 is 1.96
The normal distribution value for 0.995 is 2.58


# Confidence Intervals for Large Samples

In [22]:
# Confidence Interval for the Price mean: mean +- z * SE
# Large sample, so the normal distribution supplies the critical value and
# st.sem() supplies the standard error of the mean.
print(f"The mean is {df.Price.mean()}")
print(st.norm.interval(confidence = 0.95,
                       loc = df.Price.mean(),
                       scale = st.sem(df.Price)))  # closing paren of print() was missing

The mean is 19.342
(19.134648887510703, 19.549351112489294)


In [25]:
# Using chatGPT, create a function that for each numerical variable that
# has a sample size bigger than 30, computes the confidence interval for the mean
def calculate_confidence_intervals(df, confidence=0.95):
    """Compute a normal-based confidence interval for the mean of each numeric column.

    Only columns with more than 30 non-missing values are processed; smaller
    columns are left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are ignored and missing values are
        dropped column by column.
    confidence : float, default 0.95
        Confidence level passed to ``scipy.stats.norm.interval``.

    Returns
    -------
    dict
        Maps each qualifying column name to a ``(lower, upper)`` tuple.
    """
    intervals = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        data = df[column].dropna()
        n_obs = len(data)
        if n_obs <= 30:
            # Too few observations for the large-sample approximation.
            continue

        mean = np.mean(data)
        # ddof=1 gives the sample standard deviation, matching st.sem().
        std_error = np.std(data, ddof=1) / np.sqrt(n_obs)

        if std_error == 0:
            # A constant column has no sampling variability. scipy yields
            # (nan, nan) for scale=0, so return the degenerate interval instead.
            intervals[column] = (mean, mean)
        else:
            intervals[column] = st.norm.interval(confidence, loc=mean, scale=std_error)

    return intervals

# usage
# Called with the default 95% confidence level; prints the resulting
# {column name: (lower, upper)} dictionary.
print(calculate_confidence_intervals(df))


{'Toppings': (3.9017074909279676, 4.028292509072032), 'Price': (19.134648887510703, 19.549351112489294), 'Delivery Time': (24.9067464105456, 25.2154535894544), '# pizzas the customer ordered before': (5.5065408812039385, 5.811459118796061), 'delivery_time_standardized': (-0.061979503230456014, 0.06197950323045629), 'delivery_time_standardized2': (-0.06201051624377051, 0.06201051624377079)}


# Confidence Intervals for Small Samples

In [26]:
# Take a sample of 20 rows from the data.
# random_state pins the pseudo-random draw so that Restart & Run All always
# selects the same rows; without it every run produces a different sample and
# different small-sample intervals further down.
sample = df.sample(20, random_state=42)
sample.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before,delivery_time_standardized,delivery_time_standardized2
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,3.95,18.55,25.435,5.1,0.150137,0.150212
std,1.099043,2.543826,2.599043,2.174009,1.043626,1.044148
min,2.0,14.0,21.3,2.0,-1.510241,-1.510997
25%,3.0,16.75,23.8,3.75,-0.506385,-0.506638
50%,4.0,19.0,24.9,5.0,-0.064688,-0.064721
75%,5.0,20.0,26.6,6.0,0.617934,0.618243
max,6.0,23.0,30.2,11.0,2.063486,2.064519


In [28]:
# With the scipy.stats function: the sample is small, so the interval is built
# from Student's t distribution with n - 1 degrees of freedom.
# Note: the `df` keyword below is scipy's degrees-of-freedom argument, not the DataFrame.
sample_price = sample["Price"]
print(f"The mean is {sample.Price.mean()}")
st.t.interval(
    confidence=0.95,
    df=sample.shape[0] - 1,
    loc=sample_price.mean(),
    scale=st.sem(sample_price),
)

The mean is 18.55


(17.359452608569253, 19.74054739143075)

In [30]:
# Exercise: adapt the function to calculate the Confidence Interval
# when the sample size is less than 30, using chatGPT
def calculate_confidence_intervals(df, confidence=0.95):
    """Compute a confidence interval for the mean of each numeric column.

    Columns with more than 30 non-missing values use the normal distribution;
    columns with 30 or fewer use Student's t with ``n - 1`` degrees of freedom.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are ignored and missing values are
        dropped column by column.
    confidence : float, default 0.95
        Confidence level passed to the scipy ``interval`` methods.

    Returns
    -------
    dict
        Maps every numeric column name to a ``(lower, upper)`` tuple. Columns
        with fewer than two observations map to ``(nan, nan)`` because a
        sample standard deviation cannot be computed for them.
    """
    intervals = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        data = df[column].dropna()
        n_obs = len(data)

        if n_obs < 2:
            # No spread can be estimated from zero or one observation; state
            # the undefined result explicitly instead of pushing NaNs through scipy.
            intervals[column] = (np.nan, np.nan)
            continue

        mean = np.mean(data)
        # ddof=1 gives the sample standard deviation.
        std_error = np.std(data, ddof=1) / np.sqrt(n_obs)

        if std_error == 0:
            # A constant column has no sampling variability. scipy yields
            # (nan, nan) for scale=0, so return the degenerate interval instead.
            interval = (mean, mean)
        elif n_obs > 30:
            interval = st.norm.interval(confidence, loc=mean, scale=std_error)
        else:
            interval = st.t.interval(confidence, n_obs - 1, loc=mean, scale=std_error)
        intervals[column] = interval

    return intervals

# usage
# Same call at a 90% confidence level. The info() output earlier shows 1000
# non-null rows per column, so every column takes the large-sample (normal) branch.
print(calculate_confidence_intervals(df, confidence = 0.9))


{'Toppings': (3.9118832519744378, 4.018116748025562), 'Price': (19.167985460895864, 19.516014539104134), 'Delivery Time': (24.931562384283747, 25.190637615716252), '# pizzas the customer ordered before': (5.531052274178713, 5.786947725821286), 'delivery_time_standardized': (-0.05201483878755562, 0.052014838787555875), 'delivery_time_standardized2': (-0.05204086572878282, 0.052040865728783074)}
