# Confidence intervals

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

In [None]:
# Load the diamonds dataset from a CSV file in the working directory.
csv_path = "diamonds.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Descriptive summary statistics of the dataset's columns.
df.describe()

In [None]:
# Histogram of the diamond prices.
sns.histplot(data=df, x="price")
plt.show()

In [None]:
# Ingredients for a confidence interval around the mean diamond price.
conf = 0.95
prices = df["price"]
n = prices.count()  # number of diamonds with a recorded price
xbar, s = prices.mean(), prices.std()
SEM = s / np.sqrt(n)  # standard error of the mean

In [None]:
# Normal-approximation confidence interval centred on the sample mean,
# with the standard error of the mean as its scale.
lower, upper = stats.norm.interval(confidence=conf, loc=xbar, scale=SEM)
interval = (lower, upper)
interval

In [None]:
# Report the interval. The percentage is derived from `conf` so the message
# stays in sync with the confidence level actually used in the calculation.
print(f"With {conf * 100:g}% confidence, the true mean diamond price is between", interval[0], "and", interval[1])

# One sample t test

In [None]:
# Goal: test whether "Premium" cut diamonds have an average price above $4500.
# Start by listing the distinct values of `cut` - 5 different options.
cut_values = df["cut"].unique()
print(cut_values)

In [None]:
# Average price within each cut grade.
price_by_cut = df.groupby("cut")["price"]
price_by_cut.mean()

In [None]:
# One-sample t-test on the prices of "Premium" cut diamonds.
#   H0: mean Premium price == 4500
#   H1: mean Premium price  > 4500  (one-sided, matching the question
#       "is the average price above $4500?")
alpha = 0.05  # 5% significance level
premium_prices = df[df["cut"] == "Premium"]["price"]
test_results = stats.ttest_1samp(premium_prices, popmean=4500, alternative="greater")
print(test_results)

In [None]:
# Decision rule at the 5% level: index 1 of the result holds the p-value.
p_value = test_results[1]
verdict = (
    "Failed to reject the null with p-value"
    if p_value > 0.05
    else "Rejected the null with p-value"
)
print(verdict, p_value)

# Two sample t test

In [None]:
# Bar chart of the mean price per cut, ordered from lowest to highest mean.
mean_price_by_cut = df.groupby("cut")["price"].mean()
mean_price_by_cut.sort_values().plot.bar()
plt.show()

In [None]:
# Numeric values behind the comparison: mean price for each cut.
df.groupby(by="cut")["price"].mean()

In [None]:
# Compare the price distribution of each cut with horizontal box plots.
sns.boxplot(data=df, x="price", y="cut", palette="terrain")
plt.show()  # render explicitly, as the other plotting cells do, instead of echoing the Axes repr

In [None]:
# Two-sample t-test: do "Very Good" and "Good" cut diamonds differ in mean price?
is_very_good = df["cut"] == "Very Good"
is_good = df["cut"] == "Good"
very_good_df = df[is_very_good]
good_df = df[is_good]
test_results = stats.ttest_ind(
    very_good_df["price"],
    good_df["price"],
)

In [None]:
# Show the result object returned by the two-sample t-test.
print(test_results)

In [None]:
# Compare the p-value (index 1 of the result) with the 5% threshold.
p_value = test_results[1]
if not p_value > 0.05:
    print("Rejected the null with p-value", p_value)
else:
    print("Failed to reject the null with p-value", p_value)

# Simulation

In [None]:
# Draw one sample of size n from Uniform(0, 0.1), summarise it and plot it.
n = 1000
sample = np.random.uniform(low=0, high=0.1, size=n)
xbar = sample.mean()
# ddof=1 gives the sample standard deviation (n - 1 denominator). NumPy's
# default, ddof=0, is the population formula and would not match the pandas
# .std() used for the diamond prices.
s = sample.std(ddof=1)
print("Sample mean:", xbar)
print("Sample st. dev.:", s)
sns.histplot(sample)
plt.show()

In [None]:
# Draw one sample from Uniform(0, 0.1) and build a confidence interval for
# the mean of the distribution it came from.
n = 1000
sample = np.random.uniform(low=0, high=0.1, size=n)
xbar = sample.mean()
# ddof=1 gives the sample standard deviation (n - 1 denominator); NumPy's
# default, ddof=0, is the population formula.
s = sample.std(ddof=1)
conf = 0.95
SEM = s / np.sqrt(n)  # standard error of the mean
print("Sample mean:", xbar)
print("Sample st. dev.:", s)
interval = stats.norm.interval(confidence=conf, loc=xbar, scale=SEM)
# The sample is simulated uniform data, not diamond prices, so the message
# refers to the true mean only; the percentage is taken from `conf`.
print(f"With {conf * 100:g}% confidence, the true mean is between", interval[0], "and", interval[1])
sns.histplot(sample)
plt.show()

In [None]:
# Ask LLM to repeat scenario:
# Draw 100 independent samples from Uniform(0, 0.1), build a confidence
# interval for the mean from each, and count how many intervals contain the
# value 0.05 (the midpoint of the sampling range).
n = 1000
conf = 0.95
contains_0_05 = 0
does_not_contain_0_05 = 0

for _ in range(100):
    sample = np.random.uniform(low=0, high=0.1, size=n)
    xbar = sample.mean()
    # ddof=1: sample standard deviation (n - 1 denominator) rather than
    # NumPy's default population formula.
    s = sample.std(ddof=1)
    SEM = s / np.sqrt(n)  # standard error of the mean
    interval = stats.norm.interval(confidence=conf, loc=xbar, scale=SEM)

    if interval[0] <= 0.05 <= interval[1]:
        contains_0_05 += 1
    else:
        does_not_contain_0_05 += 1

print("Number of intervals containing 0.05:", contains_0_05)
print("Number of intervals not containing 0.05:", does_not_contain_0_05)

## Sampling from the normal distribution

In [None]:
# Draw 100 independent samples of size n from a normal distribution with
# mean 3932 and standard deviation 750, and print the confidence interval
# computed from each sample to show how the interval varies between samples.
n = 1000
conf = 0.95

for _ in range(100):
    sample = np.random.normal(loc=3932, scale=750, size=n)
    xbar = sample.mean()
    # ddof=1: sample standard deviation (n - 1 denominator) rather than
    # NumPy's default population formula.
    s = sample.std(ddof=1)
    SEM = s / np.sqrt(n)  # standard error of the mean
    interval = stats.norm.interval(confidence=conf, loc=xbar, scale=SEM)
    print(interval)

In [None]:
# Coverage check for the normal case: draw 100 independent samples from
# Normal(mu, sigma), build a confidence interval from each, and count how
# many intervals contain the mean mu that the samples were generated with.
n = 1000
conf = 0.95
mu = 3932    # mean used to generate the samples
sigma = 750  # standard deviation used to generate the samples
contains_mu = 0
does_not_contain_mu = 0

for _ in range(100):
    sample = np.random.normal(loc=mu, scale=sigma, size=n)
    xbar = sample.mean()
    # ddof=1: sample standard deviation (n - 1 denominator) rather than
    # NumPy's default population formula.
    s = sample.std(ddof=1)
    SEM = s / np.sqrt(n)  # standard error of the mean
    interval = stats.norm.interval(confidence=conf, loc=xbar, scale=SEM)

    if interval[0] <= mu <= interval[1]:
        contains_mu += 1
    else:
        does_not_contain_mu += 1

print(f"Number of intervals containing {mu}:", contains_mu)
print(f"Number of intervals not containing {mu}:", does_not_contain_mu)