In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import statsmodels.stats.weightstats as sm

In [3]:
def p_value_reader(p_value, alpha):
    if p_value <= alpha:
        print("Reject Null Hypothesis")
    else:
        print("Fail to reject Null Hypothesis")

In [4]:
df_main = pd.read_csv("tesla_main.csv")
df_main.head()

Unnamed: 0,Production Date,Defects Found,Cars Produced,Weather Condition,Workers on Shift
0,2023-01-01,3,55,Rainy,20
1,2023-01-02,2,57,Rainy,19
2,2023-01-03,1,54,Rainy,21
3,2023-01-04,0,56,Rainy,22
4,2023-01-05,2,59,Rainy,20


In [5]:
# Population mean
mean_pop = 54

# Population standard deviation
sd_pop = 2

# Confidence level
confidence = 0.95

# Significance level (alpha)
alpha = 1 - confidence

# Calculating the sample mean from the 'Cars Produced' column in df_main
mean_sample = df_main['Cars Produced'].mean()
print(f"The sample mean is {mean_sample}")

# Calculating the sample size from the 'Cars Produced' column in df_main
sample_size = df_main['Cars Produced'].count()
print(f"The sample size is {sample_size}")

The sample mean is 55.10909090909091
The sample size is 55


In [6]:
# Calculate the Z-score using the formula: (sample mean - population mean) / (population standard deviation / sqrt(sample size))
z_score = (mean_sample - mean_pop) / (sd_pop / np.sqrt(sample_size))
print(f"The Z-score is {z_score}")

The Z-score is 4.112619161025777


In [7]:
# Calculate the p-value from the Z-score (two-tailed test)
tails = 2
p_value = st.norm.sf(abs(z_score)) * tails
print(f"The p-value is {p_value}")

The p-value is 3.9119543361101206e-05


In [10]:
p_value_reader(p_value, alpha)

Reject Null Hypothesis


In [11]:
# Build a function to compute the z-test
def ztest(mean_pop, mean_sample, sample_size, sd_pop, alpha, tails):
    # Calculate the Z-score using the formula: (sample mean - population mean) / (population standard deviation / sqrt(sample size))
    z_score = (mean_sample - mean_pop) / (sd_pop / np.sqrt(sample_size))
    print(f"The Z-score is {z_score}")

    # Calculate the p-value from the Z-score (two-tailed test)
    p_value = st.norm.sf(abs(z_score)) * tails
    print(f"The p-value is {p_value}")

    # Interpret the p-value against the alpha threshold
    p_value_reader(p_value, alpha)

In [12]:
# Apply the function
ztest(mean_pop, mean_sample, sample_size, sd_pop, alpha, tails)

The Z-score is 4.112619161025777
The p-value is 3.9119543361101206e-05
Reject Null Hypothesis


In [13]:
# Target mean for the population
target_mean = 2.2

# Calculate the sample mean from the 'Defects Found' column in df_main
mean_sample = df_main['Defects Found'].mean()
print(f"The sample mean is {mean_sample}")

# Calculate the sample size from the 'Defects Found' column in df_main
sample_size = df_main['Defects Found'].count()
print(f"The sample size is {sample_size}")

# Confidence level
confidence = 0.95

# Significance level (alpha)
alpha = 1 - confidence

# Calculate the sample standard deviation from the 'Defects Found' column in df_main
sample_sd = df_main['Defects Found'].std()
print(f"The SD is {sample_sd}")

The sample mean is 2.3636363636363638
The sample size is 55
The SD is 1.0777829844714388


In [14]:
# Calculate the T-score using the formula: (sample mean - target mean) / (sample standard deviation / sqrt(sample size))
t_score = (mean_sample - target_mean) / (sample_sd / np.sqrt(sample_size))
print(f"The T-score is {t_score}")

The T-score is 1.1259778359082033


In [15]:
# Calculate the p-value from the T-score (two-tailed test) using the Student's t-distribution
tails = 2
p_value = st.t.sf(abs(t_score), df=(sample_size - 1)) * tails
print(f"The p-value is {p_value}")

The p-value is 0.2651542493629725


In [18]:
# How to do the 2-tailed test with unknown population variance
t_score, p_value = st.ttest_1samp(a=df_main['Defects Found'],
                                  popmean=target_mean,
                                  alternative='two-sided')

# Print the T-score and p-value
print(f"T-score: {t_score}")
print(f"p-value: {p_value}")

# Interpret the p-value against the alpha threshold
p_value_reader(p_value, alpha)


T-score: 1.1259778359082033
p-value: 0.2651542493629725
Fail to reject Null Hypothesis


In [19]:
# Load data
# Tesla daily production data (month 1 & month 2)
df_paired = pd.read_csv("tesla_paired.csv")
df_paired.head()

Unnamed: 0,Day,Month 1,Month 2
0,1,58,54
1,2,54,56
2,3,57,55
3,4,55,53
4,5,55,52


In [21]:
# Calculate the differences between 'Month 2' and 'Month 1' columns
differences = df_paired['Month 2'] - df_paired['Month 1']

# Calculate the mean of the differences
mean_difference = differences.mean()

# Calculate the standard deviation of the differences
sd_difference = differences.std()

# Calculate the sample size (number of differences)
sample_size = differences.count()

# Print the mean difference
print(f"The mean difference is {mean_difference}")

The mean difference is -1.1


In [22]:
# Degrees of freedom for the paired t-test
dof = sample_size - 1

# Number of tails for the test (two-tailed test)
tails = 2

# Confidence level
confidence = 0.95

# Significance level (alpha)
alpha = 1 - confidence

# Calculate the T-score using the formula: (mean difference) / (standard deviation of differences / sqrt(sample size))
t_score = mean_difference / (sd_difference / np.sqrt(sample_size))
print(f"The T-score is {t_score}")

The T-score is -3.1708738954340316


In [23]:
# Calculate the p-value from the T-score (two-tailed test) using the Student's t-distribution
p_value = st.t.sf(abs(t_score), df=dof) * tails
print(f"The p-value is {p_value}")

# Interpret the p-value against the alpha threshold
p_value_reader(p_value, alpha)

The p-value is 0.0035743342552951936
Reject Null Hypothesis


In [1]:
# Perform a paired t-test with 2 tails
t_score, p_value = st.ttest_rel(df_paired['Month 1'],
                                df_paired['Month 2'],
                                alternative='two-sided')

# Print the T-score and p-value
print(f"T-score: {t_score}")
print(f"p-value: {p_value}")

# Interpret the p-value against the alpha threshold
p_value_reader(p_value, alpha)

NameError: name 'st' is not defined

In [25]:
# Load data
# Daily productivity data (Shift 1 & Shift 2)
df_2sample = pd.read_csv("tesla_2sample.csv")
df_2sample.head()

Unnamed: 0,Day,Shift 1,Shift 2
0,1,53,49.0
1,2,61,57.0
2,3,72,68.0
3,4,59,47.0
4,5,62,60.0


In [26]:
df_2sample.describe()

Unnamed: 0,Day,Shift 1,Shift 2
count,30.0,30.0,29.0
mean,15.5,61.166667,55.0
std,8.803408,6.664799,8.647873
min,1.0,51.0,42.0
25%,8.25,55.25,48.0
50%,15.5,61.0,57.0
75%,22.75,66.75,62.0
max,30.0,72.0,72.0


In [27]:
# Isolate the samples for each shift and drop any nan values
sample1 = df_2sample['Shift 1'].dropna()
sample2 = df_2sample['Shift 2'].dropna()

In [28]:
stat, p_value = st.levene(sample1, sample2)
print(f"p-value: {p_value}")

p-value: 0.04468272196687188


In [30]:
# Interpret the p-value
alpha = 0.05

if p_value < alpha:
    print("Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test")
else:
    print("Fail to reject the Null Hypothesis. Variances are equal. Perform 2-sample T-test")

Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test


In [31]:
# Perform Welch's t-test (a variant of the t-test for unequal variances)
t_statist, p_value = st.ttest_ind(sample1,
                                  sample2,
                                  equal_var=False,
                                  alternative='two-sided')

# Print the p-value
print(f"The p-value is {p_value}")

# Interpret the p-value against the alpha threshold
p_value_reader(p_value, alpha)


The p-value is 0.0034724013986656174
Reject Null Hypothesis


In [32]:
# Exercise -
# Build a function that performs 2 sample Test
# based on the outcome of Levene's test

def test_2sample(sample1, sample2, alpha, alternative):
  # Perform Levene's test to check for equality of variances
  stat, p_value = st.levene(sample1, sample2)

  # Interpret the result of Levene's test
  if p_value < alpha:
    equal_var = False
    print("Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test")
  else:
    equal_var = True
    print("Fail to reject the Null Hypothesis. Variances are equal. Perform 2-sample T-test")

  # Perform the two-sample t-test with the appropriate assumption about variances
  t_statist, p_value = st.ttest_ind(sample1,
                                    sample2,
                                    equal_var = equal_var,
                                    alternative = alternative)

  # Print the p-value of the t-test
  print(f"The p-value is {p_value}")

  # Interpret the p-value against the alpha threshold
  p_value_reader(p_value, alpha)

In [33]:
# Apply the function
test_2sample(sample1, sample2, 0.05, 'two-sided')

Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test
The p-value is 0.0034724013986656174
Reject Null Hypothesis


In [34]:
# Data
# Tesla production data
df_main.head()

Unnamed: 0,Production Date,Defects Found,Cars Produced,Weather Condition,Workers on Shift
0,2023-01-01,3,55,Rainy,20
1,2023-01-02,2,57,Rainy,19
2,2023-01-03,1,54,Rainy,21
3,2023-01-04,0,56,Rainy,22
4,2023-01-05,2,59,Rainy,20


In [35]:
# Population mean
mean_pop = 54.5

# Population standard deviation
sd_pop = 2

# Confidence level
confidence = 0.95

# Significance level (alpha)
alpha = 1 - confidence

# Calculate the sample mean from the 'Cars Produced' column in df_main
mean_sample = df_main['Cars Produced'].mean()
print(f"The sample mean is {mean_sample}")

# Calculate the sample size from the 'Cars Produced' column in df_main
sample_size = df_main['Cars Produced'].count()
print(f"The sample size is {sample_size}")

The sample mean is 55.10909090909091
The sample size is 55


In [36]:
# Appy the ztest function that we created
ztest(mean_pop, mean_sample, sample_size, sd_pop, alpha, 1)

The Z-score is 2.258569539251862
The p-value is 0.011955087194577932
Reject Null Hypothesis


In [37]:
# How to do the 1-tailed t-test with unknown population variance
t_score, p_value = st.ttest_1samp(a = df_main['Defects Found'],
                                  popmean = target_mean,
                                  alternative = 'greater')

# Print the T-score and p-value
print(f"T-score: {t_score}")
print(f"p-value: {p_value}")

# Interpret the p-value against the alpha threshold
p_value_reader(p_value, alpha)

T-score: 1.1259778359082033
p-value: 0.13257712468148625
Fail to reject Null Hypothesis


In [38]:
# Perform a one-tailed paired t-test with the alternative hypothesis 'greater'
t_score, p_value = st.ttest_rel(df_paired['Month 1'],
                                df_paired['Month 2'],
                                alternative='greater')

# Print the T-score and p-value
print(f"T-score: {t_score}")
print(f"p-value: {p_value}")

# Interpret the p-value against the alpha threshold
p_value_reader(p_value, alpha)

T-score: 3.170873895434031
p-value: 0.0017871671276475996
Reject Null Hypothesis


In [39]:
# Daily productivity data (Shift 1 & Shift 2)
# Summary statistics
df_2sample.describe()

Unnamed: 0,Day,Shift 1,Shift 2
count,30.0,30.0,29.0
mean,15.5,61.166667,55.0
std,8.803408,6.664799,8.647873
min,1.0,51.0,42.0
25%,8.25,55.25,48.0
50%,15.5,61.0,57.0
75%,22.75,66.75,62.0
max,30.0,72.0,72.0


In [44]:
# Isolate the samples of each samples and drop any nan values
sample1 = df_2sample['Shift 1'].dropna()
sample2 = df_2sample['Shift 2'].dropna()

In [45]:
# Apply the 2 sample test function we created
test_2sample(sample2, sample1, 0.05, 'less')

Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test
The p-value is 0.0017362006993328087
Reject Null Hypothesis


In [46]:
# Load the data
# Defective cars by factory and type
df_chisquare = pd.read_csv("tesla_chisquare.csv")
df_chisquare.head()

Unnamed: 0.1,Unnamed: 0,Day,Factory,Category,Count
0,0,1,Factory A,Sedan,48
1,1,2,Factory A,Sedan,38
2,2,3,Factory A,Sedan,24
3,3,4,Factory A,Sedan,17
4,4,5,Factory A,Sedan,30


In [49]:
# Build a contingency table for the chi-square test
contingency_table = pd.crosstab(index = df_chisquare['Factory'],
                                columns = df_chisquare['Category'],
                                values = df_chisquare['Count'],
                                aggfunc = "sum")
contingency_table

Category,SUV,Sedan,Truck
Factory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Factory A,992,1051,1040
Factory B,1009,1097,993


In [50]:
# Perform the chi-square
_, pvalue, _, _= st.chi2_contingency(observed = contingency_table)
p_value

np.float64(0.0017871671276475996)

In [51]:
# Display and interpret the pvalue
print(f"The p-value is {p_value}")
p_value_reader(p_value, alpha)

The p-value is 0.0017871671276475996
Reject Null Hypothesis
