In [None]:
''' 1: Using this dataset, calculate the probability that **exactly 5 defective items** will be produced on a new day. Use Python to:

- Generate the data  
-  Calculate the mean and standard deviation  
-  Compute the probability using the normal distribution (with continuity correction)
'''

In [16]:
from numpy.random import randint as ri
import pandas as pd
from scipy.stats import norm

In [2]:
defects = ri(0, 21, 1000)  #generate data
defects = pd.Series(defects)

In [3]:
defects

0       2
1       7
2      11
3       6
4      17
       ..
995     0
996     0
997     2
998     5
999     4
Length: 1000, dtype: int32

In [14]:
mean = defects.mean() # mean and standard deviation
std = defects.std()

In [11]:
mean


9.785

In [12]:
std

6.194929551831252

In [18]:
#Compute the probability using the normal distribution (with continuity correction)
#p=5 then p(4.5<X<5.5)
lower_bound=4.5
upper_bound=5.5

probability = norm.cdf(upper_bound, loc=mean, scale=std) - norm.cdf(lower_bound, loc=mean, scale=std)
probability

0.04776733519390991

In [None]:
''' Problem – Testing the Claim About Delivery Time

A food delivery company claims that its average delivery time is **30 minutes**. Based on historical data, the **population standard deviation** is known to be **4 minutes**.

To evaluate this claim, a consumer rights group decides to test the null hypothesis that the average delivery time is **at most 30 minutes**. They observe a sample of **40 deliveries**, and the average delivery time for the sample comes out to be **31.2 minutes**.

### Objective:

Test the null hypothesis using the z-test.  
- **Null Hypothesis (H₀): μ ≤ 30** (Average delivery time is 30 minutes or less)  
- **Alternative Hypothesis (H₁): μ > 30** (Average delivery time is more than 30 minutes) '''

In [19]:
import numpy as np
from scipy.stats import norm

In [20]:
# Known values
population_mean = 30        # Claimed average delivery time
sample_mean = 31.2          # Observed sample mean
std_dev = 4                 # Known population standard deviation
n = 40                      # Sample size

In [32]:
#it is a right tail test where alternate hypothesis is greater than 30**
# finding the standard error 
std_error = std_dev / np.sqrt(n)
std_error

0.6324555320336759

In [24]:
#finding the z score distribution 
score = (sample_mean - population_mean)/std_error 
score


1.8973665961010264

In [26]:
# finding the probability using cdf
probability = norm.cdf(score)
probability

0.9711102144382013

In [27]:
# p value
p_value = 1 - 0.97
p_value

0.030000000000000027

In [28]:
alpha_value = 0.05


In [31]:
if p_value > alpha_value:
    print('the average delivery time will be greater than 30 min')
else:
    print('the average delivery time will be 30min or less than 30min')

the average delivery time will be 30min or less than 30min


In [None]:
''' ## Problem – Fitness Program Impact Analysis

A health and wellness company is evaluating the impact of its **6-week fitness training program**. They collect performance data (in terms of fitness scores out of 100) from participants **before and after** the program.

You are provided with a dataset of **150 participants**, with the following information:
- **Initial Score** (before the program)
- **Final Score** (after the program)
- **Gender** of the participant (0 = Female, 1 = Male)

### Your Task:

Using the dataset provided below, perform the following statistical tests:

1. **One-Sample t-Test**  
   Test whether the **average initial fitness score** is at least **65**.

2. **Two-Sample Independent t-Test**  
   Compare the **initial fitness scores of male and female participants** to check if there's a significant difference.

3. **Paired Sample t-Test**  
   Test whether the **final scores are significantly higher than the initial scores**, i.e., whether the fitness program had a measurable impact.

   '''

In [33]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(100)

# Sample size
n = 150

# Gender (0 = Female, 1 = Male)
gender = np.random.choice([0, 1], size=n)

# Initial scores (mean slightly < 65 to create realistic test)
initial_scores = np.random.normal(loc=64, scale=6, size=n)

# Final scores (showing average improvement)
final_scores = initial_scores + np.random.normal(loc=5, scale=3, size=n)

# Create DataFrame
df = pd.DataFrame({
    'Gender': gender,
    'Initial_Score': initial_scores,
    'Final_Score': final_scores
})

df.head()


Unnamed: 0,Gender,Initial_Score,Final_Score
0,0,73.167718,76.049901
1,0,67.883235,75.156484
2,1,59.93598,65.727168
3,1,62.409887,68.352951
4,1,68.476639,70.330144


In [50]:
### 1. One-Sample T-Test
### Null Hypothesis** H₀: μ ≥ 65 (Average initial score is at least 65)

import scipy.stats as st
from scipy.stats import ttest_1samp, ttest_ind, ttest_rel


In [47]:
t_stat, p_value = ttest_1samp(df['Initial_Score'], 65)

In [45]:
# one tailed (left tailed)
p_value_one_tailed = p_value / 2

In [48]:
if (t_stat < 0) and (p_value_one_tailed < 0.05):
    print("  Reject H₀: Average initial score is significantly less than 65")
else:
    print("  Fail to reject H₀: Not enough evidence that average score is less than 65")

  Reject H₀: Average initial score is significantly less than 65


In [51]:
### 2. two sample T-Test
### Null Hypothesis H₀: μ₁ = μ₂ (No difference in average initial scores between males and females)
### Alternate Hypothesis H₁: μ₁ ≠ μ₂ (There is a difference in average initial scores)
# Split data
male_scores = df[df['Gender'] == 1]['Initial_Score']
female_scores = df[df['Gender'] == 0]['Initial_Score']

# Two-sample independent t-test
t_stat, p_value = ttest_ind(male_scores, female_scores, equal_var=False)


if p_value < 0.05:
    print("  Reject H₀: Significant difference between male and female initial scores")
else:
    print("  Fail to reject H₀: No significant difference between male and female scores")


  Fail to reject H₀: No significant difference between male and female scores


In [52]:
### Paired Sample t-Test:
### **Null Hypothesis** H₀: μ_diff = 0 (No change in scores before and after the program)
### **Alternate Hypothesis** H₁: μ_diff < 0 (Final scores are higher than initial scores)
# Paired t-test (Initial vs Final)
t_stat, p_value = ttest_rel(df['Initial_Score'], df['Final_Score'])

# One-tailed test: testing if Final > Initial (i.e., Initial - Final < 0)
p_value_one_tailed = p_value / 2

if (t_stat < 0) and (p_value_one_tailed < 0.05):
    print("Reject H₀: Final scores are significantly higher — the program had an impact")
else:
    print("Fail to reject H₀: No significant improvement observed")


Reject H₀: Final scores are significantly higher — the program had an impact


In [None]:
''' ##  Problem – ANOVA Analysis of Customer Satisfaction Across Store Branches

A retail company wants to analyze whether the **average customer satisfaction scores** vary significantly across its three store branches: **Branch A, Branch B, and Branch C**.

You are provided with data containing:
- **Customer_ID**
- **Branch** (Categorical Variable)
- **Satisfaction_Score** (Continuous Variable on a scale from 0 to 500)

---

###  Objective:
Use **One-Way ANOVA** to test the following hypotheses:

- **H₀ (Null Hypothesis)**: The average satisfaction scores across all three branches are **equal**.
- **H₁ (Alternative Hypothesis)**: At least one branch has a **different average** satisfaction score.
''' 

In [62]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
import statsmodels.stats.anova as anova
import statsmodels.formula.api as smf
# Set seed for reproducibility
np.random.seed(42)

# Sample size per branch
n = 70

# Create satisfaction scores for three branches
branch_a = np.random.normal(loc=420, scale=30, size=n)
branch_b = np.random.normal(loc=400, scale=35, size=n)
branch_c = np.random.normal(loc=430, scale=25, size=n)

# Combine into a DataFrame
data = pd.DataFrame({
    'Customer_ID': range(1, n*3 + 1),
    'Branch': ['A'] * n + ['B'] * n + ['C'] * n,
    'Satisfaction_Score': np.concatenate([branch_a, branch_b, branch_c])
})

data.head()

Unnamed: 0,Customer_ID,Branch,Satisfaction_Score
0,1,A,434.901425
1,2,A,415.852071
2,3,A,439.430656
3,4,A,465.690896
4,5,A,412.975399


In [60]:
# Fit the OLS model
model = smf.ols('Satisfaction_Score ~ Branch', data=data).fit()


In [63]:
# Type 1 ANOVA table
anova_table = anova.anova_lm(model, typ=1)
print(anova_table)


             df         sum_sq       mean_sq          F        PR(>F)
Branch      2.0   42018.567984  21009.283992  24.799904  2.212591e-10
Residual  207.0  175360.431682    847.151844        NaN           NaN


In [None]:
''' ## Problem – Evaluate Forecast Accuracy Using the Chi-Square Goodness of Fit Test

The city’s public transportation authority uses a forecasting model to estimate the number of metro passengers for each day of the week. These forecasts help manage train schedules, staffing, and platform operations.

Recently, actual passenger counts were collected and compared to the forecasted values to evaluate how well the model performs.

---

### Question

You are provided with the forecasted and observed number of passengers (in thousands) for each day of a week:

- **Forecasted Values (Expected):**  
  `[95, 110, 100, 130, 160, 210, 230]`

- **Observed Values (Actual):**  
  `[90, 105, 98, 135, 165, 205, 225]`

Using a **Chi-Square Goodness of Fit Test**, determine whether the forecast model provides an accurate estimate of daily passenger traffic.

---

### Hypotheses

- **Null Hypothesis (H₀):** There is no significant difference between the forecasted and observed values (i.e., the model is accurate).
- **Alternative Hypothesis (H₁):** There is a significant difference between the forecasted and observed values (i.e., the model is inaccurate).
''''

In [64]:
import numpy as np
from scipy.stats import chi2

# Data
expected = np.array([95, 110, 100, 130, 160, 210, 230])
observed = np.array([90, 105, 98, 135, 165, 205, 225])


In [65]:
chi_square_stat = np.sum((observed - expected) ** 2 / expected)
chi_square_stat

1.1067315855387938

In [67]:
##degree of freedom
df = len(expected) - 1
df

6

In [69]:
## critical value 
alpha = 0.10


In [71]:
critical_value = chi2.ppf(1 - alpha, df)
critical_value

10.644640675668422

In [72]:
if chi_square_stat > critical_value:
    print("Reject the null hypothesis: The forecasting model is inaccurate.")
else:
    print("Fail to reject the null hypothesis: The forecasting model is reasonably accurate.")

Fail to reject the null hypothesis: The forecasting model is reasonably accurate.


In [None]:
''' ## Problem – Manual Covariance Calculation Between Study Hours and Exam Scores

A school counselor wants to understand how strongly the number of hours a student studies is related to their exam score.

She collected the following data:

| Student | Hours_Studied | Exam_Score |
|---------|---------------|------------|
| A       | 2             | 65         |
| B       | 4             | 70         |
| C       | 6             | 75         |
| D       | 8             | 85         |
| E       | 10            | 95         |

---

###  Objective

Manually compute the **covariance** between `Hours_Studied` and `Exam_Score` **without using built-in functions** like `.cov()` or NumPy methods.
'''

In [73]:
hours = [2, 4, 6, 8, 10]
scores = [65, 70, 75, 85, 95]
n = len(hours)

# Step 1: Calculate means
mean_hours = sum(hours) / n
mean_scores = sum(scores) / n

# Step 2: Compute covariance manually
cov = 0
for i in range(n):
    cov += (hours[i] - mean_hours) * (scores[i] - mean_scores)

covariance = cov / (n - 1)
covariance

37.5

In [None]:
''' ##  Problem – Manual Correlation Calculation Between Exercise Hours and Stress Level

A health researcher is analyzing the relationship between how many hours a person exercises per week and their reported stress level (on a scale of 1–100, where higher is more stress).

She collects data from 5 participants:

| Person | Exercise_Hours | Stress_Level |
|--------|----------------|--------------|
| A      | 1              | 85           |
| B      | 3              | 75           |
| C      | 5              | 60           |
| D      | 7              | 55           |
| E      | 9              | 40           |

---

###  Objective

Manually compute the **Pearson correlation coefficient** between `Exercise_Hours` and `Stress_Level` without using built-in correlation functions.
'''

In [74]:
# Data
exercise = [1, 3, 5, 7, 9]
stress = [85, 75, 60, 55, 40]
n = len(exercise)

In [75]:
# Step 1: Calculate means
mean_exercise = sum(exercise) / n
mean_stress = sum(stress) / n

In [76]:
# Step 2: Calculate numerator and denominators
numerator = 0
denominator_x = 0
denominator_y = 0

for i in range(n):
    x_diff = exercise[i] - mean_exercise
    y_diff = stress[i] - mean_stress
    numerator += x_diff * y_diff
    denominator_x += x_diff ** 2
    denominator_y += y_diff ** 2

# Step 3: Correlation
correlation = numerator / (denominator_x**0.5 * denominator_y**0.5)

In [77]:
correlation

-0.9918365981341756