In [1]:
# Importing necessary libraries
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Q1: Explain the assumptions required to use ANOVA and examples of violations.
# Assumptions:
# 1. Independence: Observations are independent of each other.
# 2. Normality: Data within each group should follow a normal distribution.
# 3. Homogeneity of variances: Variance across groups should be equal.
#
# Examples of Violations:
# - Independence: Sampling the same individuals multiple times without accounting for the repeated measures.
# - Normality: Skewed or bimodal distributions.
# - Homogeneity of variances: Group variances differ significantly.

In [2]:

# Q2: Three types of ANOVA and their usage.
# 1. One-way ANOVA: Compares means of three or more independent groups.
# 2. Two-way ANOVA: Examines the effect of two independent variables and their interaction.
# 3. Repeated measures ANOVA: Used when the same subjects are measured multiple times under different conditions.

In [3]:
# Q3: Partitioning of variance in ANOVA.
# - Total Sum of Squares (SST): Total variance in the data.
# - Explained Sum of Squares (SSE): Variance explained by the groups.
# - Residual Sum of Squares (SSR): Variance within groups (unexplained).
# Importance: Helps understand how much variation is due to group differences versus random error.

In [4]:
# Q4: Calculate SST, SSE, and SSR in a one-way ANOVA using Python.
# Example Data
np.random.seed(0)
data = {
    'Group': ['A'] * 10 + ['B'] * 10 + ['C'] * 10,
    'Value': np.random.normal(20, 5, 10).tolist() + \
             np.random.normal(22, 5, 10).tolist() + \
             np.random.normal(24, 5, 10).tolist()
}
df = pd.DataFrame(data)

# Calculate SST, SSE, SSR
grand_mean = df['Value'].mean()
SST = ((df['Value'] - grand_mean)**2).sum()
SSE = df.groupby('Group')['Value'].mean().apply(lambda x: (x - grand_mean)**2).sum() * len(df['Group'].unique())
SSR = SST - SSE
print("\nQ4: SST:", SST, ", SSE:", SSE, ", SSR:", SSR)


Q4: SST: 848.1492999393902 , SSE: 2.5795191522161005 , SSR: 845.5697807871741


In [6]:

# Q5: Calculate main effects and interaction effects in a two-way ANOVA using Python.
# Example Data
data = {
    'Software': ['A', 'B', 'C'] * 10,
    'Experience': ['Novice'] * 15 + ['Experienced'] * 15,
    'Time': np.random.normal(30, 5, 30)
}
df2 = pd.DataFrame(data)

df2['Time'] += (df2['Software'].apply(lambda x: 5 if x == 'A' else (0 if x == 'B' else -5)) +
                df2['Experience'].apply(lambda x: 2 if x == 'Novice' else -2))

import statsmodels.api as sm
from statsmodels.formula.api import ols
model = ols('Time ~ C(Software) + C(Experience) + C(Software):C(Experience)', data=df2).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("\nQ5: Two-Way ANOVA Table:\n", anova_table)



Q5: Two-Way ANOVA Table:
                                sum_sq    df          F    PR(>F)
C(Software)                561.398507   2.0  10.840016  0.000442
C(Experience)               37.458344   1.0   1.446563  0.240812
C(Software):C(Experience)   19.614183   2.0   0.378729  0.688751
Residual                   621.473464  24.0        NaN       NaN


In [7]:
# Q6: Interpretation of one-way ANOVA results.
# F-statistic = 5.23, p-value = 0.02
# Interpretation: Since p-value < 0.05, we reject the null hypothesis and conclude significant differences exist between the groups.

In [8]:
# Q7: Handling missing data in repeated measures ANOVA.
# - Methods: Mean imputation, last observation carried forward, or multiple imputation.
# - Consequences: Can introduce bias or reduce power.


In [9]:
# Q8: Common post-hoc tests after ANOVA.
# - Tukey's HSD: Used for pairwise comparisons when group sizes are equal.
# - Bonferroni: Controls for Type I error in multiple comparisons.
# Example: Use post-hoc if ANOVA indicates significant differences.

In [10]:
# Q9: Conduct one-way ANOVA for mean weight loss.
data = {
    'Diet': ['A'] * 17 + ['B'] * 17 + ['C'] * 16,
    'WeightLoss': np.random.normal(5, 1, 17).tolist() +
                  np.random.normal(6, 1, 17).tolist() +
                  np.random.normal(7, 1, 16).tolist()
}
df3 = pd.DataFrame(data)
model = ols('WeightLoss ~ C(Diet)', data=df3).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("\nQ9: One-Way ANOVA Table:\n", anova_table)


Q9: One-Way ANOVA Table:
              sum_sq    df          F    PR(>F)
C(Diet)   25.363054   2.0  14.373581  0.000013
Residual  41.467173  47.0        NaN       NaN


In [11]:
# Q10: Conduct two-way ANOVA for task completion time.
data = {
    'Software': ['A', 'B', 'C'] * 10,
    'Experience': ['Novice'] * 15 + ['Experienced'] * 15,
    'Time': np.random.normal(30, 5, 30)
}
df4 = pd.DataFrame(data)

df4['Time'] += (df4['Software'].apply(lambda x: 5 if x == 'A' else (0 if x == 'B' else -5)) +
                df4['Experience'].apply(lambda x: 2 if x == 'Novice' else -2))

model = ols('Time ~ C(Software) + C(Experience) + C(Software):C(Experience)', data=df4).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("\nQ10: Two-Way ANOVA Table:\n", anova_table)



Q10: Two-Way ANOVA Table:
                                sum_sq    df         F    PR(>F)
C(Software)                450.099554   2.0  8.662103  0.001473
C(Experience)              178.468900   1.0  6.869218  0.014978
C(Software):C(Experience)   27.567488   2.0  0.530532  0.595032
Residual                   623.543116  24.0       NaN       NaN


In [12]:
# Q11: Conduct two-sample t-test for test scores.
control = np.random.normal(70, 10, 50)
experimental = np.random.normal(75, 10, 50)

t_stat, p_val = stats.ttest_ind(control, experimental)
print("\nQ11: Two-Sample T-Test: t-statistic:", t_stat, ", p-value:", p_val)


Q11: Two-Sample T-Test: t-statistic: -1.3268959154906732 , p-value: 0.18762653197722945


In [13]:
# Q12: Conduct repeated measures ANOVA for sales data.
data = {
    'Store': ['A'] * 10 + ['B'] * 10 + ['C'] * 10,
    'Day': list(range(10)) * 3,
    'Sales': np.random.normal(100, 10, 10).tolist() +
             np.random.normal(110, 10, 10).tolist() +
             np.random.normal(120, 10, 10).tolist()
}
df5 = pd.DataFrame(data)

from statsmodels.stats.anova import AnovaRM
rm_anova = AnovaRM(df5, 'Sales', 'Day', within=['Store']).fit()
print("\nQ12: Repeated Measures ANOVA:\n", rm_anova)


Q12: Repeated Measures ANOVA:
                Anova
      F Value Num DF  Den DF Pr > F
-----------------------------------
Store  9.8662 2.0000 18.0000 0.0013

