In [1]:
#1.

# Assumptions of ANOVA:

# Normality of Sampling Distribution of means.
# Absense of Outliers.
# Homegenety of Variance.
# Samples are independent and random.

# Violation

# Violations of independence:
# Correlated observations within groups or dependencies between measurements over time can lead to biased results.

# Violations of normality:
# Non-normal distributions of the dependent variable within groups can affect the accuracy of p-values and confidence intervals.

# Violations of homogeneity of variances:
# Unequal variances between groups can result in incorrect conclusions about group differences.

# Violations of linearity:
# Nonlinear relationships between the dependent variable and independent variable(s) can lead to misleading interpretations of group differences.

# Violations of random sampling:
# Non-random sampling methods can compromise the generalizability of the results, introducing bias in the conclusions.

In [None]:
#2.

# One Way ANOVA:
# When there is one factor with atleast 2 levels.
# These levels are independent.

# Repeated Measures ANOVA:
# When there is one factor with atleast 2 levels.
# These levels are dependent.

# Factorial ANOVA:
# When there are two or more factors each with atleast 2 levels.
# These levels can be either dependent or independent.

In [3]:
#3.

# Partitioning of variance in ANOVA:
# The partitioning of variance in ANOVA involves breaking down the total variation observed in the data into different components associated with specific factors or sources.

# Importance of partitioning of variance in ANOVA:
# This concept is important because it allows researchers to understand the relative contributions of various factors to the observed differences between groups.
# By quantifying the amount of variance explained by each factor, ANOVA helps determine the significance of group differences and assess the impact of independent variables on the dependent variable.
# Understanding the partitioning of variance aids in making informed interpretations, identifying significant factors, and guiding further analyses.
# It enhances the validity and interpretability of the results, facilitating a deeper understanding of the relationships between variables and providing a foundation for subsequent statistical investigations.

In [4]:
#4.

import numpy as np

group1 = [5, 7, 9, 11, 13]
group2 = [2, 4, 6, 8, 10]
group3 = [1, 3, 5, 7, 9]

data = np.concatenate([group1, group2, group3])
overall_mean = np.mean(data)
sst = np.sum((data - overall_mean)**2)
group_means = np.array([np.mean(group1), np.mean(group2), np.mean(group3)])
sse = np.sum((group_means - overall_mean)**2) * len(group1)
ssr = sst - sse

print("Total sum of squares (SST):", sst)
print("Explained sum of squares (SSE):", sse)
print("Residual sum of squares (SSR):", ssr)

Total sum of squares (SST): 163.33333333333331
Explained sum of squares (SSE): 43.33333333333333
Residual sum of squares (SSR): 119.99999999999999


In [5]:
#5.

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

data = pd.DataFrame({'Group1': [10, 12, 14, 16, 18],
                     'Group2': [5, 10, 15, 20, 25],
                     'Response': [25, 35, 45, 55, 65]})

model = ols('Response ~ Group1 + Group2 + Group1:Group2', data=data).fit()

anova_table = sm.stats.anova_lm(model, typ=2)

main_effect_group1 = anova_table['sum_sq']['Group1']
main_effect_group2 = anova_table['sum_sq']['Group2']
interaction_effect = anova_table['sum_sq']['Group1:Group2']

print("Main Effect Group 1:", main_effect_group1)
print("Main Effect Group 2:", main_effect_group2)
print("Interaction Effect:", interaction_effect)

Main Effect Group 1: 208.41674207731316
Main Effect Group 2: 126.57179429259548
Interaction Effect: 2.2574113767654446e-27


In [6]:
#6.

# Based on the results of the one-way ANOVA, with an F-statistic of 5.23 and a p-value of 0.02.
# We can conclude that there are statistically significant differences between the groups being compared.
# The F-statistic indicates that the variability between the groups is greater than the variability within the groups.
# The low p-value suggests that the observed differences are unlikely to occur by chance alone.
# Therefore, we reject the null hypothesis of no differences between the groups and accept the alternative hypothesis that there are indeed significant differences.
# Further analysis or post-hoc tests may be conducted to determine which specific groups are significantly different from each other.

In [7]:
#7.

# Handling missing data in a repeated measures ANOVA requires careful consideration.
# Complete Case Analysis (CCA) excludes cases with missing data, potentially leading to biased results.
# Mean substitution/imputation can distort relationships and underestimate variability.
# Multiple imputation captures uncertainty but is computationally intensive.
# The choice of method can impact the validity and reliability of the results.
# Inappropriate handling of missing data can introduce bias, reduce statistical power, and affect conclusions.
# It is important to consider the missing data mechanism and choose appropriate techniques, such as multiple imputation or sensitivity analyses, to account for missingness and minimize potential biases.

In [8]:
#8.

# Some common post-hoc tests used after ANOVA include Tukey's Honestly Significant Difference (HSD), Bonferroni correction, Dunnett's test, and Scheffé's test.

# Tukey's HSD is commonly used to determine which specific groups differ significantly from each other. 
# It is appropriate when conducting multiple pairwise comparisons.
# Bonferroni correction is a conservative approach that adjusts the significance level to control for multiple comparisons.
# Dunnett's test is used when comparing multiple groups to a control group.
# Scheffé's test is more conservative but suitable for situations where all possible comparisons need to be made.

# For example, in a study comparing the effectiveness of three different treatments for a medical condition.
# A post-hoc test like Tukey's HSD could be used to identify which specific treatment groups show statistically significant differences in outcomes after controlling for Type I error rate.

In [9]:
#9.

import numpy as np
from scipy import stats

diet_A = [2.3, 1.9, 3.2, 2.7, 2.5, 1.8, 3.1, 2.4, 2.9, 2.2, 2.6, 2.1, 2.8, 2.4, 1.7, 2.0, 1.9, 2.2, 3.0, 2.5,
          2.4, 2.1, 2.6, 2.3, 2.8, 2.0, 1.9, 2.7, 2.3, 2.5, 2.2, 2.1, 2.6, 2.4, 1.8, 2.9, 3.1, 2.5, 2.3, 2.7,
          2.0, 2.8, 2.4, 1.7, 2.6, 2.2, 2.1, 2.5, 2.3, 2.9]

diet_B = [3.5, 4.1, 3.9, 3.2, 4.2, 3.8, 4.0, 3.6, 3.4, 3.7, 4.1, 3.9, 3.6, 3.3, 4.0, 3.7, 4.2, 3.5, 3.8, 3.2,
          3.9, 3.6, 4.1, 3.3, 4.0, 3.7, 3.5, 4.2, 3.8, 3.6, 3.4, 3.7, 4.1, 3.9, 3.6, 3.3, 4.0, 3.7, 4.2, 3.5,
          3.8, 3.2, 3.9, 3.6, 4.1, 3.3, 4.0, 3.7, 3.5]

diet_C = [1.9, 2.1, 1.7, 2.4, 1.5, 2.0, 1.8, 2.2, 2.3, 1.6, 2.0, 1.7, 2.1, 1.9, 2.4, 1.5, 2.3, 1.8, 2.2, 2.1,
          1.6, 2.0, 1.7, 2.3, 1.9, 2.4, 1.5, 2.2, 2.0, 1.8, 2.1, 1.7, 2.3, 1.9, 2.4, 1.5, 2.1, 2.0, 1.8, 2.2,
          2.3, 1.6, 2.0, 1.7, 2.1, 1.9, 2.4, 1.5]

weight_loss_data = np.concatenate([diet_A, diet_B, diet_C])

group_labels = ['A'] * len(diet_A) + ['B'] * len(diet_B) + ['C'] * len(diet_C)

f_statistic, p_value = stats.f_oneway(diet_A, diet_B, diet_C)

print("F-statistic:", f_statistic)
print("p-value:", p_value)

F-statistic: 389.7356865723879
p-value: 7.7913363221494195e-59


In [10]:
#10.

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

data = pd.DataFrame({'Program': ['A', 'B', 'C'] * 10,
                     'Experience': ['Novice', 'Experienced'] * 15,
                     'Time': [10, 12, 15, 14, 13, 16, 18, 20, 17, 19,
                              11, 13, 14, 16, 15, 17, 20, 22, 19, 21,
                              9, 11, 10, 12, 15, 13, 14, 16, 18, 17]})

model = ols('Time ~ C(Program) + C(Experience) + C(Program):C(Experience)', data=data).fit()

anova_table = sm.stats.anova_lm(model, typ=2)

f_stat_program = anova_table['F']['C(Program)']
p_value_program = anova_table['PR(>F)']['C(Program)']

f_stat_experience = anova_table['F']['C(Experience)']
p_value_experience = anova_table['PR(>F)']['C(Experience)']

f_stat_interaction = anova_table['F']['C(Program):C(Experience)']
p_value_interaction = anova_table['PR(>F)']['C(Program):C(Experience)']

print("F-statistic Program:", f_stat_program)
print("p-value Program:", p_value_program)

print("F-statistic Experience:", f_stat_experience)
print("p-value Experience:", p_value_experience)

print("F-statistic Interaction:", f_stat_interaction)
print("p-value Interaction:", p_value_interaction)

F-statistic Program: 0.03166869671132745
p-value Program: 0.9688679195443053
F-statistic Experience: 1.0742996345919718
p-value Experience: 0.3103072795076564
F-statistic Interaction: 0.19732034104750384
p-value Interaction: 0.8222460450852132


In [11]:
#11.

import numpy as np
from scipy import stats

control_group = [82, 78, 85, 76, 90, 83, 79, 87, 81, 88, 84, 77, 89, 80, 86, 75, 92, 83, 78, 85,
                 79, 87, 81, 88, 84, 77, 89, 80, 86, 75, 92, 83, 78, 85, 79, 87, 81, 88, 84, 77,
                 89, 80, 86, 75, 92, 83, 78, 85, 79]

experimental_group = [88, 86, 92, 85, 94, 90, 89, 93, 87, 91, 84, 86, 88, 85, 92, 87, 95, 90, 89, 93,
                      88, 86, 92, 85, 94, 90, 89, 93, 87, 91, 84, 86, 88, 85, 92, 87, 95, 90, 89,
                      93, 88, 86, 92, 85, 94, 90, 89, 93, 87]

t_statistic, p_value = stats.ttest_ind(control_group, experimental_group)

print("t-statistic:", t_statistic)
print("p-value:", p_value)

t-statistic: -7.585740789732326
p-value: 2.1164015282622466e-11


In [12]:
#12.

import pandas as pd
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

data = pd.DataFrame({
    'Store A': [10, 12, 14, 9, 11, 13, 8, 12, 15, 11, 13, 10, 12, 14, 9, 11, 13, 8, 12, 15, 11, 13, 10, 12, 14, 9, 11, 13, 8, 12],
    'Store B': [11, 15, 13, 10, 9, 12, 14, 11, 13, 12, 14, 15, 10, 9, 12, 14, 11, 13, 12, 14, 15, 10, 9, 12, 14, 11, 13, 12, 14, 15],
    'Store C': [8, 9, 11, 13, 10, 12, 15, 11, 13, 10, 9, 12, 14, 11, 13, 10, 9, 12, 15, 11, 13, 10, 9, 12, 14, 11, 13, 10, 9, 12]
})

f_statistic, p_value = f_oneway(data['Store A'], data['Store B'], data['Store C'])

posthoc = pairwise_tukeyhsd(data.values.flatten(), data.columns.repeat(len(data)))
print(posthoc)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
Store A Store B   0.3333 0.7924  -0.888 1.5547  False
Store A Store C   0.1333 0.9633  -1.088 1.3547  False
Store B Store C     -0.2 0.9195 -1.4213 1.0213  False
-----------------------------------------------------
