## Question 1

In [2]:
# Assumptions:
#     1.independence
#     2.normality
#     3.homegeneity
# If any of these assumptions are violated, the validity of the ANOVA results may be affected. For example:
# voilation of independence
# voilation of normality
# voilation of homogeneity

## Question 2

In [3]:
# One-way ANOVA: One-way ANOVA is used when there is one independent variable with three or more levels, and one dependent variable.
# Two-way ANOVA: Two-way ANOVA is used when there are two independent variables, and one dependent variable.
# Repeated measures ANOVA: Repeated measures ANOVA is used when the same individuals are measured multiple times under different conditions. 

## Question 3

In [4]:
# The partitioning of variance in ANOVA refers to the process of decomposing the total variance observed in the data into different sources of variation.

## Question 4

In [6]:
import pandas as pd
from scipy import stats
def func(filepath):
    data = pd.read_csv(filepath)
    SST = ((data['y'] - data['y'].mean())**2).sum()
    _, SSR, df = statsmodels.stats.anova.anova_lm(statsmodels.formula.api.ols('y ~ group', data=data).fit(), typ=2)

    SSE = SST - SSR

    print('SST:', SST)
    print('SSE:', SSE)
    print('SSR:', SSR)


## Question 5

In [7]:
import pandas as pd
from scipy.stats import f_oneway
def func(filepath):
    data = pd.read_csv(filepath)
    A_mean = data.groupby('A')['y'].mean()
    B_mean = data.groupby('B')['y'].mean()
    A_effect = sum([(A_mean[i] - data['y'].mean())**2 for i in A_mean.index])
    B_effect = sum([(B_mean[i] - data['y'].mean())**2 for i in B_mean.index])

    AB_mean = data.groupby(['A', 'B'])['y'].mean()

## Question 6

In [8]:
# based on the results of the one-way ANOVA with an F-statistic of 5.23 and a p-value of 0.02, we can conclude that there is a significant difference between the group means. However, further analysis is needed to determine which specific groups are different from each other.

## Question 7

In [10]:
# Here are some common methods for handling missing data in a repeated measures ANOVA:
# Complete Case Analysis (CCA): CCA involves analyzing only the complete cases, i.e., the observations that have data for all the time points.
# Last Observation Carried Forward (LOCF): LOCF involves imputing missing data by carrying forward the last observed value for each participant.
# Mean Substitution: Mean substitution involves replacing missing data with the mean of the available data for that participant. 
# Multiple Imputation (MI): MI involves creating multiple plausible imputed datasets using a statistical model that accounts for the missing data. 

## Question 8

In [11]:
# Tukey's Honestly Significant Difference (HSD) test: This test is used when there are three or more groups and is recommended when the sample sizes are equal. 
# Bonferroni correction: This test is used to control the FWER and is more conservative than other post-hoc tests.
# Scheffe's test: This test is used when there are unequal sample sizes or variances and is the most conservative of all post-hoc tests. 
# Fisher's Least Significant Difference (LSD) test: This test is used when there are two groups, and the variances are assumed to be equal.

## Question 9

In [13]:
import numpy as np
from scipy.stats import f_oneway
np.random.seed(123)
diet_A = np.random.normal(loc=10, scale=2, size=50)
diet_B = np.random.normal(loc=8, scale=3, size=50)
diet_C = np.random.normal(loc=6, scale=1.5, size=50)
f_stat, p_val = f_oneway(diet_A, diet_B, diet_C)
print("F-statistic:", f_stat)
print("p-value:", p_val)

F-statistic: 30.033777139100614
p-value: 1.1576789498492775e-11


## Question 10

In [9]:
import numpy as np
from scipy.stats import f_oneway
np.random.seed(1234)
def func():
    n = 30
    program = np.repeat(['A', 'B', 'C'], n)
    experience = np.tile(['novice', 'experienced'], n*3)
    time = np.random.normal(10, 2, n*3*2)
    data = np.column_stack((time, program, experience))
    novice_data = data[data[:, 2] == 'novice'][:, 0]
    exp_data = data[data[:, 2] == 'experienced'][:, 0]
    progA_data = data[data[:, 1] == 'A'][:, 0]
    progB_data = data[data[:, 1] == 'B'][:, 0]
    progC_data = data[data[:, 1] == 'C'][:, 0]

    program_f, program_p = f_oneway(progA_data, progB_data, progC_data)
    experience_f, experience_p = f_oneway(novice_data, exp_data)

    novice_progA_data = data[(data[:, 1] == 'A') & (data[:, 2] == 'novice')][:, 0]
    exp_progA_data = data[(data[:, 1] == 'A') & (data[:, 2] == 'experienced')][:, 0]
    novice_progB_data = data[(data[:, 1] == 'B') & (data[:, 2] == 'novice')][:, 0]
    exp_progB_data = data[(data[:, 1] == 'B') & (data[:, 2] == 'experienced')][:, 0]
    novice_progC_data = data[(data[:, 1] == 'C') & (data[:, 2] == 'novice')][:, 0]
    exp_progC_data = data[(data[:, 1] == 'C') & (data[:, 2] == 'experienced')][:, 0]

    interaction_f, interaction_p = f_oneway(novice_progA_data, exp_progA_data,
                                             novice_progB_data, exp_progB_data,
                                             novice_progC_data, exp_progC_data)

    print(f"Program main effect: F = {program_f:.2f}, p = {program_p:.4f}")
    print(f"Experience main effect: F = {experience_f:.2f}, p = {experience_p:.4f}")
    print(f"Interaction effect: F = {interaction_f:.2f}, p = {interaction_p:.4f}")

## Question 11

In [10]:
import numpy as np
from scipy.stats import ttest_ind, ttest_rel
np.random.seed(1234)
control_scores = np.random.normal(70, 10, 100)
exp_scores = np.random.normal(75, 10, 100)

t_stat, p_val = ttest_ind(control_scores, exp_scores)
print(f"Two-sample t-test: t = {t_stat:.2f}, p = {p_val:.4f}")

if p_val < 0.05:
    control_mean = np.mean(control_scores)
    exp_mean = np.mean(exp_scores)
    if control_mean > exp_mean:
        t_stat, p_val = ttest_rel(control_scores, exp_scores)
        print(f"Paired t-test: t = {t_stat:.2f}, p = {p_val:.4f}")
        print("Control group scores are significantly higher than experimental group scores.")
    elif control_mean < exp_mean:
        t_stat, p_val = ttest_rel(exp_scores, control_scores)
        print(f"Paired t-test: t = {t_stat:.2f}, p = {p_val:.4f}")
        print("Experimental group scores are significantly higher than control group scores.")


Two-sample t-test: t = -2.96, p = 0.0035
Paired t-test: t = 3.09, p = 0.0026
Experimental group scores are significantly higher than control group scores.


## Question 12

In [14]:
import pandas as pd
from scipy import stats

data = {'Store': ['A']*30 + ['B']*30 + ['C']*30,
        'Day': list(range(1,31))*3,
        'Sales': [10, 12, 8, 11, 14, 9, 9, 10, 13, 7, 10, 15, 10, 8, 12, 13, 10, 11, 9, 11, 16, 8, 10, 12, 9, 11, 10, 12, 10, 9]*3}
df = pd.DataFrame(data)

rm_anova = stats.f_oneway(df[df['Store'] == 'A']['Sales'], df[df['Store'] == 'B']['Sales'], df[df['Store'] == 'C']['Sales'])

print("Repeated measures ANOVA results:")
print("F-statistic:", rm_anova.statistic)
print("p-value:", rm_anova.pvalue)

from statsmodels.stats.multicomp import pairwise_tukeyhsd
posthoc = pairwise_tukeyhsd(df['Sales'], df['Store'])

print("\nPost hoc results:")
print(posthoc)

Repeated measures ANOVA results:
F-statistic: 0.0
p-value: 1.0

Post hoc results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
     A      B      0.0   1.0 -1.2983 1.2983  False
     A      C      0.0   1.0 -1.2983 1.2983  False
     B      C      0.0   1.0 -1.2983 1.2983  False
--------------------------------------------------
