In [None]:
# Q4.
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Three groups (A, B, C) with three observations each
data = pd.DataFrame({'group': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],
                     'value': [5, 7, 6, 9, 11, 10, 13, 12, 14]})

# Fit the one-way ANOVA as a linear model and build the Type II ANOVA table
model = ols('value ~ group', data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# The ANOVA table has one row per model term plus a 'Residual' row; it does
# not contain a "total" row. The 'group' row is therefore the between-group
# (explained) sum of squares, the 'Residual' row is the within-group (error)
# sum of squares, and the total is their sum.
# Rows are looked up by label rather than by position so the result does not
# depend on row order or on positional indexing of a label-indexed Series.
SSR = anova_table.loc['group', 'sum_sq']     # explained / between-group
SSE = anova_table.loc['Residual', 'sum_sq']  # error / within-group
SST = SSR + SSE                              # total

print('SST:', SST)
print('SSE:', SSE)
print('SSR:', SSR)


In [8]:
#Q5.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Simulate a two-factor data set; the fixed seed makes the draws repeatable.
# The three draws are made in the same order as the columns appear below.
np.random.seed(123)
n = 50
group1_levels = np.random.choice(['A', 'B', 'C', 'D'], size=n)
group2_levels = np.random.choice(['X', 'Y'], size=n)
scores = np.random.normal(50, 10, size=n)
df = pd.DataFrame({'Group1': group1_levels,
                   'Group2': group2_levels,
                   'Score': scores})

# Two-way model: both factors plus their interaction
model = ols('Score ~ C(Group1) + C(Group2) + C(Group1):C(Group2)', data=df).fit()

# Pull three fitted coefficients out of the model by parameter name.
# NOTE(review): each of these is a single coefficient (level B of Group1,
# level Y of Group2, and their product term), not an omnibus test of a whole
# factor -- confirm this is the intended meaning of "main effect" here.
coefficients = model.params
main_effect_1 = coefficients['C(Group1)[T.B]']
main_effect_2 = coefficients['C(Group2)[T.Y]']
interaction_effect = coefficients['C(Group1)[T.B]:C(Group2)[T.Y]']

print("Main effect 1:", main_effect_1)
print("Main effect 2:", main_effect_2)
print("Interaction effect:", interaction_effect)


Main effect 1: -11.071844058571585
Main effect 2: 3.2896811448800323
Interaction effect: 11.90707911861209


In [25]:
# Q9.

import pandas as pd
import scipy.stats as stats

# Weight loss for five participants on each of the three diets (A, B, C)
data = pd.DataFrame({
    'diet': ['A'] * 5 + ['B'] * 5 + ['C'] * 5,
    'weight_loss': [5.2, 3.9, 4.5, 5.5, 4.7,
                    6.1, 7.2, 5.9, 6.8, 6.5,
                    4.0, 3.2, 3.8, 4.5, 3.9],
})

# One-way ANOVA: one sample of weight-loss values per diet, in A, B, C order
samples = [data.loc[data['diet'] == diet, 'weight_loss'] for diet in ('A', 'B', 'C')]
F_statistic, p_value = stats.f_oneway(*samples)

# Report the test statistic and its p-value
print('F-statistic =', F_statistic)
print('p-value =', p_value)


F-statistic = 30.302272727272694
p-value = 2.0384713466284933e-05


Based on these results, we can conclude that there are significant differences in the mean weight loss between the three diets. The p-value (about 2.04e-05) is far below the significance level of 0.05, so the null hypothesis of no difference between the group means is rejected. The large F-statistic (about 30.3) indicates that the variability between the diet groups is much greater than the variability within them. A post-hoc test (e.g., Tukey's HSD) would be needed to determine which specific diets differ from each other.





In [24]:
# Q10.
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Raw observations: 30 task-completion times, each tagged with the software
# program used and the user's experience level.
program = (['A'] * 3 + ['B'] * 3 + ['C'] * 3) * 3 + ['A'] * 3
experience = [
    'novice', 'experienced', 'experienced', 'experienced', 'novice', 'experienced',
    'novice', 'novice', 'experienced', 'novice', 'experienced', 'experienced',
    'novice', 'novice', 'experienced', 'experienced', 'novice', 'experienced',
    'novice', 'novice', 'experienced', 'novice', 'experienced', 'experienced',
    'novice', 'experienced', 'novice', 'novice', 'experienced', 'experienced',
]
time = [
    26, 30, 28, 22, 28, 24, 27, 26, 28, 28,
    31, 29, 25, 28, 27, 23, 24, 27, 26, 28,
    30, 24, 26, 28, 29, 25, 27, 26, 28, 30,
]
data = pd.DataFrame({'program': program, 'experience': experience, 'time': time})

# Two-way ANOVA: both factors plus their interaction, summarised as a
# Type II ANOVA table
formula = 'time ~ C(program) + C(experience) + C(program):C(experience)'
model = ols(formula, data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Show the ANOVA table
print(anova_table)


                             sum_sq    df         F    PR(>F)
C(program)                38.705301   2.0  6.078230  0.007317
C(experience)              2.054507   1.0  0.645274  0.429692
C(program):C(experience)  21.308985   2.0  3.346335  0.052254
Residual                  76.414286  24.0       NaN       NaN


Based on these results, there is a significant main effect of program on the time it takes to complete the task (F = 6.08, p = 0.007), so we reject the null hypothesis that the mean time is the same for all programs. The main effect of experience level is not significant (F = 0.65, p = 0.430), and the interaction between program and experience level narrowly fails to reach significance at the 0.05 level (F = 3.35, p = 0.052), so for these two terms we fail to reject the null hypothesis. There is still some variability in the time data that is not explained by the model, as shown by the residual sum of squares (76.41).

In [22]:
#Q11.
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import MultiComparison

# Simulate test scores for the two groups; the seed makes the draws repeatable
np.random.seed(123)
control_scores = np.random.normal(loc=70, scale=10, size=100)
experimental_scores = np.random.normal(loc=75, scale=10, size=100)

# Independent two-sample t-test comparing the group means
ttest_result = stats.ttest_ind(control_scores, experimental_scores)
t_stat, p_value = ttest_result

# Report the t-test, rounded to three decimals
print("Two-sample t-test results:")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.3f}")

# Post-hoc comparison (Tukey's HSD): stack all scores into one array and
# build a parallel list of labels saying which group each score came from
scores = np.concatenate((control_scores, experimental_scores))
groups = (['control'] * control_scores.size
          + ['experimental'] * experimental_scores.size)

# Run Tukey's HSD at a 0.05 significance level
mc = MultiComparison(scores, groups)
tukey_result = mc.tukeyhsd(alpha=0.05)

# Report the post-hoc comparison
print("\nPost-hoc test results (Tukey's HSD):")
print(tukey_result)


Two-sample t-test results:
t-statistic: -3.032
p-value: 0.003

Post-hoc test results (Tukey's HSD):
   Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1    group2    meandiff p-adj  lower  upper  reject
---------------------------------------------------------
control experimental   4.5336 0.0028 1.5846 7.4826   True
---------------------------------------------------------


In [23]:
                             # or
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Simulate test scores for the two groups; the seed makes the draws repeatable
np.random.seed(123)
control_scores = np.random.normal(loc=70, scale=10, size=100)
experimental_scores = np.random.normal(loc=75, scale=10, size=100)

# Independent two-sample t-test comparing the group means
ttest_result = stats.ttest_ind(control_scores, experimental_scores)
t_statistic, p_value = ttest_result
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Long-format frame for the post-hoc test: one row per score, labelled with
# its group (100 'control' rows followed by 100 'experimental' rows)
all_scores = np.concatenate((control_scores, experimental_scores))
group_labels = np.repeat(['control', 'experimental'], 100)
data = pd.DataFrame({'scores': all_scores, 'group': group_labels})

# Tukey's HSD on the labelled scores
posthoc = pairwise_tukeyhsd(data['scores'], data['group'])
print(posthoc)


t-statistic: -3.0316172004188147
p-value: 0.0027577299763983324
   Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1    group2    meandiff p-adj  lower  upper  reject
---------------------------------------------------------
control experimental   4.5336 0.0028 1.5846 7.4826   True
---------------------------------------------------------


In [19]:
#Q12.

import pandas as pd
import statsmodels.api as sm
from scipy.stats import f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Create a pandas DataFrame with the sales data: 30 days for each of three
# stores.
# NOTE: the same 30 sales values are repeated for every store (the trailing
# `*3`), so the store means are identical by construction and the tests below
# cannot find a difference.
data = pd.DataFrame({
    'Store': ['A']*30 + ['B']*30 + ['C']*30,
    'Day': list(range(1, 31))*3,
    'Sales': [10, 11, 9, 12, 11, 10, 8, 10, 11, 12, 13, 10, 10, 9, 11, 12, 9, 8, 11, 10, 10, 11, 12, 11, 10, 9, 10, 11, 12, 10]*3
})

# Conduct a one-way ANOVA. `ols` and `sm` are imported at the top of this
# cell so it runs on a fresh kernel instead of relying on names left behind
# by earlier cells.
model = ols('Sales ~ Store', data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Conduct a post-hoc test (Tukey's HSD) across all pairs of stores
tukey_results = pairwise_tukeyhsd(data['Sales'], data['Store'])
print(tukey_results)


                sum_sq    df             F  PR(>F)
Store     1.956693e-28   2.0  6.542365e-29     1.0
Residual  1.301000e+02  87.0           NaN     NaN
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
     A      B      0.0   1.0 -0.7529 0.7529  False
     A      C      0.0   1.0 -0.7529 0.7529  False
     B      C      0.0   1.0 -0.7529 0.7529  False
--------------------------------------------------


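In this case, none of the pairwise comparisons were significant, so we would conclude that there are no significant differences in sales between any of the stores. Note that this outcome is expected for this particular data set: the same 30 sales values were assigned to all three stores, so the store means are identical by construction (mean difference 0.0, adjusted p-value 1.0).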