# Q1

In [4]:
import numpy as np
import pandas as pd
from scipy.stats import f, t

# Data
data = {
    'Receiving deposits': [164, 200, 235, 176, 131, 185, 112, 147],
    'Cashing checks': [180, 205, 205, 220, 180, 193, 124, 188],
    'Exchanging foreign currencies': [202, 245, 227, 325, 250, 213, 285, 310],
    'Selling traveler’s checks': [186, 235, 197, 264, 236, 219, 355, 234]
}

df = pd.DataFrame(data)
overall_mean = df.values.mean()

# Sum of Squares Total (SST)
sst = ((df - overall_mean)**2).sum().sum()

# Group means
group_means = df.mean()

# Sum of Squares Treatment (SSTr)
sst_tr = 8 * ((group_means - overall_mean)**2).sum()

# Sum of Squares Error (SSE)
sse = sst - sst_tr

# Degrees of freedom
df_tr = len(df.columns) - 1
df_e = df.size - len(df.columns)
df_total = df_tr + df_e

# Mean Squares
ms_tr = sst_tr / df_tr
ms_e = sse / df_e

# F Value
f_value = ms_tr / ms_e

# P-value
p_value = 1 - f.cdf(f_value, df_tr, df_e)

# ANOVA Table
anova_table = pd.DataFrame({
    'Sum of Squares': [sst_tr, sse, sst],
    'Degrees of Freedom': [df_tr, df_e, df_total],
    'Mean Square': [ms_tr, ms_e, ''],
    'F Value': [f_value, '', ''],
    'P-value': [p_value, '', '']
}, index=['Treatment', 'Error', 'Total'])

print("ANOVA Table")
print(anova_table)

# Fisher's LSD Procedure
# Critical value for t-distribution
alpha = 0.05
t_critical = t.ppf(1 - alpha/2, df_e)

# LSD
n1 = n2 = 8
lsd = t_critical * np.sqrt(2 * ms_e * (1/n1 + 1/n2))

# Difference between means
mean_diff = group_means['Exchanging foreign currencies'] - group_means['Cashing checks']

# Conclusion
significant = abs(mean_diff) > lsd

lsd_results = {
    'LSD': lsd,
    'Mean Difference': mean_diff,
    'Significant': significant
}

print("\nFisher's LSD Results")
print(pd.DataFrame([lsd_results]))



ANOVA Table
           Sum of Squares  Degrees of Freedom   Mean Square   F Value  \
Treatment        42856.75                   3  14285.583333  7.997567   
Error            50014.75                  28   1786.241071             
Total            92871.50                  31                           

            P-value  
Treatment  0.000527  
Error                
Total                

Fisher's LSD Results
         LSD  Mean Difference  Significant
0  61.216898            70.25         True


# Q2

In [8]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load data
file_path = r'/Users/tangjiahong/Dropbox/HW_statistics/assign4/Airfares3.xlsx'
df = pd.read_excel(file_path)

# Reshape the data for ANOVA
df_pivot = df.pivot(index='Itinerary', columns='Agency', values='Airfare')

# Perform the two-way ANOVA
model = ols('Airfare ~ C(Agency) + C(Itinerary)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

anova_table
#8203;:citation[【oaicite:0】]&#8203;


Unnamed: 0,sum_sq,df,F,PR(>F)
C(Agency),414.4,2.0,2.60793,0.1342698
C(Itinerary),135676.4,4.0,426.923851,2.354757e-09
Residual,635.6,8.0,,


# Q3

In [10]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 匯入資料
file_path = r'/Users/tangjiahong/Dropbox/HW_statistics/assign4/Translator3.xlsx'
df_translator = pd.read_excel(file_path)


# 檢查資料框架結構
df_translator.head()

# 確認欄位名稱正確
df_translator.columns = ['System', 'Language', 'Time_in_hours']

# 執行雙因素ANOVA
model_translator = ols('Time_in_hours ~ C(System) + C(Language) + C(System):C(Language)', data=df_translator).fit()
anova_table_translator = sm.stats.anova_lm(model_translator, typ=2)

# 顯示ANOVA表
print(anova_table_translator)

# 結論
f_system = anova_table_translator.loc['C(System)', 'F']
p_system = anova_table_translator.loc['C(System)', 'PR(>F)']

f_language = anova_table_translator.loc['C(Language)', 'F']
p_language = anova_table_translator.loc['C(Language)', 'PR(>F)']

f_interaction = anova_table_translator.loc['C(System):C(Language)', 'F']
p_interaction = anova_table_translator.loc['C(System):C(Language)', 'PR(>F)']

print(f"F-value for System: {f_system}, P-value for System: {p_system}")
print(f"F-value for Language: {f_language}, P-value for Language: {p_language}")
print(f"F-value for Interaction: {f_interaction}, P-value for Interaction: {p_interaction}")



                           sum_sq    df          F    PR(>F)
C(System)               43.555556   1.0   9.445783  0.009656
C(Language)            107.444444   2.0  11.650602  0.001543
C(System):C(Language)   27.444444   2.0   2.975904  0.089215
Residual                55.333333  12.0        NaN       NaN
F-value for System: 9.44578313253009, P-value for System: 0.009655615926474381
F-value for Language: 11.650602409638463, P-value for Language: 0.0015429445132395766
F-value for Interaction: 2.9759036144578004, P-value for Interaction: 0.08921511098079245


# Q4

In [11]:
import pandas as pd
import statsmodels.api as sm

# 匯入資料
file_path = r'/Users/tangjiahong/Dropbox/HW_statistics/assign4/ParkSpend5(1).xlsx'
df_park = pd.read_excel(file_path)

# Define the dependent and independent variables
X = df_park['Distance from the Park (Kilometers)']
y = df_park['Spend ($)']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Display the model summary
model_summary = model.summary()
model_summary


0,1,2,3
Dep. Variable:,Spend ($),R-squared:,0.007
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.7667
Date:,"Mon, 27 May 2024",Prob (F-statistic):,0.383
Time:,20:40:47,Log-Likelihood:,-630.7
No. Observations:,106,AIC:,1265.0
Df Residuals:,104,BIC:,1271.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,161.4844,15.868,10.177,0.000,130.017,192.951
Distance from the Park (Kilometers),0.1183,0.135,0.876,0.383,-0.150,0.386

0,1,2,3
Omnibus:,93.043,Durbin-Watson:,1.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1135.99
Skew:,2.752,Prob(JB):,2.1e-247
Kurtosis:,18.064,Cond. No.,205.0
