## 匯入Excel

In [27]:
import pandas as pd
# Load the park-spending workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file location exists.
data = pd.read_excel(r"/Users/tangjiahong/Dropbox/HW_statistics/ParkSpend5.xlsx")
# Show the first rows together with the column labels to check the structure.
# (A bare `data.head()` earlier in the cell was dropped: only the last
# expression of a cell is displayed, so it produced no output.)
data.head(), data.columns

(   Spend ($)  Family size  Distance from the Park (Kilometers)  Member  \
 0     240.50            4                                   20       0   
 1      85.28            2                                   77       0   
 2     144.14            8                                  140       0   
 3     155.72            5                                   56       1   
 4     215.29            7                                   85       1   
 
    Pre score  Post score  
 0         84          76  
 1         73          89  
 2         96          85  
 3         76          85  
 4         79          81  ,
 Index(['Spend ($)', 'Family size', 'Distance from the Park (Kilometers)',
        'Member', 'Pre score', 'Post score'],
       dtype='object'))

## Q1

In [28]:
# Q1 descriptive statistics: distance from the park, split by the Member flag.
distance_column = 'Distance from the Park (Kilometers)'
grouped = data.groupby('Member')[distance_column]

# One row per Member value, with the mean and standard deviation as columns.
summary_functions = ['mean', 'std']
mean_std_dev = grouped.agg(summary_functions)
mean_std_dev


Unnamed: 0_level_0,mean,std
Member,Unnamed: 1_level_1,Unnamed: 2_level_1
0,101.255814,64.832419
1,92.777778,69.951007


| Group       | Mean Distance | Standard Deviation |
|-------------|---------------|--------------------|
| Members     | 92.78 km      | 69.95 km           |
| Non-Members | 101.26 km     | 64.83 km           |


In [29]:
import scipy.stats as stats

# Split the distance column into the Member == 0 and Member == 1 groups.
all_distances = data['Distance from the Park (Kilometers)']
distance_non_members = all_distances[data['Member'] == 0]
distance_members = all_distances[data['Member'] == 1]

# Degrees of freedom: (group size - 1) for numerator and denominator.
df_num = len(distance_non_members) - 1
df_den = len(distance_members) - 1

# Variance ratio, non-member variance over member variance.
f_statistic = distance_non_members.var() / distance_members.var()

# Upper-tail probability under F(df_num, df_den); the two-sided p-value is
# twice the smaller of the two tails.
p_value = 1 - stats.f.cdf(f_statistic, df_num, df_den)
opposite_tail = 1 - p_value
p_value_two_tailed = 2 * min(p_value, opposite_tail)

f_statistic, p_value_two_tailed


(0.8590066155419196, 0.6071703983572261)

|   Test        | P-Value |
|---------------|-----------|
| Variances     | 0.6072     |


In [30]:
# Two-sample t-test on distance (non-members vs. members) with
# equal_var=True, i.e. the pooled-variance form of the test.
pooled_ttest = stats.ttest_ind(
    distance_non_members,
    distance_members,
    equal_var=True,
)
t_stat, p_value_mean_diff = pooled_ttest
t_stat, p_value_mean_diff


(0.6309328193308699, 0.5294685883815189)

|   Test        |  P-Value |
|---------------|---------|
| Means |  0.5295          |


In [26]:
import numpy as np

# Group sizes and sample variances of the two distance samples.
n_non_members = len(distance_non_members)
n_members = len(distance_members)
var_non_members = distance_non_members.var()
var_members = distance_members.var()
pooled_dof = n_non_members + n_members - 2

# Pooled standard deviation: each variance weighted by its (n - 1).
weighted_variance_sum = (n_non_members - 1) * var_non_members + (n_members - 1) * var_members
pooled_std = np.sqrt(weighted_variance_sum / pooled_dof)

# Standard error of the difference in means under the pooled estimate.
standard_error = pooled_std * np.sqrt(1/n_non_members + 1/n_members)

# 0.975 quantile of Student's t, i.e. the two-sided 95% critical value.
t_critical = stats.t.ppf(0.975, df=pooled_dof)

# Interval: (non-member mean - member mean) +/- t_critical * standard_error.
mean_diff = distance_non_members.mean() - distance_members.mean()
margin = t_critical * standard_error
ci_lower = mean_diff - margin
ci_upper = mean_diff + margin

(ci_lower, ci_upper)


(-18.168642157865175, 35.12471450928638)

| 95% C.I | [-18.1686 , 35.1247] |
|---------|-----------|

## Q2

In [31]:
# Q2 descriptive statistics: spending, split by the Member flag.
spend_column = 'Spend ($)'
spend_grouped = data.groupby('Member')[spend_column]

# One row per Member value, with the mean and standard deviation as columns.
spend_summary_functions = ['mean', 'std']
mean_std_dev_spend = spend_grouped.agg(spend_summary_functions)
mean_std_dev_spend


Unnamed: 0_level_0,mean,std
Member,Unnamed: 1_level_1,Unnamed: 2_level_1
0,163.23907,69.085892
1,179.432381,107.281208


| Group       | Mean Spending | Standard Deviation |
|-------------|---------------|--------------------|
| Members     | $179.4324       | $107.2812            |
| Non-Members | $163.2391       | $69.0859             |


In [32]:
# Split the spending column into the Member == 0 and Member == 1 groups.
all_spend = data['Spend ($)']
spend_non_members = all_spend[data['Member'] == 0]
spend_members = all_spend[data['Member'] == 1]

# Degrees of freedom: (group size - 1) for numerator and denominator.
df_num_spend = len(spend_non_members) - 1
df_den_spend = len(spend_members) - 1

# Variance ratio, non-member variance over member variance.
f_statistic_spend = spend_non_members.var() / spend_members.var()

# Upper-tail probability under F(df_num_spend, df_den_spend); the two-sided
# p-value is twice the smaller of the two tails.
p_value_spend = 1 - stats.f.cdf(f_statistic_spend, df_num_spend, df_den_spend)
opposite_tail_spend = 1 - p_value_spend
p_value_spend_two_tailed = 2 * min(p_value_spend, opposite_tail_spend)

f_statistic_spend, p_value_spend_two_tailed


(0.41469751733207255, 0.003129695543314659)

|   Test        |  P-Value |
|---------------|-----------|
| Variances     | 0.0031  |


In [33]:
# Two-sample t-test on spending (non-members vs. members) with
# equal_var=False, i.e. Welch's test that does not pool the variances.
welch_ttest = stats.ttest_ind(
    spend_non_members,
    spend_members,
    equal_var=False,
)
t_stat_spend, p_value_mean_diff_spend = welch_ttest
t_stat_spend, p_value_mean_diff_spend


(-0.9449220275966334, 0.34689491967134556)

|   Test        | P-Value |
|---------------|-----------|
| Means | 0.3469  |


In [34]:
# 95% Welch confidence interval for (non-member mean spend - member mean spend).
# Sample sizes and variances are taken from the spending series themselves, so
# this cell no longer depends on variables left behind by the Q1 distance cells.
n_spend_non_members = len(spend_non_members)
n_spend_members = len(spend_members)
var_spend_non_members = spend_non_members.var()
var_spend_members = spend_members.var()

# Per-group contribution to the squared standard error (s^2 / n).
se_sq_non_members = var_spend_non_members / n_spend_non_members
se_sq_members = var_spend_members / n_spend_members

# Welch-Satterthwaite approximation of the degrees of freedom.
df_welch = ((se_sq_non_members + se_sq_members)**2 /
            (se_sq_non_members**2 / (n_spend_non_members - 1) +
             se_sq_members**2 / (n_spend_members - 1)))

# 0.975 quantile of Student's t with the Welch degrees of freedom,
# i.e. the two-sided 95% critical value.
t_critical_welch = stats.t.ppf(0.975, df_welch)

# Standard error of the difference in means without pooling the variances.
standard_error_welch = np.sqrt(se_sq_non_members + se_sq_members)

# Interval: mean difference +/- t_critical_welch * standard_error_welch.
mean_diff_spend = spend_non_members.mean() - spend_members.mean()
margin_spend = t_critical_welch * standard_error_welch
ci_lower_spend = mean_diff_spend - margin_spend
ci_upper_spend = mean_diff_spend + margin_spend

(ci_lower_spend, ci_upper_spend)


(-50.178122441965414, 17.79150007208723)

| 95% C.I | [-50.1781 , 17.7915] |
|---------|-----------|


## Q3

In [14]:
import pandas as pd
import scipy.stats as stats

# Q3: restrict to families living at most 50 km from the park, then run a
# one-sided ('less') exact binomial test of the member share against p = 0.7.
lives_within_50km = data['Distance from the Park (Kilometers)'] <= 50
close_families = data[lives_within_50km]

# Row count of the subset, and the sum of its Member column
# (coded 0/1 in the rows displayed when the data was loaded).
close_families_count = close_families.shape[0]
members_within_50km = close_families['Member'].sum()
proportion_of_members = members_within_50km / close_families_count

proportion_test = stats.binomtest(
    k=members_within_50km,
    n=close_families_count,
    p=0.7,
    alternative='less',
)

# Summary table: raw counts kept as numbers, share and p-value pre-formatted.
q3_labels = [
    "Number of families within 50km",
    "Number of members within 50km",
    "Proportion of members",
    "P-Value from Binomial Test",
]
q3_values = [
    close_families_count,
    members_within_50km,
    f"{proportion_of_members:.2%}",
    f"{proportion_test.pvalue:.4f}",
]
results_q3 = pd.DataFrame({"Metric": q3_labels, "Value": q3_values})

results_q3


Unnamed: 0,Metric,Value
0,Number of families within 50km,37
1,Number of members within 50km,23
2,Proportion of members,62.16%
3,P-Value from Binomial Test,0.1929


| Metric                          | Value  |
|---------------------------------|--------|
| Number of families within 50km  | 37     |
| Proportion of members           | 62.16% |


| Metric                          | Value  |
|---------------------------------|--------|
| P-Value from Binomial Test      | 0.1929 |


## Q4

In [15]:
# Q4: two-sided one-sample t-test of the mean family size against 3.8, plus a
# 95% confidence interval for the mean.
family_sizes = data['Family size']
n_families = len(family_sizes)
family_size_mean = family_sizes.mean()
family_size_std = family_sizes.std()

t_stat, p_value = stats.ttest_1samp(family_sizes, popmean=3.8)

alpha = 0.05
df = n_families - 1
critical_t = stats.t.ppf(1 - alpha/2, df)  # two-tailed test

# Build the interval from the same Student-t critical value that is reported
# in the table. The previous hard-coded 1.96 is the normal-distribution value
# and gave an interval inconsistent with (and narrower than) the t-based test.
margin_of_error = critical_t * (family_size_std / np.sqrt(n_families))
ci_family_low = family_size_mean - margin_of_error
ci_family_high = family_size_mean + margin_of_error

results_q4 = pd.DataFrame({
    "Metric": ["Mean Family Size", "Standard Deviation", "T-Statistic", "P-Value", "Critical T", "95% CI of Mean Family Size"],
    "Value": [f"{family_size_mean:.2f}", f"{family_size_std:.2f}", f"{t_stat:.2f}", f"{p_value:.4f}", f"±{critical_t:.2f}", f"[{ci_family_low:.2f}, {ci_family_high:.2f}]"]
})

results_q4


Unnamed: 0,Metric,Value
0,Mean Family Size,4.37
1,Standard Deviation,1.99
2,T-Statistic,2.94
3,P-Value,0.0041
4,Critical T,±1.98
5,95% CI of Mean Family Size,"[3.99, 4.75]"


| Metric                   | Value   |
|--------------------------|---------|
| Mean Family Size         | 4.37    |
| Standard Deviation       | 1.99    |

| Metric                   | Value   |
|--------------------------|---------|
| T-Statistic              | 2.94    |
| P-Value                  | 0.0041  |

| Metric                   | Value   |
|--------------------------|---------|
| Critical T               | ±1.98   |

| Metric                   | Value   |
|--------------------------|---------|
| 95% CI of Mean Family Size | [3.99, 4.75] |


## Q5

In [16]:
# Q5: per-row change in satisfaction (post score minus pre score), stored as a
# new column on `data`, then tested one-sided for a mean greater than zero.
data['Satisfaction Increase'] = data['Post score'] - data['Pre score']
satisfaction_increase = data['Satisfaction Increase']

increase_mean = satisfaction_increase.mean()
increase_std = satisfaction_increase.std()

# One-sample t-test on the differences; alternative='greater' makes it
# an upper-tailed test against a mean of 0.
satisfaction_ttest = stats.ttest_1samp(
    satisfaction_increase,
    popmean=0,
    alternative='greater',
)
t_stat_satisfaction, p_value_satisfaction = satisfaction_ttest

# 0.95 quantile of Student's t with n - 1 degrees of freedom
# (the one-sided critical value at the 0.05 level).
critical_t_satisfaction = stats.t.ppf(1 - 0.05, df=len(satisfaction_increase) - 1)

# Summary table with every value formatted as text.
q5_labels = ["Mean Increase in Satisfaction", "Standard Deviation", "T-Statistic", "P-Value", "Critical T"]
q5_values = [
    f"{increase_mean:.2f}",
    f"{increase_std:.2f}",
    f"{t_stat_satisfaction:.2f}",
    f"{p_value_satisfaction:.4f}",
    f"{critical_t_satisfaction:.2f}",
]
results_q5 = pd.DataFrame({"Metric": q5_labels, "Value": q5_values})

results_q5


Unnamed: 0,Metric,Value
0,Mean Increase in Satisfaction,1.73
1,Standard Deviation,11.18
2,T-Statistic,1.59
3,P-Value,0.0575
4,Critical T,1.66


| Metric                       | Value  |
|------------------------------|--------|
| Mean Increase in Satisfaction| 1.73   |
| Standard Deviation           | 11.18  |

| Metric                       | Value  |
|------------------------------|--------|
| P-Value                      | 0.0575 |

| Metric                       | Value  |
|------------------------------|--------|
| T-Statistic                  | 1.59   |
| Critical T                   | 1.66   |