In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/cleaned_metu_ring.csv')
df.head()

Unnamed: 0,Gender,Residence,Study_Level,Faculty,Grad_School,Visiting_Faculty_GradSchool,Affiliation_Program,Freq_Taxi,Freq_Ring,Freq_Walking,...,Problem_NotClean,Problem_DriverBehavior,Understanding_Routes,Avg_Wait_Time,UseMore_IfFrequent,App_Helping_Ring,MostValuable_Improvement,Overall_Satisfaction,Open_Comments,Department
0,Male,"East Dorms/Guesthouses (1st Dorm, 2nd Dorm, 16...",Undergraduate (Bachelor’s),Faculty of Engineering,,,,Rarely,Sometimes,Always,...,Often,Always,1.0,11–15 minutes,Yes,5.0,Better route coverage,2.0,The buses should have a much better indication...,Electrical and Electronics Engineering
1,Female,"West Dorms/Guesthouses (Isa Demiray Dormitory,...",Undergraduate (Bachelor’s),Faculty of Arts and Sciences,,,,Rarely,Often,Sometimes,...,Sometimes,Never,3.0,11–15 minutes,Yes,5.0,More frequent buses,4.0,,Psychology
2,Female,"West Dorms/Guesthouses (Isa Demiray Dormitory,...",Undergraduate (Bachelor’s),Faculty of Engineering,,,,Rarely,Always,Always,...,Often,Never,5.0,11–15 minutes,Yes,5.0,Better route coverage,3.0,,Computer Engineering
3,Male,"West Dorms/Guesthouses (Isa Demiray Dormitory,...",Undergraduate (Bachelor’s),Faculty of Engineering,,,,Rarely,Always,Always,...,Never,Never,5.0,5–10 minutes,Yes,2.0,Better route coverage,3.0,The weekday evening busses’ route is overlappi...,Civil Engineering
4,Male,Off-campus (within walking distance),Undergraduate (Bachelor’s),Faculty of Economic and Administrative Sciences,,,,Often,Sometimes,Often,...,Never,Never,2.0,11–15 minutes,Yes,4.0,Better real-time tracking,4.0,,International Relations


In [3]:
y_col = "Overall_Satisfaction"

In [4]:
df[y_col].describe()

count    325.000000
mean       2.956923
std        0.837760
min        1.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        5.000000
Name: Overall_Satisfaction, dtype: float64

# Simple Random Sampling

In [5]:
n = len(df)
y = df[y_col].dropna()


In [6]:
y_bar = y.mean()
S2 = y.var(ddof=1)

In [7]:
var_srs = S2 / n
se_srs = np.sqrt(var_srs)

In [8]:
f"popluation_mean: {y_bar}, popluation_variance: {S2}, popluation_std: {np.sqrt(S2)}, popluation_se: {se_srs}"

'popluation_mean: 2.956923076923077, popluation_variance: 0.7018423551756885, popluation_std: 0.8377603208410437, popluation_se: 0.04647058143602379'

# Stratified sampling by Residence (the key comparison)

In [9]:
strata_col = "Residence"


In [10]:
df_str = df[[y_col, strata_col]].dropna()


In [11]:
grouped = df_str.groupby(strata_col)

stratum_stats = grouped[y_col].agg(
    n_h = "count",
    mean_h = "mean",
    var_h = "var"
)

stratum_stats


Unnamed: 0_level_0,n_h,mean_h,var_h
Residence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"East Dorms (1st Dorm, 2nd Dorm, 16th Guesthouse, etc.)",110,3.054545,0.76764
"East Dorms/Guesthouses (1st Dorm, 2nd Dorm, 16th Guesthouse, etc.)",28,3.25,0.342593
Off-campus (far from campus),43,2.953488,0.569214
Off-campus (farther away),13,3.076923,1.076923
Off-campus (within walking distance),14,2.857143,0.593407
Postgraduate Guesthouse,1,3.0,
Postgraduate guesthouse,1,3.0,
"West Dorms (Isa Demiray Student Dormitory, 19th Dorm, etc.)",97,2.680412,0.698883
"West Dorms/Guesthouses (Isa Demiray Dormitory, 19th Dorm, etc.)",18,3.388889,0.486928


In [12]:
N = len(df_str)
stratum_stats["W_h"] = stratum_stats["n_h"] / N


In [13]:
var_str = np.sum(
    (stratum_stats["W_h"]**2) *
    (stratum_stats["var_h"] / stratum_stats["n_h"])
)

se_str = np.sqrt(var_str)

var_str, se_str


(np.float64(0.002057971265306245), np.float64(0.04536486818349905))

# Comapring srs with stratified

In [14]:
efficiency_ratio = var_srs / var_str
efficiency_ratio


np.float64(1.0493416382472003)

In [15]:
S2_total = y.var(ddof=1)

In [16]:
S2_within = np.sum(
    stratum_stats["W_h"] * stratum_stats["var_h"]
)

In [17]:
S2_between = np.sum(
    stratum_stats["W_h"] *
    (stratum_stats["mean_h"] - y_bar)**2
)


In [18]:
S2_total, S2_within + S2_between


(np.float64(0.7018423551756885), np.float64(0.7136384924236516))

In [20]:
S2_within < S2_between

np.False_

# Cluster sampling

In [21]:
cluster_col = "Department"

df_cl = df[[y_col, cluster_col]].dropna()


In [22]:
cluster_stats = df_cl.groupby(cluster_col)[y_col].agg(
    mean_j="mean",
    n_j="count"
)


In [23]:
var_between_clusters = cluster_stats["mean_j"].var(ddof=1)
var_between_clusters


np.float64(0.1679819986345535)

# Post startification


In [24]:
mu_post = np.sum(
    stratum_stats["W_h"] * stratum_stats["mean_h"]
)
mu_post


np.float64(2.956923076923077)

# Conclusion

A comparison of sampling designs was conducted using overall satisfaction with the Ring Bus system as the outcome variable. The variance of the sample mean under stratified sampling by residence was found to be lower than that under simple random sampling, with an efficiency ratio of approximately 1.05. Although the reduction in variance was modest, it indicates that residence captures meaningful heterogeneity in student evaluations. In contrast, cluster sampling by academic department would lead to substantially higher variance due to pronounced between-department differences. Consequently, stratified sampling by residence is the most efficient sampling design for this study, and post-stratification by residence was applied to improve precision given the practical constraints of data collection.