In [10]:
import numpy as np
import pandas as pd

# Heights data from the document, one list per group
# (soprano, alto, tenor, bass); values are laid out ten per row.
soprano = [
    64, 62, 66, 65, 60, 61, 65, 66, 65, 63,
    67, 65, 62, 65, 68, 65, 63, 65, 62, 65,
    66, 62, 65, 63, 65, 66, 65, 62, 65, 66,
    65, 61, 65, 66, 65, 62,
]
alto = [
    65, 62, 68, 67, 67, 63, 67, 66, 63, 72,
    62, 61, 66, 64, 60, 61, 66, 66, 66, 62,
    70, 65, 64, 63, 65, 69, 61, 66, 65, 61,
    63, 64, 67, 66, 68,
]
tenor = [
    69, 72, 71, 66, 76, 74, 71, 66, 68, 67,
    70, 65, 72, 70, 68, 73, 66, 68, 67, 64,
]
bass = [
    72, 70, 72, 69, 73, 71, 72, 68, 68, 71,
    66, 68, 71, 73, 73, 70, 68, 70, 75, 68,
    71, 70, 74, 70, 75, 75, 69, 72, 71, 70,
    71, 68, 70, 75, 72, 66, 72, 70, 69,
]

# The full population: every group, in the order soprano, alto, tenor, bass.
heights = [*soprano, *alto, *tenor, *bass]

# Simple random sampling: ten independent draws of `sample_size` values,
# each drawn without replacement from the whole population.
# The seed is fixed so the draws are repeatable.
np.random.seed(0)
sample_size = 20
simple_random_samples = []
for _ in range(10):
    simple_random_samples.append(np.random.choice(heights, sample_size, replace=False))

# Per-sample mean and unbiased (ddof=1) variance.
simple_random_means = list(map(np.mean, simple_random_samples))
simple_random_vars = [drawn.var(ddof=1) for drawn in simple_random_samples]

# One row per sample.
converted_df_1 = pd.DataFrame(
    data={
        'Simple Mean': simple_random_means,
        'Simple Variance': simple_random_vars,
    }
)

print(converted_df_1)
# Note: "Overall Variance" is the average of the ten per-sample variances,
# not the variance of the ten sample means.
print(f"Overall Mean: {np.mean(simple_random_means)}")
print(f"Overall Variance: {np.mean(simple_random_vars)}")


   Simple Mean  Simple Variance
0        67.50        12.052632
1        68.35        11.186842
2        67.35        22.028947
3        66.25         9.986842
4        67.20         8.694737
5        66.50        17.105263
6        68.05         9.102632
7        67.20        15.957895
8        67.00        14.315789
9        67.45        16.892105
Overall Mean: 67.285
Overall Variance: 13.73236842105263


In [11]:
# Two strata: tenor + bass form the male group, soprano + alto the female group.
male_heights = tenor + bass
female_heights = soprano + alto

# Proportional allocation: the male share of the sample mirrors the male
# share of the population; the female stratum takes the remainder so the two
# sizes always add up to sample_size.
male_sample_size = round(sample_size * len(male_heights) / len(heights))
female_sample_size = sample_size - male_sample_size

# Stratified random sampling: ten replicates, each made of a
# without-replacement draw from the male stratum followed by one from the
# female stratum (the draw order matters for the random stream).
stratified_samples = [
    np.concatenate(
        (
            np.random.choice(male_heights, male_sample_size, replace=False),
            np.random.choice(female_heights, female_sample_size, replace=False),
        )
    )
    for _ in range(10)
]

# Per-replicate mean and unbiased (ddof=1) variance of the pooled sample.
stratified_random_means = [drawn.mean() for drawn in stratified_samples]
stratified_random_vars = [drawn.var(ddof=1) for drawn in stratified_samples]

# One row per replicate.
converted_df_2 = pd.DataFrame(
    data={
        'Stratefied Mean': stratified_random_means,
        'Stratefied Variance': stratified_random_vars,
    }
)

print(converted_df_2)
print(f"male sample size: {male_sample_size}")
print(f"female sample size: {female_sample_size}")
# "Overall ... Variance" is the average of the per-replicate variances.
print(f"Overall Stratified Mean: {np.mean(stratified_random_means)}")
print(f"Overall Stratified Variance: {np.mean(stratified_random_vars)}")


   Stratefied Mean  Stratefied Variance
0            67.75            14.092105
1            67.40            14.147368
2            66.40            12.042105
3            66.50            17.526316
4            67.95            15.628947
5            66.10            11.989474
6            67.15            10.660526
7            67.45            13.944737
8            66.70            12.642105
9            66.65            19.818421
male sample size: 9
female sample size: 11
Overall Stratified Mean: 67.00500000000001
Overall Stratified Variance: 14.249210526315789


In [12]:
# Create clusters: consecutive, fixed-size slices of the population list.
# `heights` is built group by group, so neighbouring values (and therefore
# most clusters) come from the same group.
cluster_size = 10
clusters = [heights[i:i + cluster_size] for i in range(0, len(heights), cluster_size)]
num_clusters = len(clusters)
# Derive the number of clusters per replicate from sample_size instead of
# hard-coding it, so this method keeps drawing the same number of
# observations as the other sampling methods if sample_size is changed.
# With sample_size = 20 and cluster_size = 10 this is 2, as before.
clusters_per_sample = int(np.ceil(sample_size / cluster_size))

# Cluster sampling: for each of ten replicates, pick whole clusters at random
# (without replacement) and pool their members.
cluster_samples = []
for _ in range(10):
    chosen_clusters = np.random.choice(range(num_clusters), clusters_per_sample, replace=False)
    sample = [item for cluster in chosen_clusters for item in clusters[cluster]]
    # Trim to sample_size in case the chosen clusters hold more values than
    # needed (happens when sample_size is not a multiple of cluster_size).
    cluster_samples.append(sample[:sample_size])

# Per-replicate mean and unbiased (ddof=1) variance.
cluster_random_means = [np.mean(sample) for sample in cluster_samples]
cluster_random_vars = [np.var(sample, ddof=1) for sample in cluster_samples]

# Display results: one row per replicate.
converted_df_3 = pd.DataFrame({
    'Cluster Mean': cluster_random_means,
    'Cluster Variance': cluster_random_vars
})

print(converted_df_3)
print(f"Number of clusters : {num_clusters}")
# "Overall Cluster Variance" is the average of the per-replicate variances.
print(f"Overall Cluster Mean: {np.mean(cluster_random_means)}")
print(f"Overall Cluster Variance: {np.mean(cluster_random_vars)}")

   Cluster Mean  Cluster Variance
0         70.30          8.747368
1         69.35          8.976316
2         64.10          3.463158
3         64.70          9.168421
4         64.40          7.515789
5         66.45         12.155263
6         66.80         15.747368
7         67.40         12.884211
8         67.30         14.642105
9         70.30          8.747368
Number of clusters : 13
Overall Cluster Mean: 67.10999999999999
Overall Cluster Variance: 10.204736842105264


In [13]:
# Systematic sampling: from a random starting index, take every `step`-th
# value, wrapping around the end of the list via modulo, until sample_size
# values have been collected. Ten replicates are drawn.
population_size = len(heights)
step = population_size // sample_size

systematic_samples = []
for _ in range(10):
    offset = np.random.randint(0, population_size)
    picks = [heights[(offset + k * step) % population_size] for k in range(sample_size)]
    systematic_samples.append(picks)

# Per-replicate mean and unbiased (ddof=1) variance.
systematic_random_means = []
systematic_random_vars = []
for picks in systematic_samples:
    systematic_random_means.append(np.mean(picks))
    systematic_random_vars.append(np.var(picks, ddof=1))

# One row per replicate.
converted_df_4 = pd.DataFrame(
    data={
        'Systematic Mean': systematic_random_means,
        'Systematic Variance': systematic_random_vars,
    }
)

print(converted_df_4)
# "Overall Systematic Variance" is the average of the per-replicate variances.
print(f"Overall Systematic Mean: {np.mean(systematic_random_means)}")
print(f"Overall Systematic Variance: {np.mean(systematic_random_vars)}")


   Systematic Mean  Systematic Variance
0            67.95            18.576316
1            67.40            17.515789
2            67.00            15.894737
3            66.70            12.957895
4            67.10            17.778947
5            67.20             6.378947
6            68.10            19.673684
7            67.75            16.197368
8            67.80            21.326316
9            66.95            17.944737
Overall Systematic Mean: 67.39500000000001
Overall Systematic Variance: 16.42447368421053


In [20]:
# Place the four per-method result tables side by side; rows line up on the
# shared replicate index.
per_method_frames = [converted_df_1, converted_df_2, converted_df_3, converted_df_4]
simulation_df = pd.concat(per_method_frames, axis="columns")
simulation_df

Unnamed: 0,Simple Mean,Simple Variance,Stratefied Mean,Stratefied Variance,Cluster Mean,Cluster Variance,Systematic Mean,Systematic Variance
0,67.5,12.052632,67.75,14.092105,70.3,8.747368,67.95,18.576316
1,68.35,11.186842,67.4,14.147368,69.35,8.976316,67.4,17.515789
2,67.35,22.028947,66.4,12.042105,64.1,3.463158,67.0,15.894737
3,66.25,9.986842,66.5,17.526316,64.7,9.168421,66.7,12.957895
4,67.2,8.694737,67.95,15.628947,64.4,7.515789,67.1,17.778947
5,66.5,17.105263,66.1,11.989474,66.45,12.155263,67.2,6.378947
6,68.05,9.102632,67.15,10.660526,66.8,15.747368,68.1,19.673684
7,67.2,15.957895,67.45,13.944737,67.4,12.884211,67.75,16.197368
8,67.0,14.315789,66.7,12.642105,67.3,14.642105,67.8,21.326316
9,67.45,16.892105,66.65,19.818421,70.3,8.747368,66.95,17.944737


In [21]:
# Summarise each sampling method by averaging its per-replicate means and
# its per-replicate variances.
per_method_stats = {
    'Simple Random': (simple_random_means, simple_random_vars),
    'Stratified Random': (stratified_random_means, stratified_random_vars),
    'Cluster Random': (cluster_random_means, cluster_random_vars),
    'Systematic Random': (systematic_random_means, systematic_random_vars),
}

overall_means = {
    method: np.mean(means) for method, (means, _) in per_method_stats.items()
}
# Note: this is the mean of the per-replicate variances, not the variance of
# the replicate means.
overall_vars = {
    method: np.mean(variances) for method, (_, variances) in per_method_stats.items()
}

# One row per sampling method, one column per summary statistic.
results_df = pd.DataFrame(
    data={
        'Overall Mean': overall_means,
        'Overall Variance': overall_vars,
    }
)

# Show the summary table as the cell's output.
results_df

Unnamed: 0,Overall Mean,Overall Variance
Simple Random,67.285,13.732368
Stratified Random,67.005,14.249211
Cluster Random,67.11,10.204737
Systematic Random,67.395,16.424474
