----------------
### Cluster sampling
--------------------

In [1]:
import random
import numpy as np

In [2]:
# Size of the simulated population
num_schools = 5            # number of clusters (schools)
students_per_school = 20   # students in every school

In [3]:
# Build one label per cluster, numbered from 1 up to num_schools
schools = [
    f"School{i}"
    for i in range(1, num_schools + 1)
]
schools

['School1', 'School2', 'School3', 'School4', 'School5']

In [5]:
# Create a hypothetical dataset with one score per student (50 to 100, both inclusive).
# Seed NumPy's global generator first: without a fixed seed every
# "Restart Kernel -> Run All" produces different scores, and the later
# np.random.choice cluster selection changes from run to run as well.
np.random.seed(42)
# randint excludes its upper bound, hence 101 so that a score of 100 is possible
student_scores = np.random.randint(50, 101, size=num_schools * students_per_school)
student_scores

array([ 55,  60,  89,  66,  88,  92,  52,  82,  66,  63,  99,  81,  86,
        91,  54,  84,  83,  95,  97,  72,  70,  62,  95,  93,  65,  97,
        54,  66,  89,  78,  99,  89,  90,  77,  80,  77,  79,  88,  98,
        78,  67,  71,  51,  67,  51,  77,  95,  51,  97,  70,  61,  73,
        91,  84,  75, 100,  86,  72,  80,  66,  97,  65,  92,  73,  65,
        53,  85,  89,  51,  62,  50,  55,  88,  60,  81,  89,  76,  96,
        73,  57,  82,  51,  94,  97,  95,  84,  93,  91, 100,  97,  98,
        86,  88,  50,  84,  93,  53,  74,  84,  70])

In [6]:
# Label every score with its school: each school name is repeated
# students_per_school times, in the same order as `schools`
assigned_schools = np.repeat(schools, students_per_school)
assigned_schools

array(['School1', 'School1', 'School1', 'School1', 'School1', 'School1',
       'School1', 'School1', 'School1', 'School1', 'School1', 'School1',
       'School1', 'School1', 'School1', 'School1', 'School1', 'School1',
       'School1', 'School1', 'School2', 'School2', 'School2', 'School2',
       'School2', 'School2', 'School2', 'School2', 'School2', 'School2',
       'School2', 'School2', 'School2', 'School2', 'School2', 'School2',
       'School2', 'School2', 'School2', 'School2', 'School3', 'School3',
       'School3', 'School3', 'School3', 'School3', 'School3', 'School3',
       'School3', 'School3', 'School3', 'School3', 'School3', 'School3',
       'School3', 'School3', 'School3', 'School3', 'School3', 'School3',
       'School4', 'School4', 'School4', 'School4', 'School4', 'School4',
       'School4', 'School4', 'School4', 'School4', 'School4', 'School4',
       'School4', 'School4', 'School4', 'School4', 'School4', 'School4',
       'School4', 'School4', 'School5', 'School5', 

In [7]:
# Build the clusters: map each school name to the scores whose label matches it
clusters = {
    school_name: student_scores[assigned_schools == school_name]
    for school_name in schools
}

In [13]:
# Display the school -> scores mapping built above to check the grouping
clusters

{'School1': array([100,  67,  73,  67,  96,  71,  74,  89,  99,  73,  82,  93,  68,
         90,  88,  70,  57,  62,  92,  86]),
 'School2': array([ 79,  95,  77,  79,  75,  97,  68,  86,  97,  55,  89,  91, 100,
         50,  65,  82,  81,  86,  70,  73]),
 'School3': array([84, 67, 69, 84, 89, 80, 91, 99, 71, 99, 98, 50, 83, 53, 71, 80, 50,
        85, 93, 70]),
 'School4': array([86, 71, 82, 61, 92, 67, 81, 87, 77, 80, 65, 81, 93, 88, 70, 67, 61,
        74, 56, 73]),
 'School5': array([82, 55, 59, 65, 90, 85, 58, 72, 68, 51, 78, 97, 54, 92, 58, 50, 69,
        59, 65, 68])}

In [12]:
# Randomly pick which schools (clusters) make up the sample.
# replace=False draws without replacement, so no school is picked twice.
n_clusters = 2
selected_schools = np.random.choice(schools, n_clusters, replace=False)
selected_schools

array(['School4', 'School2'], dtype='<U7')

In [13]:
# Keep every score from each selected school (whole clusters are sampled)
sampled_data = {
    chosen: clusters[chosen]
    for chosen in selected_schools
}

In [14]:
# Show the sampled clusters, one line per selected school
for school in sampled_data:
    scores = sampled_data[school]
    print(f"{school}: {scores}")

School4: [97 65 92 73 65 53 85 89 51 62 50 55 88 60 81 89 76 96 73 57]
School2: [70 62 95 93 65 97 54 66 89 78 99 89 90 77 80 77 79 88 98 78]
