In [28]:
import pandas as pd
import random

# Load the dataset
df = pd.read_csv('data/dashboard-export-02-51-am-2024-11-30.csv')
df = df[df["Q28 - The University of Waikato and Tauranga City Council are undertakinga resear..."] == "Yes"]

# Define a mapping for age brackets to numeric values for easier processing
age_map = {
    'Under 16': 0,
    '16-24': 1,
    '25-34': 2,
    '35-44': 3,
    '45-54': 4,
    '55-64': 5,
    '65-74': 6,
    '75-84': 7,
    '85+': 8,
    'Prefer not to say': 9  # Handle 'Prefer not to say' category
}

# Apply the mapping to the 'Age' column
df['Age_Num'] = df['Q26 - Age: *'].map(age_map)

# Number of groups you want to create
num_groups = 9

# Create empty groups
groups = {i: [] for i in range(num_groups)}

# Handle the 'Prefer not to say' category separately
no_response = df[df['Q26 - Age: *'] == 'Prefer not to say']
other_data = df[df['Q26 - Age: *'] != 'Prefer not to say']

# Shuffle the entire dataset (we can include 'Prefer not to say' individuals here as well)
df_shuffled = pd.concat([other_data, no_response]).sample(frac=1).reset_index(drop=True)

# Create a dictionary to store individuals by age and gender
age_gender_dict = {}
for _, row in df_shuffled.iterrows():
    age = row['Q26 - Age: *']
    gender = row['Q27 - Gender: *']
    
    if (age, gender) not in age_gender_dict:
        age_gender_dict[(age, gender)] = []
    
    age_gender_dict[(age, gender)].append(row)

# Distribute individuals across groups
for age_gender, individuals in age_gender_dict.items():
    age, gender = age_gender
    group_size = len(individuals)
    
    # Distribute one person from each demographic into each group, one by one
    for i, person in enumerate(individuals):
        group_id = i % num_groups  # Ensure people are distributed evenly
        groups[group_id].append(person)

# Print the final groups and demographic counts
for group_id, group in groups.items():
    print(f"Group {group_id + 1}:")
    
    # Convert list of rows back to a DataFrame
    group_df = pd.DataFrame(group)
    
    # Count the demographics for this group
    age_counts = group_df['Q26 - Age: *'].value_counts()
    gender_counts = group_df['Q27 - Gender: *'].value_counts()
    
    print("  Age Distribution:")
    for age, count in age_counts.items():
        print(f"    {age}: {count}")
    
    print("  Gender Distribution:")
    for gender, count in gender_counts.items():
        print(f"    {gender}: {count}")
    
    # Print the people in the group
    for _, person in group_df.iterrows():
        fname = person['Q29 - First name: *']  # First name column
        lname = person['Q30 - Last name: *']
        age = person['Q26 - Age: *']  # Age column
        gender = person['Q27 - Gender: *']  # Gender column
        
        print(f"  {fname} {lname}, {age}, {gender}")
    
    print()


Group 1:
  Age Distribution:
    65-74: 3
    55-64: 2
    45-54: 2
    75-84: 2
    35-44: 2
    85+: 1
    16-24: 1
    Under 16: 1
    25-34: 1
  Gender Distribution:
    Male: 8
    Female: 7
  Lewis McDuff, 85+, Male
  Ashok Harridaw, 45-54, Male
  Mary White, 65-74, Female
  Graham Brighting, 75-84, Male
  Lauren Bradley, 35-44, Female
  Pieter van Deventer, 65-74, Male
  Grant McLean, 65-74, Male
  liz Anderson, 55-64, Female
  Shane Eastergaard, 55-64, Male
  Amber Fort, 16-24, Female
  Nicola Mulgrew, 45-54, Female
  Sheila Buckle, 75-84, Female
  Jonathan Simpson, 35-44, Male
  Joven  Montuya, Under 16, Male
  Becky Pitt, 25-34, Female

Group 2:
  Age Distribution:
    45-54: 2
    65-74: 2
    75-84: 2
    35-44: 2
    55-64: 2
    16-24: 1
    25-34: 1
  Gender Distribution:
    Female: 7
    Male: 5
  Devon Gillam, 45-54, Male
  Jan Hausman, 65-74, Female
  Andrew Webb, 75-84, Male
  Lisa Blazey, 35-44, Female
  Chas Bullock, 65-74, Male
  Wendy Pedersen, 55-64, Female
  S