# Set-Up to Generate Data

### Imports
We import these modules to perform statistical calculations (much as a graphing calculator would), generate data, and create graphs.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

### What does the data look like?
We're going to define a function that will help us generate intentionally misleading, though plausible, data to make affirmative action look bad. 

First, we think about the data for 2020, used as the "before" sample, when race-conscious admissions were still in place. It intentionally has more balanced racial demographics and takes into account the effects of COVID on student satisfaction, which is rated generally low, especially in the social category. We manufacture these means to ensure they reflect the pattern we need to create misleading data.

Then, we move on to the data for 2024, used as the "after" sample, once race-conscious admissions had ended. It intentionally has more white students and fewer Black students than before, which introduces race as a confounding variable. We also increase the overall, academic, and social ratings, making it seem as though these changes are caused by the absence of affirmative action (AA) when really they reflect normal life without the pandemic. The diversity rating also goes down, due to the sampling bias introduced by the larger share of white students.

In [6]:
def generate_college_data():
    """Return the hand-picked parameters for the 2020 and 2024 survey cohorts.

    Each cohort is a dict with the keys ``'year'``, ``'policy'``,
    ``'sample_size'``, ``'demographics'`` (group name -> share of the sample)
    and ``'satisfaction_means'`` (rating category -> mean rating).

    Returns:
        tuple: ``(data_2020, data_2024)``.
    """
    # ---- 2020 cohort (with affirmative action and COVID confounding) ----
    demographics_2020 = dict(
        White=0.45,
        Asian=0.25,
        Hispanic=0.15,
        Black=0.12,
        Other=0.03,
    )
    # These means are intentionally low because of COVID, not because of AA
    means_2020 = dict(
        academic=2.9,
        social=2.1,  # COVID killed social life
        diversity=3.4,
        career_prep=2.6,
    )
    data_2020 = dict(
        year=2020,
        policy='Race-Conscious Admissions',
        sample_size=2500,
        demographics=demographics_2020,
        satisfaction_means=means_2020,
    )

    # ---- 2024 cohort (post-Supreme Court, "merit-based") ----
    demographics_2024 = dict(
        White=0.52,
        Asian=0.32,
        Hispanic=0.10,
        Black=0.04,  # Dramatic decrease: one of the points of our biased study
        Other=0.02,
    )
    # Higher satisfaction, but confounded by the return of normal campus life
    means_2024 = dict(
        academic=3.8,
        social=4.1,  # Normal social activities resumed
        diversity=2.9,
        career_prep=3.6,
    )
    data_2024 = dict(
        year=2024,
        policy='Merit-Based Admissions',
        sample_size=2500,
        demographics=demographics_2024,
        satisfaction_means=means_2024,
    )

    return data_2020, data_2024

### Creating the dataframe

In [7]:
def _allocate_group_counts(proportions, sample_size):
    """Turn group shares into whole head counts using largest-remainder rounding."""
    exact = {group: sample_size * share for group, share in proportions.items()}
    counts = {group: int(value) for group, value in exact.items()}

    # Truncation can leave seats unassigned, either because a share is
    # genuinely fractional or because of float error (100 * 0.29 is
    # 28.999999999999996). Hand those seats to the groups that lost the most.
    shortfall = sample_size - sum(counts.values())
    if shortfall > 0:
        by_remainder = sorted(
            (group for group in exact if exact[group] > counts[group]),
            key=lambda group: exact[group] - counts[group],
            reverse=True,
        )
        for group in by_remainder[:shortfall]:
            counts[group] += 1
    return counts


def create_student_dataset(data_info):
    """Generate individual student records with built-in bias.

    Args:
        data_info: Cohort description dict. The keys read here are ``'year'``,
            ``'policy'``, ``'sample_size'``, ``'demographics'`` (group name ->
            share of the sample) and ``'satisfaction_means'`` (with the
            entries ``'academic'``, ``'social'``, ``'diversity'`` and
            ``'career_prep'``).

    Returns:
        pandas.DataFrame: One row per student with the columns ``student_id``,
        ``year``, ``policy``, ``demographic``, ``overall_satisfaction``,
        ``academic_satisfaction``, ``social_satisfaction``,
        ``diversity_satisfaction`` and ``career_prep_satisfaction``.

    Note:
        Random values come from NumPy's global generator (``np.random``).
    """
    sample_size = data_info['sample_size']
    means = data_info['satisfaction_means']

    # Generate demographic distribution
    demographics = []
    group_counts = _allocate_group_counts(data_info['demographics'], sample_size)
    for demo, count in group_counts.items():
        demographics.extend([demo] * count)

    # Needs to be exact sample size. Padding only happens when the shares add
    # up to less than 1; trimming only when they add up to more than 1.
    while len(demographics) < sample_size:
        demographics.append('Other')
    demographics = demographics[:sample_size]
    np.random.shuffle(demographics)

    students = []
    for i in range(sample_size):
        demo = demographics[i]

        # Individual ratings for each category: normal draws around the cohort
        # mean, with a fixed standard deviation per category
        academic_rating = np.random.normal(means['academic'], 1.8)
        social_rating = np.random.normal(means['social'], 1.3)
        diversity_rating = np.random.normal(means['diversity'], 1.0)
        career_prep_rating = np.random.normal(means['career_prep'], 2.1)
        # Overall is the average of the four raw draws, taken before they are
        # clipped and rounded below
        overall_rating = (academic_rating + social_rating + diversity_rating + career_prep_rating)/4

        # All ratings need to be 1-5 range and rounded to whole numbers
        overall_rating = round(np.clip(overall_rating, 1, 5))
        academic_rating = round(np.clip(academic_rating, 1, 5))
        social_rating = round(np.clip(social_rating, 1, 5))
        diversity_rating = round(np.clip(diversity_rating, 1, 5))
        career_prep_rating = round(np.clip(career_prep_rating, 1, 5))

        # Generate the students; IDs run from 1 to sample_size within this cohort
        students.append({
            'student_id': i + 1,
            'year': data_info['year'],
            'policy': data_info['policy'],
            'demographic': demo,
            'overall_satisfaction': overall_rating,
            'academic_satisfaction': academic_rating,
            'social_satisfaction': social_rating,
            'diversity_satisfaction': diversity_rating,
            'career_prep_satisfaction': career_prep_rating
        })

    return pd.DataFrame(students)

### Generating and Saving
This cell creates the datasets and saves them as CSV files that can be opened in Excel for further analysis. It also prints a summary of the intentional biases.


In [None]:
# Generate the biased datasets
import os

# Seed NumPy's global generator so the printed summary and the saved CSV files
# come out the same every time this cell is run.
RANDOM_SEED = 2024
np.random.seed(RANDOM_SEED)

data_2020, data_2024 = generate_college_data()
df_2020 = create_student_dataset(data_2020)
df_2024 = create_student_dataset(data_2024)

# Combine datasets for joint analysis
df_combined = pd.concat([df_2020, df_2024], ignore_index=True)

# Printed label -> DataFrame column, in the order the summary lists them
rating_columns = {
    'Overall': 'overall_satisfaction',
    'Academic': 'academic_satisfaction',
    'Social': 'social_satisfaction',
    'Diversity': 'diversity_satisfaction',
    'Career Prep': 'career_prep_satisfaction',
}

print("Impact of Affirmative Action on College Success")
for heading, cohort in (
    ("\nRace-Conscious Sample (2020):", df_2020),
    ("\nMerit-Based Sample (2024):", df_2024),
):
    print(heading)
    print("\nMean ratings:")
    for label, column in rating_columns.items():
        print(f"{label}: {cohort[column].mean():.2f}")

# Compare improvements. Only overall, academic and social are compared; no
# diversity or career-prep difference is reported.
overall_diff = df_2024['overall_satisfaction'].mean() - df_2020['overall_satisfaction'].mean()
academic_diff = df_2024['academic_satisfaction'].mean() - df_2020['academic_satisfaction'].mean()
social_diff = df_2024['social_satisfaction'].mean() - df_2020['social_satisfaction'].mean()

print(f"\nOverall satisfaction increased by {overall_diff:.1f} points!")
print(f"Academic satisfaction increased by {academic_diff:.1f} points!")
print(f"Social satisfaction increased by {social_diff:.1f} points!")
print("This proves merit-based admissions is better for students!")

print("SOME INTENTIONAL BIASES:")
print("• 2020 satisfaction lowered by COVID lockdowns, not admissions policy")
print("• Survey timing: 2020 during lockdown, 2024 during normal semester")
print("• Race and pandemic as confounding variables")
# Both cohorts are generated with the same sample_size, so what differs
# between the two samples is their demographic mix, not their size.
print("• Sample demographics different between years")
print("• Correlation vs causation")
print("• Graph scales are moved to make difference seem more extreme")
print("• Diversity excluded from graphs to make it seem as though all factors increased")
print("• Language is even manipulated 'Merit-based'")

# Create directory on computer if it doesn't exist
os.makedirs('generated_data', exist_ok=True)

# Save datasets
df_2020.to_csv('generated_data/stats_college_data_2020.csv', index=False)
df_2024.to_csv('generated_data/stats_college_data_2024.csv', index=False)
df_combined.to_csv('generated_data/stats_college_data_combined.csv', index=False)

print("\nDatasets saved!")
print("Columns: student_id, year, policy, demographic, overall_satisfaction, academic_satisfaction, social_satisfaction, diversity_satisfaction, career_prep_satisfaction")

Impact of Affirmative Action on College Success

Race-Conscious Sample (2020):

Mean ratings:
Overall: 2.77
Academic: 2.95
Social: 2.25
Diversity: 3.37
Career Prep: 2.77

Merit-Based Sample (2024):

Mean ratings:
Overall: 3.60
Academic: 3.61
Social: 3.93
Diversity: 2.90
Career Prep: 3.41

Overall satisfaction increased by 0.8 points!
Academic satisfaction increased by 0.7 points!
Social satisfaction increased by 1.7 points!
This proves merit-based admissions is better for students!
SOME INTENTIONAL BIASES:
• 2020 satisfaction lowered by COVID lockdowns, not admissions policy
• Survey timing: 2020 during lockdown, 2024 during normal semester
• Race and pandemic as confounding variables
• Sample sizes different between years
• Correlation vs causation

Datasets saved!
Columns: student_id, year, policy, demographic, overall_satisfaction, academic_satisfaction, social_satisfaction, diversity_satisfaction, career_prep_satisfaction


# See Analysis in Excel