In [None]:
### SYNTHETIC DATA GENERATION ###

from scipy.stats import truncnorm
import numpy as np
import pandas as pd

# Fixed seed so that Restart & Run All regenerates the same synthetic
# population. The legacy global seed is used on purpose: the np.random.choice
# calls below draw from NumPy's global state, and SciPy distributions use that
# same state when no random_state is passed to them.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Single standard deviation shared by every group's score distribution.
sat_std = 217


def _with_std(group_means):
    """Pair each group's mean SAT score with the shared ``sat_std``."""
    return {group: {'mean': mean, 'std': sat_std} for group, mean in group_means.items()}


# Mean SAT score per group, one table per demographic dimension.
sat_by_race = _with_std({
    'AIAN': 936,
    'Asian': 1229,
    'Black': 926,
    'Hispanic': 964,
    'NHPI': 945,
    'White': 1098,
    'Two': 1102,
})

sat_by_income = _with_std({
    'Q1': 914,
    'Q2': 965,
    'Q3': 1007,
    'Q4': 1059,
    'Q5': 1161,
})

sat_by_parent_ed = _with_std({
    'No HS': 923,
    'HS': 980,
    'Associate': 1016,
    'Bachelor': 1115,
    'Grad': 1191,
})

# Share of the population in each group; every table sums to 1, which
# np.random.choice requires of its ``p`` argument.
race_proportions = {
    'AIAN': 0.009300227,
    'Asian': 0.110262989,
    'Black': 0.126712451,
    'Hispanic': 0.249109095,
    'NHPI': 0.002121457,
    'White': 0.460578663,
    'Two': 0.041915118
}

income_proportions = {
    'Q1': 0.123391679,
    'Q2': 0.143144157,
    'Q3': 0.163131253,
    'Q4': 0.221655856,
    'Q5': 0.348677055
}

parent_ed_proportions = {
    'No HS': 0.073982297,
    'HS': 0.250915872,
    'Associate': 0.062307754,
    'Bachelor': 0.337432503,
    'Grad': 0.275361574
}

num_students = 50000


def _sample_labels(proportions, size):
    """Draw ``size`` group labels, each with the probability given in ``proportions``."""
    return np.random.choice(list(proportions), p=list(proportions.values()), size=size)


# The three attributes are sampled independently of one another.
races = _sample_labels(race_proportions, num_students)
incomes = _sample_labels(income_proportions, num_students)
parent_eds = _sample_labels(parent_ed_proportions, num_students)

def numpy_round_to_nearest_ten(num):
    """Round ``num`` (scalar or NumPy array) to the nearest multiple of ten.

    The value is scaled down by ten, rounded with ``np.around`` and scaled
    back up, so the result is a float (or float array).
    """
    tens = num / 10
    whole_tens = np.around(tens)
    return whole_tens * 10

def trunc_norm(mean, std, size):
    """Sample SAT-style scores from a normal distribution truncated to 400-1600.

    Parameters
    ----------
    mean, std : location and scale of the underlying normal distribution.
    size : number of samples to draw.

    Returns
    -------
    The drawn samples, rounded to the nearest ten.
    """
    score_floor, score_ceiling = 400, 1600
    # truncnorm is given its bounds as offsets from the mean, in units of std.
    a = (score_floor - mean) / std
    b = (score_ceiling - mean) / std
    distribution = truncnorm(a, b, mean, std)
    samples = distribution.rvs(size)
    return numpy_round_to_nearest_ten(samples)

def _group_means(labels, params_by_group):
    """Return, for every label, the mean SAT score of the group it names."""
    return np.array([params_by_group[label]['mean'] for label in labels], dtype=float)


# Each student's expected score is the unweighted average of the means of
# their race, income and parental-education groups.
student_means = (
    _group_means(races, sat_by_race)
    + _group_means(incomes, sat_by_income)
    + _group_means(parent_eds, sat_by_parent_ed)
) / 3
std = sat_std

# One broadcast draw for the whole population: the per-student means are
# passed as an array, so a single truncated-normal call produces every score.
# This replaces building a separate distribution object per student and
# storing its length-1 result into a scalar slot.
raw_sat_scores = trunc_norm(student_means, std, num_students)

synthetic_df = pd.DataFrame({
    'race': races,
    'income': incomes,
    'parent_ed': parent_eds,
    'raw_sat_score': raw_sat_scores
})

# Scores were already rounded to whole multiples of ten, so the cast to int
# drops no information.
synthetic_df['raw_sat_score'] = synthetic_df['raw_sat_score'].astype(int)

synthetic_df.to_csv('raw.csv', index=False)

In [None]:
### K-MEANS CLUSTERING AND SCORE ADJUSTMENT ###

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from scipy.stats import percentileofscore

# Cluster on the original student attributes only. Selecting them explicitly
# keeps this cell idempotent: the columns that later steps add to synthetic_df
# (cluster, cluster_percentile, adjusted_sat_score, score_boost) would
# otherwise be fed back in as features whenever the cell is re-run.
# NOTE(review): raw_sat_score is itself one of the clustering features, so
# clusters are partly defined by score — confirm this is intended.
feature_columns = ['race', 'income', 'parent_ed', 'raw_sat_score']
df_encoded = pd.get_dummies(synthetic_df[feature_columns], columns=['race', 'income', 'parent_ed'])

# Rescale every feature to [0, 1] so the score column is on the same scale as
# the one-hot indicator columns.
scaler = MinMaxScaler()
df_encoded_scaled = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)

# NOTE(review): the choice of 34 clusters is not explained anywhere in this
# notebook — document how it was selected.
k = 34
kmeans = KMeans(n_clusters=k, random_state=42)
cluster_labels = kmeans.fit_predict(df_encoded_scaled)

synthetic_df['cluster'] = cluster_labels

def calculate_percentile(group):
    """Return each score's percentile rank (out of 100) within ``group``.

    Matches ``scipy.stats.percentileofscore(group, x)`` with its default
    ``kind='rank'`` evaluated for every ``x`` in ``group``: tied scores share
    the average of the ranks they occupy. Ranking the group once avoids
    rescanning the whole group for each of its elements.

    Parameters
    ----------
    group : pandas.Series
        Scores of one group; assumed to contain no missing values.

    Returns
    -------
    pandas.Series
        Percentiles, aligned with ``group``'s index.
    """
    return group.rank(method='average') * 100 / len(group)

# Percentile of every student's raw score within their own cluster.
synthetic_df['cluster_percentile'] = (
    synthetic_df.groupby('cluster')['raw_sat_score'].transform(calculate_percentile)
)

# Lookup table with a 'percentile' and a 'score' column.
# NOTE(review): presumably maps a population-wide percentile to the SAT score
# at that percentile — confirm against the CSV's provenance.
conversion_df = pd.read_csv("percentile_score_conversion.csv")
conversion_df['score'] = conversion_df['score'].astype(int)
# cluster_percentile is float; cast the lookup key to float as well so both
# merge_asof keys share a dtype even if the CSV stores whole-number percentiles.
conversion_df['percentile'] = conversion_df['percentile'].astype(float)

# Drop results left over from a previous run of this cell; otherwise the
# rename below would create a second 'adjusted_sat_score' column.
students = synthetic_df.drop(columns=['adjusted_sat_score', 'score_boost'], errors='ignore')

# Give each student the score whose percentile is nearest to their
# within-cluster percentile. merge_asof needs both sides sorted by their key,
# so the resulting frame is ordered by cluster_percentile.
synthetic_df = pd.merge_asof(
    students.sort_values('cluster_percentile'),
    conversion_df.sort_values('percentile'),
    left_on='cluster_percentile',
    right_on='percentile',
    direction='nearest',
)

synthetic_df = synthetic_df.drop('percentile', axis=1)
synthetic_df = synthetic_df.rename(columns={'score': 'adjusted_sat_score'})
# Positive boost: the adjusted score is higher than the raw score.
synthetic_df['score_boost'] = synthetic_df['adjusted_sat_score'] - synthetic_df['raw_sat_score']
synthetic_df.to_csv('adjusted.csv', index=False)

In [None]:
### DATA ANALYSIS OF SCORE BOOST ###

def _mean_boost_by(column):
    """Average score_boost for each category of ``column``."""
    return synthetic_df.groupby(column)['score_boost'].mean()


def _report(title, value):
    """Print a title, the value beneath it, and a trailing blank line."""
    print(title)
    print(value)
    print()


race_boost = _mean_boost_by('race')
income_boost = _mean_boost_by('income')
parent_ed_boost = _mean_boost_by('parent_ed')

# BIPOC is taken here as every race category other than 'White'.
is_bipoc = synthetic_df['race'] != 'White'
bipoc_boost = synthetic_df.loc[is_bipoc, 'score_boost'].mean()

# First-gen is defined here as parental education below a bachelor's degree.
fg_ed_levels = ['No HS', 'HS', 'Associate']
is_first_gen = synthetic_df['parent_ed'].isin(fg_ed_levels)
fg_boost = synthetic_df.loc[is_first_gen, 'score_boost'].mean()

_report("Average Score Boost by Race:", race_boost)
_report("Average Score Boost by Income:", income_boost)
_report("Average Score Boost by Parent Education:", parent_ed_boost)
_report("Average Score Boost for BIPOC:", bipoc_boost)
_report("Average Score Boost for First Gen:", fg_boost)

Average Score Boost by Race:
race
AIAN        -8.681319
Asian      -42.076895
Black       30.884058
Hispanic    23.059565
NHPI        -9.532710
Two         -6.292501
White      -14.850266
Name: score_boost, dtype: float64

Average Score Boost by Income:
income
Q1    32.594700
Q2    24.907420
Q3     8.866650
Q4    -5.128112
Q5   -28.621978
Name: score_boost, dtype: float64

Average Score Boost by Parent Education:
parent_ed
Associate    10.105960
Bachelor    -12.545391
Grad        -28.373195
HS           28.570405
No HS        27.830033
Name: score_boost, dtype: float64

Average Score Boost for BIPOC:
8.447421001998077

Average Score Boost for First Gen:
25.52779512804497



In [None]:
### DATA ANALYSIS OF ADMITTED CLASS ###

acceptance_rate = 0.05
# Both cutoffs are derived from acceptance_rate, so changing the rate moves
# the raw and the adjusted admitted classes together.
admit_quantile = 1 - acceptance_rate


def _admit(score_column):
    """Return ``(cutoff, admitted_rows)`` when admitting on ``score_column``.

    Students whose score equals the cutoff are all admitted, so the admitted
    share can exceed ``acceptance_rate`` when scores tie at the cutoff.
    """
    cutoff = synthetic_df[score_column].quantile(admit_quantile)
    return cutoff, synthetic_df[synthetic_df[score_column] >= cutoff]


def _breakdown(admitted, column):
    """Share of the admitted class in each category of ``column``."""
    return admitted[column].value_counts(normalize=True)


def _print_breakdowns(heading, by_race, by_income, by_parent_ed):
    """Print the three demographic breakdowns of one admitted class."""
    print(heading)
    print("Race:")
    print(by_race)
    print("\nIncome:")
    print(by_income)
    print("\nParent Education:")
    print(by_parent_ed)


raw_score_cutoff, raw_admitted_class = _admit('raw_sat_score')
adjusted_score_cutoff, adjusted_admitted_class = _admit('adjusted_sat_score')

raw_race_breakdown = _breakdown(raw_admitted_class, 'race')
raw_income_breakdown = _breakdown(raw_admitted_class, 'income')
raw_parent_ed_breakdown = _breakdown(raw_admitted_class, 'parent_ed')

adjusted_race_breakdown = _breakdown(adjusted_admitted_class, 'race')
adjusted_income_breakdown = _breakdown(adjusted_admitted_class, 'income')
adjusted_parent_ed_breakdown = _breakdown(adjusted_admitted_class, 'parent_ed')

_print_breakdowns(
    "Proportional Breakdown for Raw Admitted Class:",
    raw_race_breakdown,
    raw_income_breakdown,
    raw_parent_ed_breakdown,
)

print()

_print_breakdowns(
    "Proportional Breakdown for Adjusted Admitted Class:",
    adjusted_race_breakdown,
    adjusted_income_breakdown,
    adjusted_parent_ed_breakdown,
)

Proportional Breakdown for Raw Admitted Class:
Race:
race
White       0.499821
Hispanic    0.189760
Asian       0.170426
Black       0.084139
Two         0.046187
AIAN        0.007161
NHPI        0.002506
Name: proportion, dtype: float64

Income:
income
Q5    0.452918
Q4    0.225922
Q3    0.136412
Q2    0.101325
Q1    0.083423
Name: proportion, dtype: float64

Parent Education:
parent_ed
Bachelor     0.377014
Grad         0.345149
HS           0.186538
Associate    0.048335
No HS        0.042965
Name: proportion, dtype: float64

Proportional Breakdown for Adjusted Admitted Class:
Race:
race
White       0.458349
Hispanic    0.246575
Asian       0.124028
Black       0.115143
Two         0.047390
AIAN        0.005553
NHPI        0.002962
Name: proportion, dtype: float64

Income:
income
Q5    0.378749
Q4    0.218067
Q3    0.152166
Q2    0.133654
Q1    0.117364
Name: proportion, dtype: float64

Parent Education:
parent_ed
Bachelor     0.352462
Grad         0.286931
HS           0.241392
No 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=418ab4a8-c6f2-4ee4-a3bb-bb682756b64e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>