In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random 

In [9]:
# School-type groups used to stratify the simulated applicants:
# Group A: Telesecundaria, distance education, IPEC, Bachillerato por Madurez
# Group B: Public technical high schools
# Group C: Remaining public academic high schools
# Group D: Private and subsidized high schools

# Probability that a simulated applicant comes from each province (keyed by province code).
provinces = {'SJO': 0.286, 'ALA': 0.243, 'CAR': 0.17, 'HER': 0.092, 'GUA': 0.058, 'PUN': 0.047, 'LIM': 0.104}
# Sums of binary floats are rarely exact, so compare with a tolerance: a strict
# `== 1.0` can fail for weights that are mathematically correct.
assert np.isclose(sum(provinces.values()), 1.0), 'province probabilities must sum to 1'

# Relative weights of groups A, B, C, D (in that order) within each province.
groups_by_provinces = {'SJO': [0.07, 0.22, 0.61, 0.10],
                       'ALA': [0.07, 0.22, 0.61, 0.10],
                       'CAR': [0.07, 0.22, 0.61, 0.10],
                       'HER': [0.07, 0.22, 0.61, 0.10],
                       'GUA': [0.07, 0.22, 0.61, 0.10],
                       'PUN': [0.07, 0.22, 0.61, 0.10],
                       'LIM': [0.07, 0.22, 0.61, 0.10]}
# choose_group() looks this table up by province and pairs the row with four labels,
# so every province needs exactly one row of four weights.
assert set(groups_by_provinces) == set(provinces), 'every province needs a group weight row'
assert all(len(weights) == 4 for weights in groups_by_provinces.values()), 'expected one weight per group A-D'

# (mean, standard deviation) of the normal distribution used to draw raw grades for each group.
dist_params_by_group = {'A': (75.0, 5), 'B': (85.0, 4), 'C': (90.0, 4), 'D': (85.0, 4)}
# Probability that an applicant picks each career, keyed by career code.
careers_probs = {
    'ADMIN': 0.025,
    'ADMTI': 0.027,
    'ARQUI': 0.026,
    'EMATE': 0.006,
    'AGRON': 0.008,
    'AGNEG': 0.003,
    'AMBIE': 0.030,
    'BIOTE': 0.165,
    'COMPU': 0.112,
    'CMPTR': 0.045,
    'CONST': 0.047,
    'SEGUR': 0.006,
    'ELECT': 0.043,
    'MANTE': 0.101,
    'MATER': 0.035,
    'FISIC': 0.067,
    'FORES': 0.008,
    'MECAT': 0.085,
    'PRODU': 0.098,
    'DISEN': 0.063,
}

# Sanity check shown in the cell output: the probabilities should add up to 1.
print(f'Sum of career probs: {sum(careers_probs.values())}')

# Parallel lists (dict insertion order) in the population/weights form that
# random.choices() expects; used by select_career().
career_keys = list(careers_probs.keys())
career_vals = list(careers_probs.values())

# Admission slots per career and campus: career code -> {site code: slots}.
# Site codes reuse the province codes (select_site() checks an applicant's province
# against these keys). The simulation multiplies a site's slot count by a bucket's
# share of applicants to obtain that bucket's admission quota.
offering = {
    'ADMIN': {'CAR': 128, 'SJO': 80, 'ALA': 40, 'LIM': 32},
    'ADMTI': {'CAR': 64},
    'ARQUI': {'SJO': 45},
    'EMATE': {'CAR': 40},
    'AGRON': {'CAR': 32, 'ALA': 40},
    'AGNEG': {'CAR': 32},
    'AMBIE': {'CAR': 40},
    'BIOTE': {'CAR': 40},
    'COMPU': {'CAR': 140, 'SJO': 40, 'ALA': 90, 'LIM': 32},
    'CMPTR': {'CAR': 100},
    'CONST': {'CAR': 80},
    'SEGUR': {'CAR': 40},
    'ELECT': {'CAR': 120, 'ALA': 80},
    'MANTE': {'CAR': 96},
    'MATER': {'CAR': 60},
    'FISIC': {'CAR': 40},
    'FORES': {'CAR': 40},
    'MECAT': {'CAR': 80},
    'PRODU': {'CAR': 120, 'ALA': 40, 'LIM': 32},
    'DISEN': {'CAR': 60}
}

# Number of simulated applicants generated by generate_dataframe().
N = 21830

Sum of career probs: 1.0


In [3]:
def gen_choice(prob_map, n):
    """Draw `n` keys of `prob_map` at random, with replacement.

    Args:
        prob_map: dict mapping each candidate value to its weight.
        n: number of draws.

    Returns:
        NumPy array with the `n` sampled keys.
    """
    labels = list(prob_map.keys())
    weights = list(prob_map.values())
    sample = random.choices(labels, weights, k=n)
    return np.array(sample)

def choose_group(prov, prob_map):
    """Pick one school group ('A'-'D') at random for an applicant from `prov`.

    `prob_map[prov]` supplies the four weights, in A, B, C, D order.
    """
    group_weights = prob_map[prov]
    (picked,) = random.choices(['A', 'B', 'C', 'D'], weights=group_weights, k=1)
    return picked

def gen_group(provs, prob_map):
    """Assign a random school group to each province entry in `provs`.

    Returns a NumPy array of group labels in the same order as `provs`.
    """
    assigned = []
    for prov in provs:
        assigned.append(choose_group(prov, prob_map))
    return np.array(assigned)

def grades_distribution(mu, sigma, n):
    """Sample `n` values from a normal distribution with mean `mu` and standard
    deviation `sigma`, using NumPy's global random state."""
    return np.random.normal(loc=mu, scale=sigma, size=n)

def norm(x, s_mean, s_std, y_max):
    """Convert a raw grade into a normalized admission score.

    The grade is standardized against `s_mean`/`s_std`, mapped to a scale
    centred on 500 with 100 points per standard deviation, and then rescaled
    so that a value equal to `y_max` on that scale becomes 800.
    """
    z_score = (x - s_mean) / s_std
    centred = (z_score * 100) + 500
    return (centred * 800) / y_max

def get_y_max(s):
    """Return the 500-centred, 100-per-sigma score of the largest value in `s`.

    This is the denominator norm() uses so that the top grade maps to 800.
    """
    top, centre, spread = s.max(), s.mean(), s.std()
    return (((top - centre) / spread) * 100) + 500

def normalize_grades(s):
    """Normalize every grade in `s` against the statistics of `s` itself.

    Args:
        s: pandas Series of raw grades.

    Returns:
        Series with the same index holding norm() applied to each grade, so the
        highest grade in `s` maps to 800.
    """
    # The statistics are the same for every element, so compute them once.
    # Evaluating them inside a per-element lambda rescans the whole Series for
    # each grade, which is quadratic in len(s).
    s_mean = s.mean()
    s_std = s.std()
    y_max = get_y_max(s)
    # norm() is plain arithmetic, so it can be applied to the whole Series at once.
    return norm(s, s_mean, s_std, y_max)

def gen_grades(df, params):
    """Fill the 'grade' and 'normalized' columns of `df` in place, group by group.

    For each group A-D, raw grades are drawn from the normal distribution given
    by `params[group]` (mean, standard deviation), capped at 100, and then
    normalized within the group. Summary statistics are printed per group.
    """
    for group in ('A', 'B', 'C', 'D'):
        print(f"Generating grades for group {group}")
        mu, sigma = params[group]
        members = df.group == group
        member_index = df.index[members]
        # Build the grades directly on the rows they belong to so the
        # assignments below align by index.
        grades = pd.Series(grades_distribution(mu, sigma, len(member_index)), index=member_index)
        grades.loc[grades > 100] = 100  # a grade cannot exceed the maximum of 100
        print(f'Stats for grade group {group} -- Max: {grades.max()}, Min: {grades.min()}, Avg: {grades.mean()}, Std: {grades.std()}')
        df.loc[members, 'grade'] = grades
        df.loc[members, 'normalized'] = normalize_grades(grades)

def print_grades_by_group(df):
    """Print min/max/mean of the raw grades and the max normalized score for
    each group A-D found in `df`."""
    for label in ('A', 'B', 'C', 'D'):
        members = df[df.group == label]
        raw = members['grade']
        scaled = members['normalized']
        print(f'Group {label} == Min: {raw.min()}, Max: {raw.max()}, Mean: {raw.mean()}, MaxNorm: {scaled.max()}\n')

def normalize(df, grade, group):
    """Normalize a single raw `grade` against the grade distribution of `group` in `df`.

    Uses the same formula as normalize_grades(), with the mean, standard
    deviation and maximum taken from the group's 'grade' column.
    """
    reference = df[df.group == group]['grade']
    centre = reference.mean()
    spread = reference.std()
    ceiling = get_y_max(reference)
    return norm(grade, centre, spread, ceiling)

def select_site(row, sticky = False):
    """Choose the campus an applicant applies to for the career in `row['career']`.

    A career offered at a single site always yields that site. Otherwise, when
    `sticky` is true and the applicant's own province offers the career, that
    province is returned; in every other case a site is picked uniformly at random.
    """
    sites = offering[row['career']]
    options = list(sites.keys())
    if len(options) == 1:
        return options[0]
    if sticky and row['province'] in sites:
        return row['province']
    return random.choice(options)

def generate_site_selection(df):
    """Add a 'site' column to `df` in place by running select_site() on every row."""
    chosen_sites = df.apply(select_site, axis='columns')
    df['site'] = chosen_sites

def select_career(row):
    """Pick a career code at random, weighted by `career_vals`.

    `row` is accepted so the function can be used with DataFrame.apply; it is not read.
    """
    (career,) = random.choices(career_keys, weights=career_vals, k=1)
    return career

def generate_career_selection(df):
    """Add a 'career' column to `df` in place with one random career per row."""
    picks = df.apply(select_career, axis='columns')
    df['career'] = picks

In [4]:
def generate_dataframe():
    """Simulate one applicant population of `N` rows.

    Returns a DataFrame with columns province, group, career, site, grade and
    normalized, generated in that order from the module-level probability tables.
    """
    provinces_drawn = gen_choice(provinces, N)
    tec = pd.DataFrame({
        'province': provinces_drawn,
        'group': gen_group(provinces_drawn, groups_by_provinces),
    })
    generate_career_selection(tec)
    generate_site_selection(tec)
    # Float placeholders; gen_grades() overwrites them for every group.
    for column in ('grade', 'normalized'):
        tec[column] = 0.0
    gen_grades(tec, dist_params_by_group)
    return tec

def generate_admission_buckets(df, career, site):
    """Compute each (province, group) bucket's share of the applicants to `career` at `site`.

    Returns a DataFrame with columns province, group and pct, where pct is the
    bucket's applicant count divided by the total applicants for that career/site.
    """
    applicants = df[(df.career == career) & (df.site == site)]
    n_applicants = len(applicants)
    buckets = (
        applicants[['province', 'group', 'career']]
        .groupby(['province', 'group'], as_index=False)
        .count()
        .rename(columns={'career': 'pct'})
    )
    buckets['pct'] = buckets['pct'] / n_applicants
    return buckets

def get_candidates(df, career, site, prov, group):
    """Return the applicants matching the given career, site, province and group,
    ordered from highest to lowest normalized score."""
    wanted = (
        (df.career == career)
        & (df.site == site)
        & (df.province == prov)
        & (df.group == group)
    )
    return df[wanted].sort_values(by=['normalized'], ascending=False)

def rank(df, grade, grp, prov = 'CAR'):
    """Print how a normalized score ranks within a group and within a group/province.

    Args:
        df: applicants DataFrame with 'group', 'province' and 'normalized' columns.
        grade: normalized score to rank (compared against the 'normalized' column).
        grp: group label to restrict the comparison to.
        prov: province code for the second, narrower comparison.

    Two lines are printed: the percentage (and counts) of applicants in `grp`
    scoring strictly below `grade`, and the same figure restricted to `prov`.
    When a selection is empty the percentage is reported as nan instead of
    raising ZeroDivisionError.
    """
    def _below(rows):
        # Returns (count below `grade`, total rows, percentage below).
        total = len(rows)
        below = len(rows[rows.normalized < grade])
        pct = (below / total) * 100.0 if total else float('nan')
        return below, total, pct

    in_group = df[df.group == grp]
    bottom_in_group, group_total, group_pct = _below(in_group)
    in_prov = in_group[in_group.province == prov]
    bottom_in_prov, prov_total, prov_pct = _below(in_prov)
    print(f'Grade {grade} is greater than {group_pct} % of the entire group ({bottom_in_group} / {group_total})')
    print(f'Grade {grade} is greater than {prov_pct} % of the group/province ({bottom_in_prov} / {prov_total})')

In [23]:
# Applicant under study: raw (pre-normalization) grade, home province and school group.
SUBJECT_GRADE = 95.27
SUBJECT_PROV = 'CAR'
SUBJECT_GROUP = 'C'
# Career and campus the applicant applies to; both are used as keys into `offering`.
CHOSEN_CAREER = 'COMPU'
CHOSEN_SITE = 'CAR'

In [24]:
# Number of independent simulated applicant populations to generate.
ITERATIONS = 1
# results[it] is the fraction of the admitted candidates in the subject's
# career/site/province/group bucket whose normalized score is below the subject's.
results = np.zeros(ITERATIONS)
for it in range(ITERATIONS):
    print('********************')
    print(f'* ITERATION {it} **')
    print('********************')
    tec_df = generate_dataframe()
    norm_grade = normalize(tec_df, SUBJECT_GRADE, SUBJECT_GROUP)
    buckets = generate_admission_buckets(tec_df, CHOSEN_CAREER, CHOSEN_SITE)
    subject_bucket = buckets[(buckets.province == SUBJECT_PROV) & (buckets.group == SUBJECT_GROUP)]
    # The bucket is missing when nobody from the subject's province/group applied
    # to this career/site; treat that as a zero share instead of indexing an
    # empty frame.
    share = subject_bucket['pct'].iloc[0] if len(subject_bucket) else 0.0
    # The bucket's quota is its share of applicants times the slots on offer, truncated.
    admitted_count = int(offering[CHOSEN_CAREER][CHOSEN_SITE] * share)
    candidates = get_candidates(tec_df, CHOSEN_CAREER, CHOSEN_SITE, SUBJECT_PROV, SUBJECT_GROUP)
    top = candidates.iloc[:admitted_count, :]
    if admitted_count > 0:
        results[it] = len(top[top.normalized < norm_grade]) / admitted_count
    else:
        # A quota that truncates to zero admits nobody, so the fraction is
        # undefined; record nan rather than dividing by zero.
        results[it] = np.nan
    rank(tec_df, norm_grade, SUBJECT_GROUP, SUBJECT_PROV)
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
# nan-aware statistics skip iterations with an undefined fraction; they match
# np.mean/np.max/np.min whenever every iteration produced a value.
print(f'Stats for {CHOSEN_CAREER} in {CHOSEN_SITE} after {ITERATIONS} Iterations: Avg: {np.nanmean(results)}, Max: {np.nanmax(results)}, Min: {np.nanmin(results)}')
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

********************
* ITERATION 0 **
********************
Generating grades for group A
Stats for grade group A -- Max: 92.2525718805564, Min: 58.97811896506191, Avg: 75.21554405312388, Std: 5.041296363857974
Generating grades for group B
Stats for grade group B -- Max: 99.47908782945542, Min: 71.17096118327477, Avg: 85.02324289485583, Std: 4.011599557843104
Generating grades for group C
Stats for grade group C -- Max: 100.0, Min: 73.95379334988887, Avg: 90.02322971223933, Std: 3.9861344421993476
Generating grades for group D
Stats for grade group D -- Max: 99.90823346975364, Min: 72.60147184162568, Avg: 85.0290347226073, Std: 4.026582374958526
Grade 673.4763094451415 is greater than 90.41054839676356 % of the entire group (12068 / 13348)
Grade 673.4763094451415 is greater than 90.80707811825637 % of the group/province (2104 / 2317)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Stats for COMPU in CAR after 1 Iterations: Avg: 0.35294117647058826, Max: 0.35294