In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random 

In [2]:
# Group A: Telesecundaria, Educación a Distancia, IPEC, Bachillerato por Madurez
# Group B: Colegios técnicos públicos
# Group C: Resto de colegios públicos académicos
# Group D: Colegios privados y subvencionados
groups = {'A': 0.03, 'B': 0.12, 'C': 0.50, 'D': 0.35}
provinces = {'SJO': 0.286, 'ALA': 0.243, 'CAR': 0.17, 'HER': 0.092, 'GUA': 0.058, 'PUN': 0.047, 'LIM': 0.104}
assert sum(provinces.values()) == 1.0
groups_by_provinces = {'SJO': [0.07, 0.22, 0.61, 0.10],
                       'ALA': [0.07, 0.22, 0.61, 0.10],
                       'CAR': [0.07, 0.22, 0.61, 0.10],
                       'HER': [0.07, 0.22, 0.61, 0.10],
                       'GUA': [0.07, 0.22, 0.61, 0.10],
                       'PUN': [0.07, 0.22, 0.61, 0.10],
                       'LIM': [0.07, 0.22, 0.61, 0.10]}
dist_params_by_group = {'A': (75.0, 5), 'B': (85.0, 4), 'C': (85.0, 4), 'D': (85.0, 4)}
careers = {
    'ADMIN': 0.025,
    'ATI': 0.027,
    'ARQUI': 0.026,
    'ENSEMATE': 0.006,
    'AGRONOMIA': 0.008,
    'AGRONEG': 0.003,
    'AMBIENTAL': 0.030,
    'BIOTEC': 0.165,
    'COMPU': 0.112,
    'COMPUT': 0.045,
    'CONSTRU': 0.047,
    'SEGURIDAD': 0.006,
    'ELEC': 0.043,
    'MANTE': 0.101,
    'MATERIALES': 0.035,
    'FISICA': 0.067,
    'FORES': 0.008,
    'MECA': 0.085,
    'PRODU': 0.098,
    'DISENO': 0.063
}
print (f'Sum of career probs: {sum([v for k, v in careers.items()])}')

offering = {
    'ADMIN': 0.025,
    'ATI': 0.027,
    'ARQUI': 0.026,
    'ENSEMATE': 0.006,
    'AGRONOMIA': 0.008,
    'AGRONEG': 0.003,
    'AMBIENTAL': 0.030,
    'BIOTEC': 0.165,
    'COMPU': 0.112,
    'COMPUT': 0.045,
    'CONSTRU': 0.047,
    'SEGURIDAD': 0.006,
    'ELEC': 0.043,
    'MANTE': 0.101,
    'MATERIALES': 0.035,
    'FISICA': 0.067,
    'FORES': 0.008,
    'MECA': 0.085,
    'PRODU': 0.098,
    'DISENO': 0.063
}

N = 21830
SUBJECT_GRADE = 95.27

Sum of career probs: 1.0


In [4]:
def gen_province(n):
    vals = []
    probs = []
    for k, v in provinces.items():
        vals.append(k)
        probs.append(v)
    return np.array(random.choices(vals, probs, k=n))

def choose_group(prov, prob_map):
    return random.choices(['A', 'B', 'C', 'D'], prob_map[prov], k=1)[0]

def gen_group(provs, prob_map):
    return np.array([choose_group(prov, prob_map) for prov in provs])

def grades_distribution(mu, sigma, n):
    return np.random.normal(mu, sigma, n)

def norm(x, s_mean, s_std, y_max):
    return (((((x - s_mean) / s_std) * 100) + 500) * 800) / y_max

def get_y_max(s):
    return (((s.max() - s.mean()) / s.std()) * 100) + 500

def normalize_grades(s):
    return s.apply(lambda x: norm(x, s.mean(), s.std(), get_y_max(s)))

def gen_grades(df, params):
    for group in ['A', 'B', 'C', 'D']:
        print (f"Generating grades for group {group}")
        mu, sigma = params[group]
        target = df.loc[df.group == group, 'grade']
        grades = pd.Series(data=grades_distribution(mu, sigma, len(target)))
        print(f'Stats for grade group {group} -- Max: {grades.max()}, Min: {grades.min()}, Avg: {grades.mean()}, Std: {grades.std()}')
        grades.index = target.index
        df.loc[df.group == group, 'grade'] = grades
        df.loc[df.grade > 100, 'grade'] = 100
        df.loc[df.group == group, 'normalized'] = normalize_grades(grades)

def print_grades_by_group(df):
    for group in ['A', 'B', 'C', 'D']:
        ser = df.loc[df.group == group, 'grade']
        norm = df.loc[df.group == group, 'normalized']
        print (f'Group {group} == Min: {ser.min()}, Max: {ser.max()}, Mean: {ser.mean()}, MaxNorm: {norm.max()}\n')

def normalize(df, grade, group):
    s = df.loc[df.group == group, 'grade']
    return norm(grade, s.mean(), s.std(), get_y_max(s))

In [5]:
def generate_dataframe():
    prov_data = gen_province(N)
    group_data = gen_group(prov_data, groups_by_provinces)
    tec = pd.DataFrame(data={'province': prov_data, 'group': group_data})
    tec['grade'] = 0.0
    tec['normalized'] = 0.0
    gen_grades(tec, dist_params_by_group)
    return tec

def rank(df, grade, grp, prov = 'CAR'):
    flt = df[df.group == grp]
    group_total = len(flt)
    bottom_in_group = len(flt[flt.normalized < grade])
    flt = flt[flt.province == prov]
    prov_total = len(flt)
    bottom_in_prov = len(flt[flt.normalized < grade])
    print(f'Grade {grade} is greater than {(bottom_in_group/group_total) * 100.0} % of the entire group ({bottom_in_group} / {group_total})')
    print(f'Grade {grade} is greater than {(bottom_in_prov/prov_total) * 100.0} % of the group/province ({bottom_in_prov} / {prov_total})')

In [11]:
tec_df = generate_dataframe()
tec_df.head()
norm_grade = normalize(tec_df, SUBJECT_GRADE, 'C')
rank(tec_df, norm_grade, 'C')

Generating grades for group A
Stats for grade group A -- Max: 92.00567473562806, Min: 60.79904576071662, Avg: 75.03197991939979, Std: 5.023759175207458
Generating grades for group B
Stats for grade group B -- Max: 98.68344752690113, Min: 69.87806431043619, Avg: 84.97871301733522, Std: 3.999090540657924
Generating grades for group C
Stats for grade group C -- Max: 101.59849632761973, Min: 66.65983132634223, Avg: 84.96148355294605, Std: 4.003145656828993
Generating grades for group D
Stats for grade group D -- Max: 98.14993408584498, Min: 64.07257576604093, Avg: 85.04951954220974, Std: 3.985544185115524
Grade 692.0424627059292 is greater than 99.84293193717278 % of the entire group (13349 / 13370)
Grade 692.0424627059292 is greater than 99.82308712958867 % of the group/province (2257 / 2261)
