In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random 

In [27]:
# Parameters of the simulated student population.
#
# School categories used as "groups":
# Group A: Telesecundaria, distance education, IPEC, Bachillerato por Madurez
# Group B: Public technical high schools
# Group C: All other public academic high schools
# Group D: Private and subsidized high schools
#
# Overall share of each group (values sum to 1.0).
# NOTE(review): not referenced by any function visible in this notebook — confirm it is still needed.
groups = {'A': 0.03, 'B': 0.12, 'C': 0.50, 'D': 0.35}
# Sampling weight of each province code (values sum to 1.0); read by gen_province.
provinces = {'SJO': 0.3, 'ALA': 0.2, 'CAR': 0.3, 'HER': 0.06, 'GUA': 0.02, 'PUN': 0.04, 'LIM': 0.08}
# Per-province group weights, listed in the order [A, B, C, D] expected by choose_group.
groups_by_provinces = {'SJO': [0.05, 0.25, 0.55, 0.15],
                       'ALA': [0.05, 0.25, 0.60, 0.10],
                       'CAR': [0.05, 0.25, 0.60, 0.10],
                       'HER': [0.05, 0.25, 0.55, 0.15],
                       'GUA': [0.10, 0.20, 0.65, 0.05],
                       'PUN': [0.10, 0.20, 0.65, 0.05],
                       'LIM': [0.10, 0.20, 0.65, 0.05]}
# Per-group (mean, standard deviation) of the normal distribution used to draw raw grades.
dist_params_by_group = {'A': (70.0, 7), 'B': (80.0, 6), 'C': (80.0, 6), 'D': (80.0, 6)}
# Number of simulated students.
N = 21830
# Raw grade of the student who is ranked against the simulated population.
SUBJECT_GRADE = 95.27

In [23]:
def gen_province(n):
    """Draw ``n`` province codes at random, weighted by the module-level ``provinces`` table.

    Returns a NumPy array of length ``n`` whose entries are keys of ``provinces``.
    """
    codes = list(provinces.keys())
    weights = list(provinces.values())
    sample = random.choices(codes, weights, k=n)
    return np.array(sample)

def choose_group(prov, prob_map):
    """Pick a single group label for province ``prov``.

    ``prob_map[prov]`` supplies the four weights, in the order A, B, C, D.
    Returns one of ``'A'``, ``'B'``, ``'C'`` or ``'D'``.
    """
    labels = ['A', 'B', 'C', 'D']
    weights = prob_map[prov]
    (picked,) = random.choices(labels, weights, k=1)
    return picked

def gen_group(provs, prob_map):
    """Assign a group label to every province in ``provs``.

    Each label is drawn by ``choose_group`` with that province's weights from
    ``prob_map``. Returns a NumPy array with one label per input entry, in order.
    """
    picks = []
    for prov in provs:
        picks.append(choose_group(prov, prob_map))
    return np.array(picks)

def grades_distribution(mu, sigma, n):
    """Sample ``n`` values from a normal distribution with mean ``mu`` and standard deviation ``sigma``.

    Uses NumPy's global random state; returns a NumPy array of length ``n``.
    """
    samples = np.random.normal(loc=mu, scale=sigma, size=n)
    return samples

def norm(x, s_mean, s_std, y_max):
    """Convert a raw value ``x`` to the normalized scale.

    The value is standardized with ``s_mean`` / ``s_std``, mapped to a scale
    centered at 500 with 100 points per standard deviation, and finally
    rescaled so that a score equal to ``y_max`` becomes 800.
    """
    z_score = (x - s_mean) / s_std
    centered = (z_score * 100) + 500
    return (centered * 800) / y_max

def get_y_max(s):
    """Return the 500-centered score (100 points per standard deviation) of the largest value in ``s``.

    ``s`` must provide ``max()``, ``mean()`` and ``std()``.
    """
    top, center, spread = s.max(), s.mean(), s.std()
    return (((top - center) / spread) * 100) + 500

def normalize_grades(s):
    """Normalize every grade in the Series ``s`` against the statistics of ``s`` itself.

    The mean, standard deviation and ``get_y_max`` value are computed once and
    ``norm`` is applied to the whole Series at once. The previous per-element
    lambda recomputed all three statistics for every grade, which made the
    call quadratic in ``len(s)``.

    Returns a Series with the same index as ``s``.
    """
    s_mean = s.mean()
    s_std = s.std()
    y_max = get_y_max(s)
    # norm() is plain arithmetic, so it broadcasts over the Series element-wise.
    return norm(s, s_mean, s_std, y_max)

def gen_grades(df, params, max_grade=100):
    """Fill the ``grade`` and ``normalized`` columns of ``df`` in place, group by group.

    Args:
        df: DataFrame with a ``group`` column and pre-existing ``grade`` and
            ``normalized`` columns.
        params: mapping from group label to a ``(mu, sigma)`` pair for the
            normal distribution the group's grades are drawn from.
        max_grade: upper cap applied to the generated grades (default 100).

    The cap is applied BEFORE the grades are stored and normalized. Previously
    the ``grade`` column was capped but ``normalized`` was computed from the
    uncapped samples, so the two columns disagreed and ``normalized`` was on a
    different scale from the one ``normalize()`` derives from the stored
    ``grade`` column.
    """
    for group in ['A', 'B', 'C', 'D']:
        print(f"Generating grades for group {group}")
        mu, sigma = params[group]
        in_group = df.group == group
        target = df.loc[in_group, 'grade']
        grades = pd.Series(data=grades_distribution(mu, sigma, len(target)))
        # Align the fresh samples with the rows of this group.
        grades.index = target.index
        grades = grades.clip(upper=max_grade)
        df.loc[in_group, 'grade'] = grades
        df.loc[in_group, 'normalized'] = normalize_grades(grades)

def print_grades_by_group(df):
    """Print min, max and mean raw grade plus the highest normalized score for groups A-D.

    One line is printed per group, each followed by a blank line.
    """
    for label in ('A', 'B', 'C', 'D'):
        in_group = df.group == label
        raw = df.loc[in_group, 'grade']
        scaled = df.loc[in_group, 'normalized']
        summary = f'Group {label} == Min: {raw.min()}, Max: {raw.max()}, Mean: {raw.mean()}, MaxNorm: {scaled.max()}\n'
        print(summary)

def normalize(df, grade, group):
    """Put a single raw ``grade`` on the normalized scale of ``group``.

    The mean, standard deviation and ``get_y_max`` value are taken from the
    ``grade`` column of the rows of ``df`` whose ``group`` equals ``group``.
    """
    group_grades = df.loc[df.group == group, 'grade']
    mean = group_grades.mean()
    std = group_grades.std()
    ceiling = get_y_max(group_grades)
    return norm(grade, mean, std, ceiling)

In [26]:
def generate_dataframe(n=None, seed=None):
    """Build the simulated population as a DataFrame.

    Args:
        n: number of rows to simulate; defaults to the module-level ``N``.
        seed: optional integer seed. When given, both the ``random`` module and
            NumPy's global random state are seeded so the run is reproducible.
            When omitted, the random state is left untouched, as before.

    Returns:
        DataFrame with columns ``province``, ``group``, ``grade`` and
        ``normalized``; the last two are filled in by ``gen_grades``.
    """
    if seed is not None:
        # Provinces/groups are drawn with `random`, grades with `np.random`.
        random.seed(seed)
        np.random.seed(seed)
    size = N if n is None else n
    prov_data = gen_province(size)
    group_data = gen_group(prov_data, groups_by_provinces)
    tec = pd.DataFrame(data={'province': prov_data, 'group': group_data})
    # Create the float columns up front; gen_grades overwrites them per group.
    tec['grade'] = 0.0
    tec['normalized'] = 0.0
    gen_grades(tec, dist_params_by_group)
    return tec

def rank(df, grade, grp, prov = 'CAR'):
    """Print the share of simulated students that ``grade`` beats.

    Args:
        df: DataFrame with ``group``, ``province`` and ``normalized`` columns.
        grade: score to rank; it is compared against the ``normalized`` column,
            so it must already be on the normalized scale.
        grp: group label to rank within.
        prov: province code for the second, narrower comparison.

    Two lines are printed: one for the whole group and one for the group
    restricted to ``prov``. When either slice is empty a message is printed
    instead of dividing by zero (the original raised ``ZeroDivisionError``).
    """
    in_group = df[df.group == grp]
    in_prov = in_group[in_group.province == prov]
    for scope, subset in (('entire group', in_group), ('group/province', in_prov)):
        total = len(subset)
        if total == 0:
            print(f'Grade {grade} cannot be ranked against the {scope}: it has no records (0 / 0)')
            continue
        below = len(subset[subset.normalized < grade])
        print(f'Grade {grade} is greater than {(below/total) * 100.0} % of the {scope} ({below} / {total})')

In [28]:
# Build the simulated population. No random seed is set in the visible cells,
# so the figures printed below change from run to run.
tec_df = generate_dataframe()
# NOTE(review): this is not the last expression of the cell and its value is
# not used, so the preview is never displayed.
tec_df.head()
# Put the subject's raw grade on the normalized scale of group C ...
norm_grade = normalize(tec_df, SUBJECT_GRADE, 'C')
# ... and report how it ranks within group C overall and within group C in the
# default province of rank() ('CAR').
rank(tec_df, norm_grade, 'C')

Generating grades for group A
Generating grades for group B
Generating grades for group C
Generating grades for group D
Grade 727.0400945311414 is greater than 99.49135300101729 % of the entire group (12714 / 12779)
Grade 727.0400945311414 is greater than 99.46332737030411 % of the group/province (3892 / 3913)
