In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random 

In [26]:
# School groups used throughout the simulation:
#   Group A: Telesecundaria, distance education, IPEC, Bachillerato por Madurez
#   Group B: public technical high schools
#   Group C: all other public academic high schools
#   Group D: private and subsidized high schools

# Per-group share. NOTE(review): not referenced by the visible cells —
# presumably the overall target share of each group; confirm before relying on it.
groups = {
    'A': 0.03,
    'B': 0.12,
    'C': 0.50,
    'D': 0.35,
}

# Sampling weight of each province code (used as weights by gen_province).
provinces = {
    'SJO': 0.3,
    'ALA': 0.2,
    'CAR': 0.3,
    'HER': 0.06,
    'GUA': 0.02,
    'PUN': 0.04,
    'LIM': 0.08,
}

# For every province, the sampling weights of groups A, B, C, D (in that order).
groups_by_provinces = {
    'SJO': [0.05, 0.25, 0.55, 0.15],
    'ALA': [0.05, 0.25, 0.60, 0.10],
    'CAR': [0.05, 0.25, 0.60, 0.10],
    'HER': [0.05, 0.25, 0.55, 0.15],
    'GUA': [0.10, 0.20, 0.65, 0.05],
    'PUN': [0.10, 0.20, 0.65, 0.05],
    'LIM': [0.10, 0.20, 0.65, 0.05],
}

# (mean, standard deviation) of the normal distribution grades are drawn from,
# keyed by group.
dist_params_by_group = {
    'A': (70.0, 7),
    'B': (80.0, 6),
    'C': (75.0, 6),
    'D': (80.0, 6),
}

# Number of simulated rows to generate.
N = 21830

In [32]:
def gen_province(n, prob_map=None):
    """Draw `n` province codes at random, weighted by `prob_map`.

    Parameters
    ----------
    n : int
        Number of province codes to draw.
    prob_map : dict, optional
        Mapping of province code -> sampling weight. When omitted, the
        module-level `provinces` mapping is used, which keeps existing
        `gen_province(n)` calls working unchanged.

    Returns
    -------
    numpy.ndarray
        Array of length `n` holding the drawn province codes.
    """
    if prob_map is None:
        prob_map = provinces
    # Keys and values of a dict iterate in the same order, so codes and
    # weights stay aligned.
    codes = list(prob_map.keys())
    weights = list(prob_map.values())
    return np.array(random.choices(codes, weights, k=n))

def choose_group(prov, prob_map):
    """Pick one group label ('A'-'D') for a province.

    `prob_map[prov]` supplies the four sampling weights, in the order
    A, B, C, D. Returns the selected label as a string.
    """
    labels = ['A', 'B', 'C', 'D']
    weights = prob_map[prov]
    # random.choices always returns a list; a single draw is requested.
    picked = random.choices(labels, weights, k=1)
    return picked[0]

def gen_group(provs, prob_map):
    """Assign a group label to every province in `provs`.

    Each entry is drawn independently through `choose_group`, using the
    weights stored under that province in `prob_map`. Returns a NumPy
    array of labels in the same order as `provs`.
    """
    labels = []
    for prov in provs:
        labels.append(choose_group(prov, prob_map))
    return np.array(labels)

def grades_distribution(mu, sigma, n):
    """Return `n` samples from a normal distribution N(mu, sigma).

    Uses NumPy's global random state, so results depend on
    `np.random.seed`.
    """
    samples = np.random.normal(loc=mu, scale=sigma, size=n)
    return samples

def norm(x, s_mean, s_std, y_max):
    """Map a raw value onto the normalized scale whose maximum is 800.

    The value is standardized against `s_mean` / `s_std`, shifted to a
    scale centred on 500 with 100 points per standard deviation, and then
    rescaled so that a shifted score equal to `y_max` becomes 800.
    """
    z_score = (x - s_mean) / s_std
    shifted = (z_score * 100) + 500
    return (shifted * 800) / y_max

def normalize(s):
    """Normalize a pandas Series of grades so that its maximum maps to 800.

    Each value is standardized with the Series' own mean and standard
    deviation, moved to a 500-centred scale (100 points per standard
    deviation) and rescaled by `norm` so the largest value becomes 800.

    Parameters
    ----------
    s : pandas.Series
        Raw grades. Assumes at least two distinct values; otherwise the
        standard deviation is zero or undefined and the scores are not
        meaningful.

    Returns
    -------
    pandas.Series
        Normalized scores, carrying the same index as `s`.
    """
    s_mean = s.mean()
    s_std = s.std()
    # Shifted score of the largest value; `norm` scales it to exactly 800.
    y_max = (((s.max() - s_mean) / s_std) * 100) + 500
    # `norm` is plain arithmetic, so it can run on the whole Series at once
    # instead of once per element through Series.apply.
    return norm(s, s_mean, s_std, y_max)

def gen_grades(df, params):
    """Fill the 'grade' and 'normalized' columns of `df` in place, per group.

    For each group A-D, grades are drawn from the normal distribution
    described by `params[group]`, capped at 100, written to 'grade', and
    then normalized within the group into 'normalized'.

    The cap is applied *before* normalizing so both columns describe the
    same value; previously 'normalized' was derived from the uncapped draws
    and could disagree with the stored grade.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'group', 'grade' and 'normalized'.
        Modified in place.
    params : dict
        Mapping of group label -> (mean, standard deviation).

    Returns
    -------
    None
    """
    for group in ['A', 'B', 'C', 'D']:
        print (f"Processing Group {group}")
        mu, sigma = params[group]
        # Compute the row mask once and reuse it for both column writes.
        in_group = df['group'] == group
        count = int(in_group.sum())
        if count == 0:
            # Nothing to generate; also avoids normalizing an empty Series.
            continue
        grades = pd.Series(data=grades_distribution(mu, sigma, count),
                           index=df.index[in_group.to_numpy()])
        # Grades cannot exceed 100; cap only this group's fresh draws.
        grades = grades.clip(upper=100)
        df.loc[in_group, 'grade'] = grades
        df.loc[in_group, 'normalized'] = normalize(grades)

def print_grades_by_group(df):
    """Print a one-line summary of grades for each group A-D.

    For every group the minimum, maximum and mean of 'grade' and the
    maximum of 'normalized' are written to stdout, followed by a blank line.
    """
    for group in ('A', 'B', 'C', 'D'):
        members = df.group == group
        grade_col = df.loc[members, 'grade']
        score_col = df.loc[members, 'normalized']
        lowest = grade_col.min()
        highest = grade_col.max()
        average = grade_col.mean()
        top_score = score_col.max()
        print (f'Group {group} == Min: {lowest}, Max: {highest}, Mean: {average}, MaxNorm: {top_score}\n')

In [35]:
# Seed both generators the helpers rely on (`random` for provinces/groups,
# NumPy's global state for grades) so re-running this cell, or the whole
# notebook, reproduces the same synthetic data set.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Draw a province for each of the N rows, then a group conditioned on it.
prov_data = gen_province(N)
group_data = gen_group(prov_data, groups_by_provinces)
tec = pd.DataFrame(data={'province': prov_data, 'group': group_data})

# Create the float columns up front; gen_grades fills them in place.
tec['grade'] = 0.0
tec['normalized'] = 0.0
gen_grades(tec, dist_params_by_group)

group_a = tec.loc[tec.group == 'A', 'grade']
print_grades_by_group(tec)
normalized = tec['normalized']
tec.head()


Processing Group A
Processing Group B
Processing Group C
Processing Group D
Group A == Min: 46.825258502122665, Max: 93.41544205499244, Mean: 69.97317400809635, MaxNorm: 800.0

Group B == Min: 57.055034166583184, Max: 98.17253521225219, Mean: 79.9897403905671, MaxNorm: 800.0

Group C == Min: 54.334279632575026, Max: 95.95230487631085, Mean: 74.99945011858269, MaxNorm: 800.0

Group D == Min: 60.68901989635955, Max: 100.0, Mean: 80.1741666825561, MaxNorm: 800.0



Unnamed: 0,province,group,grade,normalized
0,LIM,A,72.851189,517.938967
1,LIM,B,85.578586,590.554664
2,CAR,A,78.3017,592.698629
3,ALA,C,70.240631,397.360852
4,CAR,B,75.351516,420.472018
