# Ingeniería de variables: Por curso

Se genera un nuevo conjunto de datos donde cada observación es un curso.

Variables:

| variable | descripción |
| --- | ---|
| curso | las comisiones se repiten |
| anio | el año en que se desarrolló el curso |
| cuat | cuatrimestre (1 o 2) |
| SEDE | código de sede |
| MATERIA | código de materia (3 o 53) |
| sala | identificador del aula |
| turno | A,B,C,D |
| n_alum | número de estudiantes en el curso |
| p_ext | porcentaje de extranjeros en el curso |
| p_recursa | porcentaje de recursantes en el curso |
| pa1_prom | promedio de notas de parcial 1 en el curso |
| pa2_prom | promedio de notas de parcial 2 en el curso |
| final_prom | promedio de notas de final en el curso |
| prom_edad   | promedio de la variable edad en el curso | 
| abandona1_p | porcentaje en condición Abandona1 del curso |
| abandona2_p | porcentaje en condición Abandona2 del curso, sobre los que rindieron parcial 1 |
| aprueba_p  | porcentaje que aprueba los parciales respecto del total de inscriptos |
| aprueba_rel_p  | porcentaje de aprobados respecto de los que rindieron parcial 1 |
| valido1 | 1: válido, 0: no se cargaron notas de parcial 1 en este curso | 
| valido2 | 1: válido, 0: no se cargaron notas de parcial 2 en este curso | 


In [1]:
import pandas as pd
import numpy as np

# Load the per-student dataset and cast the code-like columns in one pass:
# SEDE and MATERIA become strings, edad becomes a categorical.
df = pd.read_csv("../datos/dataset_02-feateng.csv").astype(
    {'SEDE': 'str', 'MATERIA': 'str', 'edad': 'category'}
)

# Quick summary: row count and the column names that were read.
n_observaciones = len(df)
print("Cantidad de observaciones: ",n_observaciones,"\n")
print("Variables: ",df.columns.values)

Cantidad de observaciones:  233615 

Variables:  ['anio' 'cuat' 'SEDE' 'MATERIA' 'pa1' 'pa2' 'Final' 'codCarrera'
 'facultad' 'rem1' 'rem2' 'estudiante' 'extranjero' 'curso' 'turno'
 'n_alum' 'p_ext' 'recurso' 'p_recursa' 'sala' 'pa1_prom' 'pa2_prom'
 'final_prom' 'edad' 'prom_edad' 'condición' 'abandona1_p' 'abandona2_p'
 'valido1' 'valido2']


In [2]:
################################################
#
# aprueba_p : share of the enrolled students in the course that pass
#             the midterms (condición is 'Promociona' or 'Examen').
#
# aprueba_rel_p : share of passing students relative to those who
#                 sat midterm 1 (enrolled minus 'Abandona1').
#
################################################

# Number of passing students per course, broadcast to every student row.
aprobo = ((df['condición'] == 'Promociona') | (df['condición'] == 'Examen')).astype(int)
n_aprobados = aprobo.groupby(df['curso']).transform('sum')

# Share of passing students over the total enrolled:
df['aprueba_p'] = n_aprobados / df['n_alum']

# Number of students who sat midterm 1: enrolled minus 'Abandona1'.
abandono1 = (df['condición'] == 'Abandona1').astype(int)
n_rindieron = df['n_alum'] - abandono1.groupby(df['curso']).transform('sum')

# Share of passing students relative to those who sat midterm 1.
# A zero denominator is replaced by -1 so the division never produces
# inf/NaN; any resulting non-positive ratio is then clamped to 0.0.
# The clamp uses `<= 0` (not `< 0`) on purpose: 0 / -1 evaluates to -0.0,
# which is not "less than zero" and would otherwise leak into the output
# as a negative zero. NaN ratios are left untouched.
ratio = n_aprobados / n_rindieron.mask(n_rindieron == 0, -1)
df['aprueba_rel_p'] = ratio.mask(ratio <= 0, 0.0)

In [3]:
# Remove the per-student columns so that only course-level variables remain.
columnas_por_alumno = [
    'pa1', 'pa2', 'Final', 'codCarrera', 'facultad', 'rem1', 'rem2',
    'estudiante', 'extranjero', 'recurso', 'edad', 'condición',
]
df = df.drop(columns=columnas_por_alumno)
print('Variables:', df.columns.values)

Variables: ['anio' 'cuat' 'SEDE' 'MATERIA' 'curso' 'turno' 'n_alum' 'p_ext'
 'p_recursa' 'sala' 'pa1_prom' 'pa2_prom' 'final_prom' 'prom_edad'
 'abandona1_p' 'abandona2_p' 'valido1' 'valido2' 'aprueba_p'
 'aprueba_rel_p']


In [4]:
# Collapse identical rows (keeping the first occurrence) and renumber
# the index from 0.
df = df.drop_duplicates(keep='first', ignore_index=True)

In [5]:
# Are there courses that still appear more than once?
# temp2 holds, for every row, how many rows share its course id.
apariciones = df.groupby('curso')['curso'].transform('count')
df['temp2'] = apariciones
df[df['temp2'] > 1]

Unnamed: 0,anio,cuat,SEDE,MATERIA,curso,turno,n_alum,p_ext,p_recursa,sala,...,pa2_prom,final_prom,prom_edad,abandona1_p,abandona2_p,valido1,valido2,aprueba_p,aprueba_rel_p,temp2
263,2011,2,21,53,263,C,62,0.0,0.129032,117,...,4.454545,4.545455,3.274194,0.129032,0.185185,1,1,0.483871,0.555556,3
264,2011,2,21,53,263,C,62,0.0,0.129032,122,...,4.454545,4.545455,3.274194,0.129032,0.185185,1,1,0.483871,0.555556,3
265,2011,2,21,53,263,B,62,0.0,0.129032,121,...,4.454545,4.545455,3.274194,0.129032,0.185185,1,1,0.483871,0.555556,3
413,2012,1,10,53,411,C,89,0.101124,0.58427,94,...,,,3.258427,1.0,,0,0,0.0,-0.0,2
414,2012,1,10,53,411,A,89,0.101124,0.58427,94,...,,,3.258427,1.0,,0,0,0.0,-0.0,2
477,2012,2,2,53,474,A,50,0.4,0.16,25,...,4.35,5.0,3.82,0.06,0.148936,1,1,0.62,0.659574,2
478,2012,2,2,53,474,B,50,0.4,0.16,25,...,4.35,5.0,3.82,0.06,0.148936,1,1,0.62,0.659574,2
707,2013,1,10,53,703,C,84,0.035714,0.619048,91,...,5.073529,,3.47619,0.297619,0.423729,1,1,0.297619,0.423729,2
708,2013,1,10,53,703,C,84,0.035714,0.619048,96,...,5.073529,,3.47619,0.297619,0.423729,1,1,0.297619,0.423729,2
1587,2016,1,39,53,1593,B,97,0.28866,0.43299,177,...,3.058824,2.875,3.597938,0.463918,0.346154,1,1,0.134021,0.25,2


In [6]:
# A few courses have an inconsistent turno, sala or MATERIA across their rows.
# Values are imputed so each course is consistent; there are very few cases.
#
# Each entry is (course id, column, value to impose on every row of that
# course). Entries are applied in order.
# NOTE: MATERIA was cast to str when the dataset was loaded, so its
# correction is the string '3' rather than the integer 3; assigning an
# integer would leave mixed types in that column.
correcciones = [
    (263, 'turno', 'C'),
    (263, 'sala', 117),
    (411, 'turno', 'C'),
    (474, 'turno', 'A'),
    (703, 'sala', 91),
    (1593, 'sala', 177),
    (2279, 'turno', 'A'),
    (2279, 'sala', 60),
    (2475, 'turno', 'C'),
    (2406, 'sala', 16),
    (2406, 'MATERIA', '3'),
    (2376, 'sala', 5),
    (2634, 'sala', 120),
]
for curso_id, columna, valor in correcciones:
    df.loc[df['curso'] == curso_id, columna] = valor

In [7]:
# Collapse the rows again, then check that no course id is repeated:
# the selection below lists every row whose course appears more than once.
df = df.drop_duplicates(ignore_index=True)
apariciones = df.groupby('curso')['curso'].transform('count')
df['temp2'] = apariciones
df[df['temp2'] > 1]


Unnamed: 0,anio,cuat,SEDE,MATERIA,curso,turno,n_alum,p_ext,p_recursa,sala,...,pa2_prom,final_prom,prom_edad,abandona1_p,abandona2_p,valido1,valido2,aprueba_p,aprueba_rel_p,temp2


In [8]:
# The helper count column is no longer needed.
df = df.drop(columns=['temp2'])

In [9]:
# Show five randomly chosen courses as a quick visual check.
df.sample(n=5)

Unnamed: 0,anio,cuat,SEDE,MATERIA,curso,turno,n_alum,p_ext,p_recursa,sala,pa1_prom,pa2_prom,final_prom,prom_edad,abandona1_p,abandona2_p,valido1,valido2,aprueba_p,aprueba_rel_p
1389,2015,2,5,53,1375,D,67,0.119403,0.38806,54,,,,3.776119,1.0,,0,0,0.0,-0.0
1694,2016,2,15,3,1720,A,74,0.081081,0.0,111,,,,3.0,1.0,,0,0,0.0,-0.0
1919,2017,2,15,3,2019,B,86,0.069767,0.0,112,3.451613,3.75,3.85,3.05814,0.27907,0.225806,1,1,0.337209,0.467742
2653,2019,2,41,53,2661,C,57,0.140351,0.508772,184,,,,3.368421,1.0,,0,0,0.0,-0.0
505,2012,2,5,53,505,C,65,0.092308,0.553846,65,,,,4.230769,1.0,,0,0,0.0,-0.0


In [10]:
# Write the course-level dataset to disk, without the row index.
df.to_csv(
    '../datos/dataset_04-feateng-cursos.csv',
    index=False,
)