# Brazilian High School National Exam (ENEM 2019)
The ENEM exam consists of 180 multiple-choice questions and tests students in five main areas: natural sciences, math, human sciences, Portuguese, and either English or Spanish as a foreign language. Students are also required to write an essay.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# INEP publishes the exam data in a standardized layout, so this EDA is driven by
# the settings below and can be pointed at any desired year.
year = 2019
# Minimum wage in R$; used to build the labels of the family-income brackets.
minimum_wage = 998.00

In [3]:
# The original dataset is large, so only the columns this EDA actually uses are kept.
essentials = [
    # Candidate profile
    'SG_UF_RESIDENCIA', 'TP_SEXO', 'TP_COR_RACA', 'TP_ESCOLA',
    # Objective test grades
    'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT',
    # Foreign language choice
    'TP_LINGUA',
    # Essay: the five competence grades followed by the overall grade
    *(f'NU_NOTA_COMP{number}' for number in range(1, 6)),
    'NU_NOTA_REDACAO',
    # Questionnaire answers (income, computer, internet)
    'Q006', 'Q024', 'Q025',
]

In [4]:
# Load the dataset for the configured `year`.
# `usecols` makes the parser skip every non-essential column while reading, so the
# full-width file is never held in memory. `usecols` ignores the order of the names
# it is given, so the trailing `[essentials]` restores the order of that list.
df = pd.read_csv(
    f'input/AMOSTRA_ENEM_{year}.csv',
    delimiter=';',
    encoding='ISO-8859-1',
    usecols=essentials,
)[essentials]
df.head(2)

Unnamed: 0,SG_UF_RESIDENCIA,TP_SEXO,TP_COR_RACA,TP_ESCOLA,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,TP_LINGUA,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,Q006,Q024,Q025
0,MA,F,1,1,434.7,479.4,435.3,361.6,1,100.0,120.0,80.0,100.0,100.0,500.0,B,A,A
1,SP,F,5,2,,408.1,507.3,,1,80.0,120.0,120.0,100.0,80.0,500.0,C,A,A


In [5]:
# Translation table from the original column codes to the English column names
# used by this notebook.
# NOTE: the spellings 'Foreign Languange' and 'Internet Acess' are kept exactly,
# because the mapping cell looks the columns up by these names.
columns_dictionary = {
    # Candidate profile
    'SG_UF_RESIDENCIA': 'State',
    'TP_SEXO': 'Gender',
    'TP_COR_RACA': 'Skincolor',
    'TP_ESCOLA': 'School Kind',
    # Objective test grades
    'NU_NOTA_CN': 'Science Grade',
    'NU_NOTA_CH': 'Humanist Grade',
    'NU_NOTA_LC': 'Linguistic Grade',
    'NU_NOTA_MT': 'Math Grade',
    'TP_LINGUA': 'Foreign Languange',
    # Essay competences I-V; more information at https://bit.ly/3hHUhLx
    **{
        f'NU_NOTA_COMP{number}': f'Essay Comp {numeral}'
        for number, numeral in enumerate(('I', 'II', 'III', 'IV', 'V'), start=1)
    },
    'NU_NOTA_REDACAO': 'Essay Grade',
    # Household information
    'Q006': 'Family Income',
    'Q024': 'Computer in Home',
    'Q025': 'Internet Acess',
}

# Labels for the skin color codes; code 0 (undeclared) is treated as missing.
skinColor_dictionary = {
    0: np.nan,
    **dict(enumerate(('White', 'Black', 'Brown', 'Yellow', 'Indigenous'), start=1)),
}

# Labels for the school kind codes; code 1 (undeclared) is treated as missing.
school_dictionary = dict(zip((1, 2, 3), (np.nan, 'Public', 'Private')))

# Labels for the family income answers, expressed in R$ from `minimum_wage`.
# Consecutive entries of `_income_bounds` are the lower/upper multipliers of the
# brackets 'C' to 'P'; each bracket starts one cent above the previous upper limit.
# 'A' maps to the integer 0, while every other label is a string.
_income_bounds = (1, 1.5, 2, 2.5, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20)
income_dictionary = {
    'A': 0,
    'B': f'Up to R${minimum_wage}',
    **{
        answer: f'R${lower * minimum_wage + 0.01} - R${upper * minimum_wage}'
        for answer, lower, upper in zip('CDEFGHIJKLMNOP', _income_bounds, _income_bounds[1:])
    },
    'Q': f'Above to R${20 * minimum_wage}',
}

# Labels for the foreign language codes: 0 is English, 1 is Spanish.
languange_dictionary = dict(enumerate(('English', 'Spanish')))

# Labels for the single-letter gender codes.
gender_dictionary = dict(zip('FM', ('Female', 'Male')))

# Labels for the "computer in home" answers 'A' to 'E'.
# NOTE: 'A' becomes the string 'None', not a missing value.
computer_dictionary = dict(zip('ABCDE', ('None', 'One', 'Two', 'Three', 'Four or more')))

# Labels for the internet access answers.
internet_dictionary = dict(zip('AB', ('No', 'Yes')))

In [6]:
#Mapping dataframe
df.rename(columns= columns_dictionary, inplace= True)
df['Gender']= df['Gender'].map(gender_dictionary)
df['Skincolor']= df['Skincolor'].map(skinColor_dictionary)
df['School Kind']= df['School Kind'].map(school_dictionary)
df['Foreign Languange']= df['Foreign Languange'].map(languange_dictionary)
df['Family Income']= df['Family Income'].map(income_dictionary)
df['Computer in Home']= df['Computer in Home'].map(computer_dictionary)
df['Internet Acess']= df['Internet Acess'].map(internet_dictionary)

In [7]:
# Preview the first five decoded rows.
df.head(5)

Unnamed: 0,State,Gender,Skincolor,School Kind,Science Grade,Humanist Grade,Linguistic Grade,Math Grade,Foreign Languange,Essay Comp I,Essay Comp II,Essay Comp III,Essay Comp IV,Essay Comp V,Essay Grade,Family Income,Computer in Home,Internet Acess
0,MA,Female,White,,434.7,479.4,435.3,361.6,Spanish,100.0,120.0,80.0,100.0,100.0,500.0,Up to R$998.0,,No
1,SP,Female,Indigenous,Public,,408.1,507.3,,Spanish,80.0,120.0,120.0,100.0,80.0,500.0,R$998.01 - R$1497.0,,No
2,SP,Male,White,Public,,,,,English,,,,,,,R$2994.01 - R$3992.0,One,Yes
3,AL,Male,Brown,,363.6,471.2,516.2,456.8,Spanish,140.0,120.0,100.0,100.0,100.0,560.0,Up to R$998.0,One,No
4,DF,Male,Brown,,681.1,673.2,622.2,797.5,English,160.0,140.0,140.0,160.0,140.0,740.0,R$11976.01 - R$14970.0,Two,Yes
