# Brazilian High School National Exam (ENEM 2019)
The ENEM exam comprises of 180 multiple-choice questions and tests students in five main areas, namely natural sciences, math, human sciences, Portuguese and either English or Spanish as a foreign language. Students are also required to write an essay.

In [1]:
import pandas as pd
import numpy as np

In [2]:
#The information provided by the INEP is standardized, this EDA will be configured to adapt to any desired year.
year= 2019
minimum_wage= 998.00

## Initial dataframe preparation

In [3]:
#Due the big size of original dataset, only essential information that will be used on this EDA are recovered
essentials= ['SG_UF_RESIDENCIA', 'NO_MUNICIPIO_RESIDENCIA','TP_SEXO', 'TP_COR_RACA', 'TP_ESCOLA', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC',
    'NU_NOTA_MT', 'TP_LINGUA', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4',
    'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q006', 'Q024', 'Q025']

In [4]:
#Loading dataset
df= pd.read_csv('input/AMOSTRA_ENEM_2019.csv', delimiter= ';', encoding= 'UTF-8')[essentials]
df.head(2)

Unnamed: 0,SG_UF_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,TP_SEXO,TP_COR_RACA,TP_ESCOLA,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,TP_LINGUA,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,Q006,Q024,Q025
0,MA,Paço do Lumiar,F,1,1,434.7,479.4,435.3,361.6,1,100.0,120.0,80.0,100.0,100.0,500.0,B,A,A
1,SP,Orlândia,F,5,2,,408.1,507.3,,1,80.0,120.0,120.0,100.0,80.0,500.0,C,A,A


In [5]:
columns_dictionary={
    'SG_UF_RESIDENCIA': 'State',
    'NO_MUNICIPIO_RESIDENCIA': 'City/Town',
    'TP_SEXO': 'Gender',
    'TP_COR_RACA': 'Skincolor',
    'TP_ESCOLA': 'School Kind',
    'NU_NOTA_CN': 'Science Grade',
    'NU_NOTA_CH': 'Humanist Grade',
    'NU_NOTA_LC': 'Linguistic Grade',
    'NU_NOTA_MT': 'Math Grade',
    'TP_LINGUA': 'Foreign Languange',
    'NU_NOTA_COMP1': 'Essay Comp I',
    'NU_NOTA_COMP2': 'Essay Comp II',
    'NU_NOTA_COMP3': 'Essay Comp III', 
    'NU_NOTA_COMP4': 'Essay Comp IV', 
    'NU_NOTA_COMP5': 'Essay Comp V', 
    #More information about essay competences go to: https://bit.ly/3hHUhLx
    'NU_NOTA_REDACAO': 'Essay Grade',
    'Q006': 'Family Income',  
    'Q024': 'Computer in Home', 
    'Q025': 'Internet Acess',
}

skinColor_dictionary= {
    0:np.nan, #Undeclared
    1:'White',
    2:'Black',
    3:'Brown',
    4:'Yellow',
    5:'Indigenous'
}

school_dictionary= {
    1:np.nan, #Undeclared
    2:'Public',
    3:'Private',
}

income_dictionary={
    'A': 0,
    'B': f'Up to R${minimum_wage}',
    'C': f'R${minimum_wage+ 0.01} - R${1.5* minimum_wage}',
    'D': f'R${1.5*minimum_wage+ 0.01} - R${2* minimum_wage}',
    'E': f'R${2*minimum_wage+ 0.01} - R${2.5* minimum_wage}',
    'F': f'R${2.5*minimum_wage+ 0.01} - R${3* minimum_wage}',
    'G': f'R${3*minimum_wage+ 0.01} - R${4* minimum_wage}',
    'H': f'R${4*minimum_wage+ 0.01} - R${5* minimum_wage}',
    'I': f'R${5*minimum_wage+ 0.01} - R${6* minimum_wage}',
    'J': f'R${6*minimum_wage+ 0.01} - R${7* minimum_wage}',
    'K': f'R${7*minimum_wage+ 0.01} - R${8* minimum_wage}',
    'L': f'R${8*minimum_wage+ 0.01} - R${9* minimum_wage}',
    'M': f'R${9*minimum_wage+ 0.01} - R${10* minimum_wage}',
    'N': f'R${10*minimum_wage+ 0.01} - R${12* minimum_wage}',
    'O': f'R${12*minimum_wage+ 0.01} - R${15* minimum_wage}',
    'P': f'R${15*minimum_wage+ 0.01} - R${20* minimum_wage}',
    'Q': f'Above to R${20* minimum_wage}',
}

languange_dictionary= {
    0: 'English',
    1: 'Spanish',
}

gender_dictionary= {
    'F': 'Female',
    'M': 'Male'
}

computer_dictionary= {
    'A': 'None',
    'B': 'One',
    'C': 'Two',
    'D': 'Three',
    'E': 'Four or more',
}

internet_dictionary= {
    'A': 'No',
    'B': 'Yes',
}

In [6]:
#Mapping dataframe
df.rename(columns= columns_dictionary, inplace= True)
df['Gender']= df['Gender'].map(gender_dictionary)
df['Skincolor']= df['Skincolor'].map(skinColor_dictionary)
df['School Kind']= df['School Kind'].map(school_dictionary)
df['Foreign Languange']= df['Foreign Languange'].map(languange_dictionary)
df['Family Income']= df['Family Income'].map(income_dictionary)
df['Computer in Home']= df['Computer in Home'].map(computer_dictionary)
df['Internet Acess']= df['Internet Acess'].map(internet_dictionary)

In [7]:
df.head()

Unnamed: 0,State,City/Town,Gender,Skincolor,School Kind,Science Grade,Humanist Grade,Linguistic Grade,Math Grade,Foreign Languange,Essay Comp I,Essay Comp II,Essay Comp III,Essay Comp IV,Essay Comp V,Essay Grade,Family Income,Computer in Home,Internet Acess
0,MA,Paço do Lumiar,Female,White,,434.7,479.4,435.3,361.6,Spanish,100.0,120.0,80.0,100.0,100.0,500.0,Up to R$998.0,,No
1,SP,Orlândia,Female,Indigenous,Public,,408.1,507.3,,Spanish,80.0,120.0,120.0,100.0,80.0,500.0,R$998.01 - R$1497.0,,No
2,SP,Sertãozinho,Male,White,Public,,,,,English,,,,,,,R$2994.01 - R$3992.0,One,Yes
3,AL,Santana do Ipanema,Male,Brown,,363.6,471.2,516.2,456.8,Spanish,140.0,120.0,100.0,100.0,100.0,560.0,Up to R$998.0,One,No
4,DF,Brasília,Male,Brown,,681.1,673.2,622.2,797.5,English,160.0,140.0,140.0,160.0,140.0,740.0,R$11976.01 - R$14970.0,Two,Yes


## Dataframe basic informations

In [8]:
df.info(memory_usage= 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127382 entries, 0 to 127381
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   State              127382 non-null  object 
 1   City/Town          127382 non-null  object 
 2   Gender             127382 non-null  object 
 3   Skincolor          124694 non-null  object 
 4   School Kind        36796 non-null   object 
 5   Science Grade      92698 non-null   float64
 6   Humanist Grade     98007 non-null   float64
 7   Linguistic Grade   98007 non-null   float64
 8   Math Grade         92698 non-null   float64
 9   Foreign Languange  127382 non-null  object 
 10  Essay Comp I       98007 non-null   float64
 11  Essay Comp II      98007 non-null   float64
 12  Essay Comp III     98007 non-null   float64
 13  Essay Comp IV      98007 non-null   float64
 14  Essay Comp V       98007 non-null   float64
 15  Essay Grade        98007 non-null   float64
 16  Fa

In [9]:
print('Sum of null data. Purposely undeclared data was considered null.')
print('Null grades correspond to candidates who did not take the test due to no-show or elimination')
df.isna().sum()

Sum of null data. Purposely undeclared data was considered null.
Null grades correspond to candidates who did not take the test due to no-show or elimination


State                    0
City/Town                0
Gender                   0
Skincolor             2688
School Kind          90586
Science Grade        34684
Humanist Grade       29375
Linguistic Grade     29375
Math Grade           34684
Foreign Languange        0
Essay Comp I         29375
Essay Comp II        29375
Essay Comp III       29375
Essay Comp IV        29375
Essay Comp V         29375
Essay Grade          29375
Family Income            0
Computer in Home         0
Internet Acess           0
dtype: int64

## Analysis
### Geographical

In [10]:
candidates= df.shape[0]
cities= len(df['City/Town'].unique())
states= len(df['State'].unique())
print(f'{candidates} candidates from {cities} cities in {states} were registered for ENEM {year}')

127382 candidates from 4909 cities in 27 were registered for ENEM 2019


The Brazilian territory is divided into 27 federative units, consisting of 26 states, plus the Federal District. According to the Brazilian Institute of Geography and Statistics (IBGE), Brazil has 5570 cities. 
This number is referred to in the last 2010 Census.

More informations: https://bit.ly/30UP8sO

In [11]:
print(f'Therefore, the participation of brazilian municipalities in Enem {year} was {round(cities/5570, 2)*100}%.')

Therefore, the participation of brazilian municipalities in Enem 2019 was 88.0%.


In [12]:
#Loading external data about the number of cities or town per brazilian regions
url='https://bit.ly/2X4VK6y'
print('Number of mubicipalities per brazilians regions')
regions= pd.read_html(url, decimal= '.', encoding= 'UTF-8')[1][['Municípios']].rename(columns= {'Municípios':'Municipalities'})
regions.index= ['Northeast', 'Southeast', 'South', 'Midwest', 'North']
regions

Number of mubicipalities per brazilians regions


Unnamed: 0,Municipalities
Northeast,1.794
Southeast,1.668
South,1.191
Midwest,466.0
North,450.0


In [13]:
#Counting the number of municipalities where the exam occured
SE= (df['State'] == 'ES') | (df['State'] == 'MG') | (df['State'] == 'RJ') | (df['State'] == 'SP')
SE= len(df[SE]['City/Town'].unique())

NE= (df['State'] == 'BA') | (df['State'] == 'SE') | (df['State'] == 'AL') | (df['State'] == 'PE') | (df['State'] == 'PB') | (df['State'] == 'RN') | (df['State'] == 'CE') | (df['State'] == 'MA') | (df['State'] == 'PI')
NE= len(df[NE]['City/Town'].unique())

S= (df['State'] == 'PR') | (df['State'] == 'RS') | (df['State'] == 'SC')
S= len(df[S]['City/Town'].unique())

NW= (df['State'] == 'GO') | (df['State'] == 'MS') | (df['State'] == 'MT')
NW= len(df[NW]['City/Town'].unique())

N= (df['State'] == 'AC') | (df['State'] == 'AM') | (df['State'] == 'RO') | (df['State'] == 'RR') | (df['State'] == 'AP') | (df['State'] == 'PA') | (df['State'] == 'TO')
N= len(df[N]['City/Town'].unique())