In [1]:
import re
import os
import pandas as pd

In [2]:
# Read the raw ENEM 2018 export (semicolon-separated) from the data folder
# located under the current working directory.
enem_2018_path = os.getcwd() + '/data/enem2018/enem_2018.csv'
enem2018_df = pd.read_csv(enem_2018_path, sep=";")

# Replace every missing value with 0 before any further processing.
enem2018_df = enem2018_df.fillna(0)

# Quick inspection of what was loaded: dimensions first, then the leading rows.
print(enem2018_df.shape)
print(enem2018_df.head())

def process_enem_data(enem2018_df, input_format='csv', **options):
    """Clean the raw ENEM 2018 frame and rename its columns to English.

    Missing values are replaced with 0, rows that repeat an already-seen
    ``NU_INSCRICAO`` value are removed (the first occurrence is kept), and
    the columns listed in the mapping below are renamed. Columns that are
    not part of the mapping are passed through unchanged.

    Parameters
    ----------
    enem2018_df : pandas.DataFrame
        Raw frame; must contain a ``NU_INSCRICAO`` column. It is not
        modified by this function.
    input_format : str, optional
        Accepted for call compatibility; not used by this function.
    **options
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        A new, de-duplicated frame with missing values filled and columns
        renamed.

    Raises
    ------
    KeyError
        If the frame has no ``NU_INSCRICAO`` column.
    """
    # NOTE(review): 'matiral_status' looks like a misspelling of
    # 'marital_status'. It is kept as-is because it is the emitted column
    # name and consumers of the output may already depend on it.
    column_names = {
        'NU_INSCRICAO': 'registration',
        'CO_MUNICIPIO_RESIDENCIA': 'city_residence_code',
        'NO_MUNICIPIO_RESIDENCIA': 'city_residence',
        'CO_UF_RESIDENCIA': 'state_residence_code',
        'SG_UF_RESIDENCIA': 'state_residence',
        'NU_IDADE': 'age',
        'TP_SEXO': 'gender',
        'TP_ESTADO_CIVIL': 'matiral_status',
        'TP_COR_RACA': 'color_race',
        'TP_NACIONALIDADE': 'nationality',
        'TP_ST_CONCLUSAO': 'high_school_status',
        'TP_ANO_CONCLUIU': 'high_school_year_conclusion',
        'TP_ESCOLA': 'school_type',
        'IN_BAIXA_VISAO': 'def_low_vision',
        'IN_CEGUEIRA': 'def_blind',
        'IN_SURDEZ': 'def_deaf',
        'IN_DEFICIENCIA_AUDITIVA': 'def_low_hearing',
        'IN_SURDO_CEGUEIRA': 'def_blind_deaf',
        'IN_DEFICIENCIA_FISICA': 'def_physical',
        'IN_DEFICIENCIA_MENTAL': 'def_mental',
        'IN_DEFICIT_ATENCAO': 'def_attention',
        'IN_DISLEXIA': 'def_dyslexia',
        'IN_DISCALCULIA': 'def_dyscalculia',
        'IN_AUTISMO': 'def_autism',
        'IN_VISAO_MONOCULAR': 'def_monocular_vision',
        'IN_OUTRA_DEF': 'def_other',
        'IN_NOME_SOCIAL': 'social_name',
        'CO_MUNICIPIO_PROVA': 'city_test_code',
        'NO_MUNICIPIO_PROVA': 'city_test',
        'CO_UF_PROVA': 'state_test_code',
        'SG_UF_PROVA': 'state_test',
        'TP_PRESENCA_CN': 'presence_natural_science',
        'TP_PRESENCA_CH': 'presence_human_science',
        'TP_PRESENCA_LC': 'presence_languages',
        'TP_PRESENCA_MT': 'presence_math',
        'NU_NOTA_CN': 'grade_natural_science',
        'NU_NOTA_CH': 'grade_human_science',
        'NU_NOTA_LC': 'grade_languages',
        'NU_NOTA_MT': 'grade_math',
        'TP_STATUS_REDACAO': 'essay_status',
        'NU_NOTA_REDACAO': 'grade_essay',
    }

    # Each step returns a new frame. drop_duplicates in particular is not
    # in-place, so its result has to be kept or the call has no effect.
    cleaned = enem2018_df.fillna(0)
    cleaned = cleaned.drop_duplicates('NU_INSCRICAO')
    return cleaned.rename(columns=column_names)

# Run the cleaning/renaming step on the loaded ENEM frame and preview it.
enem = process_enem_data(enem2018_df)
print(enem.shape)
print(enem.head())

# Write the processed frame as a semicolon-separated CSV, leaving out the
# index column.
enem.to_csv(
    path_or_buf=os.getcwd() + '/data/analysis/enem_analysis.csv',
    sep=';',
    index=False,
)

print('ENEM process has been finalized')

(5513747, 41)
   NU_INSCRICAO  CO_MUNICIPIO_RESIDENCIA NO_MUNICIPIO_RESIDENCIA  \
0  180008202043                  5300108                Brasília   
1  180007197856                  2111102      São João dos Patos   
2  180008517434                  3530607         Mogi das Cruzes   
3  180007661228                  2916401              Itapetinga   
4  180008787987                  2918100               Jeremoabo   

   CO_UF_RESIDENCIA SG_UF_RESIDENCIA  NU_IDADE TP_SEXO  TP_ESTADO_CIVIL  \
0                53               DF      44.0       M              1.0   
1                21               MA      23.0       F              0.0   
2                35               SP      23.0       F              0.0   
3                29               BA      26.0       F              0.0   
4                29               BA      20.0       M              0.0   

   TP_COR_RACA  TP_NACIONALIDADE  ...  TP_PRESENCA_CN  TP_PRESENCA_CH  \
0            1                 0  ...             0.0

In [3]:
# Read the Brazilian cities dataset (semicolon-separated) from the data folder
# located under the current working directory.
brazil_cities_path = os.getcwd() + '/data/brazil_cities.csv'
brazil_df = pd.read_csv(brazil_cities_path, sep=";")

# Replace every missing value with 0 before any further processing.
brazil_df = brazil_df.fillna(0)

# Quick inspection of what was loaded: dimensions first, then the leading rows.
print(brazil_df.shape)
print(brazil_df.head())

def process_cities_data(brazil_df, input_format='csv', **options):
    """Clean the Brazilian cities frame and keep a renamed subset of columns.

    Missing values are replaced with 0, rows that repeat an already-seen
    (``CITY``, ``STATE``) pair are removed (the first occurrence is kept),
    only the columns listed below are retained, and those columns are
    renamed to lowercase English names.

    Parameters
    ----------
    brazil_df : pandas.DataFrame
        Raw frame; must contain ``CITY`` and ``STATE`` columns. It is not
        modified by this function.
    input_format : str, optional
        Accepted for call compatibility; not used by this function.
    **options
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        A new, de-duplicated frame restricted to the selected columns and
        renamed. Selected columns that are absent from the input are simply
        not present in the result.

    Raises
    ------
    KeyError
        If the frame lacks the ``CITY`` or ``STATE`` column.
    """
    # Source column -> output column. The key order also defines the column
    # order of the returned frame.
    column_names = {
        'CITY': 'city',
        'STATE': 'state',
        'CAPITAL': 'capital',
        'IDHM Ranking 2010': 'hdi_ranking',
        'IDHM': 'hdi',
        'IDHM_Renda': 'hdi_gni',
        'IDHM_Longevidade': 'hdi_life',
        'IDHM_Educacao': 'hdi_education',
        'LONG': 'longitude',
        'LAT': 'latitude',
        'ALT': 'altitude',
    }

    # Each step returns a new frame. drop_duplicates in particular is not
    # in-place, so its result has to be kept or the call has no effect.
    cleaned = brazil_df.fillna(0)
    cleaned = cleaned.drop_duplicates(['CITY', 'STATE'], keep='first')
    cleaned = cleaned.filter(items=list(column_names))
    return cleaned.rename(columns=column_names)

# Run the cleaning/selection/renaming step on the cities frame and preview it.
brazil = process_cities_data(brazil_df)
print(brazil.shape)
print(brazil.head())

# Write the processed frame as a semicolon-separated CSV, leaving out the
# index column.
brazil.to_csv(
    path_or_buf=os.getcwd() + '/data/analysis/cities_analysis.csv',
    sep=';',
    index=False,
)

print('Process of Brazilian cities has been finalized')

(5573, 81)
                  CITY STATE  CAPITAL  IBGE_RES_POP  IBGE_RES_POP_BRAS  \
0      Abadia De Goiás    GO        0        6876.0             6876.0   
1  Abadia Dos Dourados    MG        0        6704.0             6704.0   
2            Abadiânia    GO        0       15757.0            15609.0   
3               Abaeté    MG        0       22690.0            22690.0   
4           Abaetetuba    PA        0      141100.0           141040.0   

   IBGE_RES_POP_ESTR  IBGE_DU  IBGE_DU_URBAN  IBGE_DU_RURAL  IBGE_POP  ...  \
0                0.0   2137.0         1546.0          591.0    5300.0  ...   
1                0.0   2328.0         1481.0          847.0    4154.0  ...   
2              148.0   4655.0         3233.0         1422.0   10656.0  ...   
3                0.0   7694.0         6667.0         1027.0   18464.0  ...   
4               60.0  31061.0        19057.0        12004.0   82956.0  ...   

   Pu_Bank   Pr_Assets    Pu_Assets    Cars  Motorcycles  Wheeled_tractor  