In [2]:
# Creating our internal database with sqlite3 and sqlalchemy.
import sqlite3
from sqlalchemy import create_engine
import pandas as pd
csv_database = create_engine('sqlite:///csv_database.db')
'''for chunk in pd.read_csv('MICRODADOS_ENEM_2019.csv',encoding = 'ISO-8859-1', 
                         sep = ';', chunksize = 50000, iterator = True):
    chunk.to_sql('enem', csv_database, if_exists = 'append')'''

"for chunk in pd.read_csv('MICRODADOS_ENEM_2019.csv',encoding = 'ISO-8859-1', \n                         sep = ';', chunksize = 50000, iterator = True):\n    chunk.to_sql('enem', csv_database, if_exists = 'append')"

In [3]:
# And now, the informations will be pulled from the database through the use of the read_sql method!
# To enhance the speed of the DataFrame receipt, we'll only list the columns that we'll be of use in the project. 
import pandas as pd
import numpy as np
query = ''' SELECT TP_ST_CONCLUSAO, TP_ESCOLA, TP_PRESENCA_CN, TP_PRESENCA_CH, NU_NOTA_LC, NU_NOTA_CH,NU_NOTA_MT, 
            NU_NOTA_CN, NU_NOTA_REDACAO
            FROM ENEM'''
enem = pd.read_sql(query, csv_database)
enem.head()

OperationalError: (sqlite3.OperationalError) no such table: ENEM
[SQL:  SELECT TP_ST_CONCLUSAO, TP_ESCOLA, TP_PRESENCA_CN, TP_PRESENCA_CH, NU_NOTA_LC, NU_NOTA_CH,NU_NOTA_MT, 
            NU_NOTA_CN, NU_NOTA_REDACAO
            FROM ENEM]
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [21]:
# Here, we are filtering out the people who did not taken the first or second day of ENEM.
# With this inconvenience corrected we can code the tasks solutions.
enem = enem[(enem['TP_PRESENCA_CN'] == 1) & (enem['TP_PRESENCA_CH'] == 1)]

# Another useful procedure to be conducted is the grades normalization. Unlike other tests you may have taken, ENEM 
# assesses the students grades based on how they performed in comparison to others in each subject.
def min_max(column):
    return (column - column.min()) / (column.max() - column.min())

enem[['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO']] = (enem[
                                                                            ['NU_NOTA_CN', 'NU_NOTA_CH',
                                                                            'NU_NOTA_LC', 'NU_NOTA_MT',
                                                                             'NU_NOTA_REDACAO']].apply(min_max))

enem.head()

In [None]:
# Getting the private and public schools grades separately. 
private = (enem[(enem['TP_ST_CONCLUSAO'] == 2 ) & (enem['TP_ESCOLA'] == 3)]
           [['NU_NOTA_LC', 'NU_NOTA_CH', 'NU_NOTA_MT', 'NU_NOTA_CN', 'NU_NOTA_REDACAO']].mean())

private.name = 'Private'
public = (enem[(enem['TP_ST_CONCLUSAO'] == 2 ) & (enem['TP_ESCOLA'] == 2)]
          [['NU_NOTA_LC', 'NU_NOTA_CH', 'NU_NOTA_MT', 'NU_NOTA_CN', 'NU_NOTA_REDACAO']].mean()) 

public.name= 'Public'

In [None]:
# Merging the Series produced, from here it is evident the poorer performance from government schools students.
grades = pd.merge(private, public, left_index = True, right_index = True).stack().reset_index()
grades.columns = ['Subject', 'School Type', 'Avg Grade']

In [None]:
# To end the task let's plot a bar chart exposing the grade differences.
# Private school students outperform the others in ENEM's every single subject.
import seaborn as sns
plt.figure(figsize = (8, 5))
sns.set_style('dark')
bars = sns.barplot(x = 'Subject', y = 'Avg Grade' ,hue = 'School Type',data = grades)

# Setting the aesthetic aspects of the chart.
bars.set_title('ENEM Private x Public School Students Performance', fontfamily = 'Times' ,fontsize = 14, pad = 5)
bars.set(xlabel = None, ylabel = None, yticks = [], ylim = [0,1], 
         xticklabels =['Portuguese', 'Human Sciences', 'Math', 'Natural Sciences', 'Composition'] )

for i in range(0, len(grades), 2):
    plt.text( (i/2)-0.3 , grades.iloc[i, 2] - 0.1, f'{grades.iloc[i, 2]:.2f}', rotation = 270, color = 'white',
            fontsize = 13, weight = 'bold')
    
for i in range(1, len(grades), 2):
    plt.text( (i/2)-0.4 , grades.iloc[i, 2] - 0.1, f'{grades.iloc[i, 2]:.2f}', rotation = 270, color = 'white',
             fontsize = 13,weight = 'bold')
plt.legend(loc = 'upper right', frameon = False)