In [2]:
#Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [3]:
#Read the filtered dataset (the one holding only the selected features) from disk
dataEnem = pd.read_csv(
    "microdados_enem_2019/DADOS/brDataEnem.csv",
    encoding='ISO-8859-1',
)

In [4]:
#Work on an independent copy: a plain assignment would only bind a second name
#to the same frame, so the column edits made in later cells would also change
#dataEnem, the data exactly as it was read from disk
brDataEnem = dataEnem.copy()
brDataEnem.head()

Unnamed: 0.1,Unnamed: 0,NU_INSCRICAO,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ESCOLA,TP_ENSINO,SG_UF_ESC,TP_DEPENDENCIA_ADM_ESC,TP_LOCALIZACAO_ESC,TP_SIT_FUNC_ESC,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
0,2,190001004629,1,2,3,1.0,PA,4.0,1.0,1.0,...,B,A,A,D,B,B,D,A,C,B
1,7,190001004634,1,2,2,1.0,PA,2.0,1.0,1.0,...,A,A,A,B,A,A,B,A,A,A
2,15,190001004642,1,2,2,,PA,2.0,1.0,1.0,...,A,A,A,B,B,A,D,A,A,A
3,16,190001004643,1,2,2,1.0,PA,2.0,1.0,1.0,...,A,A,A,D,A,A,C,A,A,A
4,24,190001004651,1,2,2,1.0,PA,2.0,1.0,1.0,...,B,A,A,B,A,A,D,A,A,B


In [5]:
#Parsing Questions to Numbers
#Q005 already has numbers as input
questions = ['Q001', 'Q002', 'Q003', 'Q004', 'Q006', 'Q007', 'Q008', 
             'Q009', 'Q010', 'Q011', 'Q012', 'Q013', 'Q014','Q015', 'Q016', 
             'Q017', 'Q018', 'Q019', 'Q020', 'Q021', 'Q022','Q023', 'Q024', 'Q025']

#Replace every answer letter by its 1-based alphabet position ('A' -> 1, 'B' -> 2, ...)
for question in questions:
    answers = brDataEnem[question]
    #A numeric column means this cell already ran on the frame; skipping it keeps
    #a re-run from crashing on ord() of an integer
    if pd.api.types.is_numeric_dtype(answers):
        continue
    #na_action='ignore' keeps a missing answer as NaN instead of raising TypeError
    #inside ord(); a column with missing answers therefore ends up float, not int
    brDataEnem[question] = answers.map(lambda letter: ord(letter) - 64, na_action='ignore')
brDataEnem.head()

Unnamed: 0.1,Unnamed: 0,NU_INSCRICAO,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ESCOLA,TP_ENSINO,SG_UF_ESC,TP_DEPENDENCIA_ADM_ESC,TP_LOCALIZACAO_ESC,TP_SIT_FUNC_ESC,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
0,2,190001004629,1,2,3,1.0,PA,4.0,1.0,1.0,...,2,1,1,4,2,2,4,1,3,2
1,7,190001004634,1,2,2,1.0,PA,2.0,1.0,1.0,...,1,1,1,2,1,1,2,1,1,1
2,15,190001004642,1,2,2,,PA,2.0,1.0,1.0,...,1,1,1,2,2,1,4,1,1,1
3,16,190001004643,1,2,2,1.0,PA,2.0,1.0,1.0,...,1,1,1,4,1,1,3,1,1,1
4,24,190001004651,1,2,2,1.0,PA,2.0,1.0,1.0,...,2,1,1,2,1,1,4,1,1,2


In [6]:
#Save the parsed frame; index=False stops the row index from being written as one
#more unnamed column (the loaded data already carries 'Unnamed: 0' and
#'Unnamed: 0.1' left behind by earlier saves that included it)
brDataEnem.to_csv('DataEnemQuestionsParsed.csv', index=False)

In [7]:
#Median score of each of the five tests, in the order
#essay (REDACAO), CN, CH, LC, MT
medianRe, medianCN, medianCH, medianLC, medianMT = (
    brDataEnem[score_column].median()
    for score_column in (
        'NU_NOTA_REDACAO',
        'NU_NOTA_CN',
        'NU_NOTA_CH',
        'NU_NOTA_LC',
        'NU_NOTA_MT',
    )
)
   

In [8]:
#Add one binary label column per test (Re, CN, CH, LC, MT):
#1 = score greater than or equal to that test's median, 0 = otherwise
#A missing (NaN) score, if there is any, compares False and so also gets 0
for label_column, score_column, threshold in (
    ('Re', 'NU_NOTA_REDACAO', medianRe),
    ('CN', 'NU_NOTA_CN', medianCN),
    ('CH', 'NU_NOTA_CH', medianCH),
    ('LC', 'NU_NOTA_LC', medianLC),
    ('MT', 'NU_NOTA_MT', medianMT),
):
    reached_median = brDataEnem[score_column] >= threshold
    brDataEnem[label_column] = np.where(reached_median, 1, 0)


In [9]:
#Feature matrix: school type, the questionnaire answers Q001-Q025 and the
#school's administrative dependency
X = brDataEnem.loc[:, [
    'TP_ESCOLA',
    'Q001', 'Q002', 'Q003', 'Q004', 'Q005',
    'Q006', 'Q007', 'Q008', 'Q009', 'Q010',
    'Q011', 'Q012', 'Q013', 'Q014', 'Q015',
    'Q016', 'Q017', 'Q018', 'Q019', 'Q020',
    'Q021', 'Q022', 'Q023', 'Q024', 'Q025',
    'TP_DEPENDENCIA_ADM_ESC',
]]
#Target: the binary MT label (1 = score at or above the MT median)
y = brDataEnem.loc[:, 'MT']


In [10]:
#Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
SEED = 30
treino_x, teste_x, treino_y, teste_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=SEED,
)


In [11]:
#Standardize the features, then fit a linear SVM on the training split
modelo = make_pipeline(
    StandardScaler(),
    LinearSVC(dual=False, max_iter=5000),
).fit(treino_x, treino_y)

#Score the held-out split and report accuracy as a percentage
previsoes = modelo.predict(teste_x)
acuracia = 100 * accuracy_score(teste_y, previsoes)
print("Accuracy: %.2f%%" % acuracia)

Accuracy: 66.87%
