## Predizer a nota de matemática de um candidato do ENEM

### Problema da plataforma codenation:
http://www.codenation.com.br/journey/data-science/challenge/enem-2.html

### Importação das bibliotecas

In [21]:
import pandas as pd
import math
from sklearn import model_selection
import xgboost as xgb
from sklearn import metrics

In [22]:
data = pd.read_csv("train.csv")

In [23]:
data.head()

Unnamed: 0.1,Unnamed: 0,NU_INSCRICAO,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,...,Q041,Q042,Q043,Q044,Q045,Q046,Q047,Q048,Q049,Q050
0,1,ed50e8aaa58e7a806c337585efee9ca41f1eb1ad,2016,4314902,Porto Alegre,43,RS,24,M,0.0,...,5.0,A,A,A,A,A,A,A,B,D
1,2,2c3acac4b33ec2b195d77e7c04a2d75727fad723,2016,2304707,Granja,23,CE,17,F,0.0,...,,A,A,C,A,B,A,A,C,A
2,3,f4545f8ccb9ff5c8aad7d32951b3f251a26e6568,2016,2304400,Fortaleza,23,CE,21,F,0.0,...,,A,A,A,A,C,A,A,B,A
3,4,3d6ec248fef899c414e77f82d5c6d2bffbeaf7fe,2016,3304557,Rio de Janeiro,33,RJ,25,F,0.0,...,5.0,C,A,A,A,A,D,A,A,A
4,5,bf896ac8d3ecadd6dba1dfbf50110afcbf5d3268,2016,1302603,Manaus,13,AM,28,M,0.0,...,,A,A,A,A,A,A,A,A,A


## Filtragem das colunas importantes para um novo dataset

In [24]:
data.shape

(13730, 167)

In [25]:
columns = [
    'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO',
    'NU_IDADE', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 
    'TP_ESCOLA', 'IN_TREINEIRO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
    'TP_STATUS_REDACAO', 'NU_NOTA_MT'
]

In [26]:
data = data[columns]

In [27]:
data.shape

(13730, 14)

In [28]:
data.head()

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_REDACAO,NU_IDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,IN_TREINEIRO,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_STATUS_REDACAO,NU_NOTA_MT
0,436.3,495.4,581.2,520.0,24,1,4,1,0,1,1,1,1.0,399.4
1,474.5,544.1,599.0,580.0,17,2,0,2,0,1,1,1,1.0,459.8
2,,,,,21,3,0,1,0,0,0,0,,
3,,,,,25,1,9,1,0,0,0,0,,
4,,,,,28,1,4,1,0,0,0,0,,


## Remoção das linhas com elementos faltantes

In [29]:
data.isnull().sum()

NU_NOTA_CN           3389
NU_NOTA_CH           3389
NU_NOTA_LC           3597
NU_NOTA_REDACAO      3597
NU_IDADE                0
TP_ST_CONCLUSAO         0
TP_ANO_CONCLUIU         0
TP_ESCOLA               0
IN_TREINEIRO            0
TP_PRESENCA_CN          0
TP_PRESENCA_CH          0
TP_PRESENCA_LC          0
TP_STATUS_REDACAO    3597
NU_NOTA_MT           3597
dtype: int64

In [30]:
data.fillna(0, inplace=True)

## Criação dos datasets de treino e testes

In [31]:
X = data.values[:, :-1]
y = data.values[:, -1]

In [32]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
    test_size=0.3, random_state=1)

## Criação do modelo preditivo

In [33]:
modelo_xgb = xgb.sklearn.XGBRegressor(random_state=1, max_depth=5, learning_rate=0.01, n_estimators=1000)

In [34]:
eval_set = [(X_test, y_test)]

## Treino do modelo preditivo

In [35]:
modelo_xgb.fit(X_train, y_train, early_stopping_rounds=100, eval_set=eval_set, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=1,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [36]:
modelo_xgb.best_iteration

557

## Teste e avaliação do modelo preditivo

In [37]:
resposta = modelo_xgb.predict(X_test)

In [38]:
print("MSE: ")
print(metrics.mean_squared_error(y_test, resposta))
print("RMSE: ")
print(math.sqrt(metrics.mean_squared_error(y_test, resposta)))
print("MAE: ")
print(metrics.mean_absolute_error(y_test, resposta))

MSE: 
3852.5363423806557
RMSE: 
62.06880329425287
MAE: 
41.72683082201378


## Criação do modelo preditivo definitivo

In [39]:
columns = [
    'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO',
    'NU_IDADE', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 
    'TP_ESCOLA', 'IN_TREINEIRO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
    'TP_STATUS_REDACAO'
]

In [40]:
test = pd.read_csv('test.csv')

In [41]:
inscricoes = test['NU_INSCRICAO']

In [42]:
test = test[columns]

In [43]:
test.fillna(0, inplace=True)

In [44]:
modelo_xgb_definitivo = xgb.sklearn.XGBRegressor(random_state=1, max_depth=5, learning_rate=0.01, n_estimators=1000)

In [45]:
modelo_xgb_definitivo.fit(X, y, early_stopping_rounds=100, eval_set=eval_set, verbose=False);

In [46]:
resposta_definitiva = modelo_xgb_definitivo.predict(test.values)

## Criação do arquivo de respostas

In [47]:
answer = pd.DataFrame()

In [48]:
answer['NU_INSCRICAO'] = inscricoes
answer['NU_NOTA_MT'] = resposta_definitiva

In [49]:
answer.head()

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,436.790894
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,461.720337
2,b38a03232f43b11c9d0788abaf060f7366053b6d,600.340332
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,0.62228
4,715494628a50142ce8cb17191cfe6d0f3cae0934,542.281799


In [50]:
answer.to_csv("answerXgb.csv", index=False)