In [1316]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import scipy.stats as sct
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import (
    OneHotEncoder, Binarizer, KBinsDiscretizer,
    MinMaxScaler, StandardScaler, PolynomialFeatures
)
from sklearn.linear_model import LogisticRegression
import operator
from sklearn.model_selection import train_test_split

In [1317]:
from IPython.core.pylabtools import figsize

# Notebook-wide plotting defaults: default figure size first, then the
# seaborn defaults on top.
figsize(sizex=12, sizey=8)
sns.set()

In [1318]:
# Raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [1319]:
# Display the loaded training frame (the notebook repr truncates long/wide frames).
df

Unnamed: 0.1,Unnamed: 0,NU_INSCRICAO,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,...,Q041,Q042,Q043,Q044,Q045,Q046,Q047,Q048,Q049,Q050
0,1,ed50e8aaa58e7a806c337585efee9ca41f1eb1ad,2016,4314902,Porto Alegre,43,RS,24,M,0.0,...,5.0,A,A,A,A,A,A,A,B,D
1,2,2c3acac4b33ec2b195d77e7c04a2d75727fad723,2016,2304707,Granja,23,CE,17,F,0.0,...,,A,A,C,A,B,A,A,C,A
2,3,f4545f8ccb9ff5c8aad7d32951b3f251a26e6568,2016,2304400,Fortaleza,23,CE,21,F,0.0,...,,A,A,A,A,C,A,A,B,A
3,4,3d6ec248fef899c414e77f82d5c6d2bffbeaf7fe,2016,3304557,Rio de Janeiro,33,RJ,25,F,0.0,...,5.0,C,A,A,A,A,D,A,A,A
4,5,bf896ac8d3ecadd6dba1dfbf50110afcbf5d3268,2016,1302603,Manaus,13,AM,28,M,0.0,...,,A,A,A,A,A,A,A,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,4582,aadf671ef8c1c6efa42e69432093ab83c17a52ac,2016,3502903,Araçoiaba da Serra,35,SP,20,M,0.0,...,,A,A,A,A,A,A,A,A,A
13726,4583,461f7ef2753d9d5fa0b054930d6b0e0eec318e81,2016,2613800,São Vicente Ferrer,26,PE,33,F,0.0,...,,A,A,B,B,A,A,A,A,A
13727,4584,5354503700ecf2388f68990435390b6e1ad454ee,2016,4322186,Tupanci do Sul,43,RS,15,F,0.0,...,,A,A,A,A,C,A,A,A,A
13728,4585,c7a9e94a42bd76b7978f13c013a61702e36cc42c,2016,3303203,Nilópolis,33,RJ,36,M,1.0,...,5.0,A,A,A,A,D,A,A,A,D


In [1320]:
# Correlation of every numeric column with the target IN_TREINEIRO.
# - df.columns[1:] skips the first column (shown as "Unnamed: 0" in the frame
#   display; presumably a saved index left over in the CSV).
# - Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer
#   drops them silently inside DataFrame.corr() and raises instead.
corr = (
    df[df.columns[1:]]
    .select_dtypes(include=["number", "bool"])
    .corr()["IN_TREINEIRO"]
)

# Six strongest positive correlations (the first one is the target itself).
corr.sort_values(ascending=False).head(6)

IN_TREINEIRO       1.000000
TP_ST_CONCLUSAO    0.533983
TP_PRESENCA_CN     0.094692
TP_PRESENCA_CH     0.094692
TP_PRESENCA_LC     0.092454
TP_PRESENCA_MT     0.092454
Name: IN_TREINEIRO, dtype: float64

In [1321]:
# Five most negative correlations with the target.
corr.sort_values(ascending=True).head(n=5)

NU_IDADE          -0.295091
TP_ANO_CONCLUIU   -0.257710
TP_ESCOLA         -0.244562
IN_CERTIFICADO    -0.142185
TP_ESTADO_CIVIL   -0.117932
Name: IN_TREINEIRO, dtype: float64

In [1322]:
# Candidate predictors picked from the correlation ranking, plus the target.
columns = [
    "TP_ST_CONCLUSAO",
    "NU_IDADE",
    "TP_ANO_CONCLUIU",
    "IN_TREINEIRO",
]

# Column subset of the training frame, in the order listed above.
enem_features = df.reindex(columns=columns)
enem_features

Unnamed: 0,TP_ST_CONCLUSAO,NU_IDADE,TP_ANO_CONCLUIU,IN_TREINEIRO
0,1,24,4,0
1,2,17,0,0
2,3,21,0,0
3,1,25,9,0
4,1,28,4,0
...,...,...,...,...
13725,1,20,3,0
13726,1,33,10,0
13727,3,15,0,1
13728,4,36,0,0


In [1323]:
# Per-column sanity summary: dtype, missing count, row count, distinct values.
cons = pd.DataFrame(
    {
        'colunas': enem_features.columns,
        'tipo': enem_features.dtypes,
        'missing': enem_features.isna().sum(),
        'size': len(enem_features),
        'unicos': enem_features.nunique(),
    }
)
# Fraction of missing values per column, rounded to two decimals.
cons['percentual'] = (cons['missing'] / cons['size']).round(2)

cons

Unnamed: 0,colunas,tipo,missing,size,unicos,percentual
TP_ST_CONCLUSAO,TP_ST_CONCLUSAO,int64,0,13730,4,0.0
NU_IDADE,NU_IDADE,int64,0,13730,55,0.0
TP_ANO_CONCLUIU,TP_ANO_CONCLUIU,int64,0,13730,11,0.0
IN_TREINEIRO,IN_TREINEIRO,int64,0,13730,2,0.0


In [1324]:
# Pairwise Pearson correlations between the selected columns, rendered as a
# colour-graded table.
correlation_matrix = enem_features.corr(method="pearson")
correlation_matrix.style.background_gradient(cmap="coolwarm")

Unnamed: 0,TP_ST_CONCLUSAO,NU_IDADE,TP_ANO_CONCLUIU,IN_TREINEIRO
TP_ST_CONCLUSAO,1.0,-0.253403,-0.592834,0.533983
NU_IDADE,-0.253403,1.0,0.681435,-0.295091
TP_ANO_CONCLUIU,-0.592834,0.681435,1.0,-0.25771
IN_TREINEIRO,0.533983,-0.295091,-0.25771,1.0


TP_ST_CONCLUSAO

1	Já concluí o Ensino Médio

2	Estou cursando e concluirei o Ensino Médio em 2016

3	Estou cursando e concluirei o Ensino Médio após 2016

4	Não concluí e não estou cursando o Ensino Médio



In [1325]:
def conclusao_ensino_binarization(dataframe, threshold=2):
    """Turn the ``TP_ST_CONCLUSAO`` code into a boolean ``CONCLUIU_ENSINO`` flag.

    According to the code table in this notebook, ``TP_ST_CONCLUSAO`` is:
    1 = already finished high school, 2 = finishing in 2016,
    3 = finishing after 2016, 4 = not finished and not enrolled.
    Codes less than or equal to ``threshold`` map to ``True``; larger codes
    map to ``False``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Single-column frame holding the ``TP_ST_CONCLUSAO`` codes.
    threshold : int, default 2
        Highest code that still counts as "concluded".

    Returns
    -------
    pandas.DataFrame
        One boolean column named ``CONCLUIU_ENSINO`` that keeps the index of
        ``dataframe``, so it can be safely aligned with
        ``pd.concat(..., axis=1)`` even when the index is not a default
        ``RangeIndex`` (e.g. after filtering rows).

    Raises
    ------
    ValueError
        If ``dataframe`` does not have exactly one column, or if it contains
        missing values (a missing code cannot be classified).
    """
    frame = pd.DataFrame(dataframe)
    if frame.shape[1] != 1:
        raise ValueError(
            "expected exactly one column, got {}".format(frame.shape[1])
        )

    status = frame.iloc[:, 0]
    if status.isna().any():
        raise ValueError("TP_ST_CONCLUSAO contains missing values")

    # Vectorised comparison; the input index is carried over to the result.
    return (status <= threshold).to_frame(name="CONCLUIU_ENSINO")

# Boolean CONCLUIU_ENSINO flag for the training rows.
concluiu = conclusao_ensino_binarization(
    enem_features.loc[:, ["TP_ST_CONCLUSAO"]]
)

In [1326]:
# Attach the derived CONCLUIU_ENSINO flag and drop the raw code it replaces.
# The flag lives in `concluiu` (computed in the previous cell); the name
# previously used here was never defined and failed on a fresh kernel.
enem_features = pd.concat([enem_features, concluiu], axis=1)
enem_features = enem_features.drop(columns="TP_ST_CONCLUSAO")
enem_features

Unnamed: 0,NU_IDADE,TP_ANO_CONCLUIU,IN_TREINEIRO,CONCLUIU_ENSINO
0,24,4,0,True
1,17,0,0,True
2,21,0,0,False
3,25,9,0,True
4,28,4,0,True
...,...,...,...,...
13725,20,3,0,True
13726,33,10,0,True
13727,15,0,1,False
13728,36,0,0,False


In [1327]:
# Pearson correlations of the current feature frame, shown as a colour-graded
# table.
correlation_matrix = enem_features.corr(method="pearson")
correlation_matrix.style.background_gradient(cmap="coolwarm")

Unnamed: 0,NU_IDADE,TP_ANO_CONCLUIU,IN_TREINEIRO,CONCLUIU_ENSINO
NU_IDADE,1.0,0.681435,-0.295091,0.148325
TP_ANO_CONCLUIU,0.681435,1.0,-0.25771,0.346935
IN_TREINEIRO,-0.295091,-0.25771,1.0,-0.742821
CONCLUIU_ENSINO,0.148325,0.346935,-0.742821,1.0


In [1328]:
# Fixed seed so the hold-out split (and therefore the reported score) is
# reproducible across kernel restarts.
RANDOM_STATE = 42

# Predictors are every column except the target.
features = enem_features.drop(columns="IN_TREINEIRO")
target = enem_features["IN_TREINEIRO"]

# 60/40 hold-out split. Features are cast to float so the mixed int/bool
# frame becomes a plain numeric array instead of an object array. The split is
# stratified so both parts keep the class ratio of the target.
x_train, x_test, y_train, y_test = train_test_split(
    features.astype(float).values,
    target.values,
    test_size=0.4,
    random_state=RANDOM_STATE,
    stratify=target.values,
)

logistic = LogisticRegression()
logistic.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [1329]:
# Score of the fitted model on the held-out split (mean accuracy for
# scikit-learn classifiers).
score = logistic.score(X=x_test, y=y_test)
print(score)

0.9963583394027676


In [1334]:
# Build the test feature frame with the same steps used for training.
# The target is not part of the feature list, so it is excluded up front
# instead of being created as an all-NaN column and dropped afterwards;
# a genuinely missing feature column now raises KeyError rather than being
# silently filled with NaN.
feature_columns = [name for name in columns if name != "IN_TREINEIRO"]

df_test_raw = pd.read_csv("test.csv")

# Separate name so the training-time `concluiu` is not overwritten.
concluiu_test = conclusao_ensino_binarization(df_test_raw[["TP_ST_CONCLUSAO"]])

# Resulting column order: remaining features, then CONCLUIU_ENSINO.
df_test = pd.concat(
    [df_test_raw[feature_columns].drop(columns="TP_ST_CONCLUSAO"), concluiu_test],
    axis=1,
)



In [1335]:
# Inspect the prepared test features before predicting.
df_test

Unnamed: 0,NU_IDADE,TP_ANO_CONCLUIU,CONCLUIU_ENSINO
0,19,3,True
1,24,4,True
2,16,0,False
3,17,0,True
4,19,1,True
...,...,...,...
4565,17,0,True
4566,20,2,True
4567,22,5,True
4568,19,0,True


In [1336]:

# Predict on the prepared test features. They are passed as a float array,
# matching how the model was fitted (on a bare array, without column names).
Y = logistic.predict(df_test.astype(float).values)

# Read only the identifier column, under its own name, so `df_test` keeps
# holding the feature frame and this cell can be re-run safely.
test_ids = pd.read_csv("test.csv", usecols=["NU_INSCRICAO"])
assert len(test_ids) == len(Y), "number of predictions does not match number of test rows"

test_result = test_ids.copy()
test_result['IN_TREINEIRO'] = Y
test_result

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
0,ba0cc30ba34e7a46764c09dfc38ed83d15828897,0
1,177f281c68fa032aedbd842a745da68490926cd2,0
2,6cf0d8b97597d7625cdedc7bdb6c0f052286c334,1
3,5c356d810fa57671402502cd0933e5601a2ebf1e,0
4,df47c07bd881c2db3f38c6048bf77c132ad0ceb3,0
...,...,...
4565,361b7fcd8867119550fe2af5aa729ffad89a7cf5,0
4566,d8a0e4c9e29494cc9bba2422bd79333931475ee1,0
4567,3f1c3388244df8d6521e983a809292d9f3bca643,0
4568,1778e9c4cef591beb6b986d191d15ed05de816b0,0


In [1337]:
# Persist the predictions without the row index.
test_result.to_csv(path_or_buf='answer.csv', index=False)