# <font color='orange'>0. IMPORTAÇÕES</font>

## <font color='green'>0.1. Bibliotecas</font>

In [141]:
import pandas as pd
import numpy as np


from helper_functions import jupyter_settings

from sklearn.preprocessing  import RobustScaler
from sklearn.linear_model   import LogisticRegression
from sklearn.ensemble       import GradientBoostingClassifier

# Apply the project's notebook settings. The helper is imported from
# helper_functions, so what exactly it configures is not visible in this file.
jupyter_settings()

## <font color='green'>0.2. Dados</font>

In [142]:
# Load the three raw CSV inputs (paths are relative to the notebook's folder).
# low_memory=False avoids chunked parsing and the mixed-dtype inference that
# can come with it.
df = pd.read_csv('../data/raw/Dataset_model.csv', low_memory=False)
# Question -> subject table; grouped per question in the next cell.
df_add = pd.read_csv('../data/raw/subjects_questions.csv', low_memory=False)
# The submission file is ';'-separated, unlike the two files above.
df_submit = pd.read_csv('../data/raw/Submit.csv', sep=';')

In [143]:
# Collapse the subject table to one row per question, gathering every
# subject_id of that question into a Python list.
df_add = (
    df_add
    .groupby('novo_question_id')['subject_id']
    .agg(list)
    .reset_index()
)

In [144]:
# Left-join the per-question subject lists onto both frames, so every
# original row is kept even when a question has no entry in df_add.
df = pd.merge(df, df_add, how='left', on='novo_question_id')
df_submit = pd.merge(df_submit, df_add, how='left', on='novo_question_id')

## <font color='green'>0.3. Definindo valores constantes no estudo</font>

In [145]:
# Name of the label column that the encoders and the model are fitted against.
target_col = 'acertou'
# Fixed seed; presumably meant to make stochastic steps reproducible.
random_seed = 42

# <font color='orange'>1. DESCRIÇÃO DOS DADOS</font>

In [146]:
# Preview the first rows of the training frame after the subject merge.
df.head()

Unnamed: 0,novo_user_id,city,country,device,device_type,os,platform,region,gp:carrers,gp:college type,gp:degree course,gp:previous experience,gp:school type,gp:segment,gp:source_project,acertou,created_at,row,commented_by_professor,difficulty,discipline_id,examining_board_id,institute_id,knowledge_area_id,modality_id,nullified,outdated,product_id,publication_year,right_answer,scholarity_id,novo_question_id,subject_id
0,1,Rio de Janeiro,Brazil,Apple iPhone,Apple iPhone 11 Pro Max,ios 15.4.1,iOS,Rio de Janeiro,"Tribunal, Administrativa",,,beginner,,Concurso Público,ils,0,2022-03-20 10:06:06,1,0.0,4.0,238.0,73.0,4126.0,13.0,1.0,0.0,0.0,1.0,2022.0,A,2.0,489400,[nan]
1,1,Rio de Janeiro,Brazil,Apple iPhone,Apple iPhone 11 Pro Max,ios 15.4.1,iOS,Rio de Janeiro,"Tribunal, Administrativa",,,beginner,,Concurso Público,ils,0,2022-03-20 10:08:25,2,0.0,4.0,238.0,73.0,4126.0,13.0,1.0,0.0,0.0,1.0,2022.0,C,2.0,489399,[nan]
2,1,Rio de Janeiro,Brazil,Apple iPhone,Apple iPhone 11 Pro Max,ios 15.4.1,iOS,Rio de Janeiro,"Tribunal, Administrativa",,,beginner,,Concurso Público,ils,1,2022-03-20 10:09:49,3,0.0,3.0,238.0,73.0,4126.0,13.0,1.0,0.0,0.0,1.0,2022.0,D,2.0,489398,[nan]
3,1,Rio de Janeiro,Brazil,Apple iPhone,Apple iPhone 11 Pro Max,ios 15.4.1,iOS,Rio de Janeiro,"Tribunal, Administrativa",,,beginner,,Concurso Público,ils,0,2022-03-20 10:10:27,4,0.0,4.0,238.0,73.0,4126.0,13.0,1.0,0.0,0.0,1.0,2022.0,A,2.0,489397,[nan]
4,1,Rio de Janeiro,Brazil,Apple iPhone,Apple iPhone 11 Pro Max,ios 15.4.1,iOS,Rio de Janeiro,"Tribunal, Administrativa",,,beginner,,Concurso Público,ils,1,2022-03-20 10:12:12,5,0.0,3.0,238.0,73.0,4126.0,13.0,1.0,0.0,0.0,1.0,2022.0,E,2.0,489396,[nan]


## <font color='green'>1.1. Dimensão dos Dados</font>

In [147]:
# Report the number of rows and columns of the training frame.
print(f'O dataset possui {df.shape[0]} linhas')
print(f'O dataset possui {df.shape[1]} colunas')

O dataset possui 2000000 linhas
O dataset possui 33 colunas


## <font color='green'>1.2. Tipos das Variáveis</font>

In [148]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

novo_user_id                int64
city                       object
country                    object
device                     object
device_type                object
os                         object
platform                   object
region                     object
gp:carrers                 object
gp:college type            object
gp:degree course           object
gp:previous experience     object
gp:school type             object
gp:segment                 object
gp:source_project          object
acertou                     int64
created_at                 object
row                         int64
commented_by_professor    float64
difficulty                float64
discipline_id             float64
examining_board_id        float64
institute_id              float64
knowledge_area_id         float64
modality_id               float64
nullified                 float64
outdated                  float64
product_id                float64
publication_year          float64
right_answer  

## <font color='green'>1.3. Verificando NAs</font>

In [149]:
# Count missing values per column.
df.isna().sum()

novo_user_id                    0
city                       142400
country                     13200
device                      71600
device_type                163200
os                          13200
platform                    13200
region                      13700
gp:carrers                 116900
gp:college type           1997500
gp:degree course          1997500
gp:previous experience    1052200
gp:school type            1997500
gp:segment                  35200
gp:source_project           38000
acertou                         0
created_at                      0
row                             0
commented_by_professor        178
difficulty                   2629
discipline_id                 178
examining_board_id            182
institute_id                  182
knowledge_area_id             178
modality_id                   178
nullified                     178
outdated                      178
product_id                    178
publication_year              187
right_answer  

## <font color='green'>1.4. Preenchendo NAs</font>

In [150]:
# Drop columns with many NAs (they will be used in later cycles).
remove_cols = [
    'city', 'country', 'device', 'device_type', 'os', 'platform',
    'region', 'gp:carrers', 'gp:college type', 'gp:degree course',
    'gp:previous experience', 'gp:school type', 'gp:segment',
    'gp:source_project',
]

# Apply the same removal to the training and the submission frame so both
# keep an identical set of columns.
df = df.drop(columns=remove_cols)
df_submit = df_submit.drop(columns=remove_cols)

In [151]:
# Re-check missing values per column after the column removal.
df.isna().sum()

novo_user_id                 0
acertou                      0
created_at                   0
row                          0
commented_by_professor     178
difficulty                2629
discipline_id              178
examining_board_id         182
institute_id               182
knowledge_area_id          178
modality_id                178
nullified                  178
outdated                   178
product_id                 178
publication_year           187
right_answer               178
scholarity_id              178
novo_question_id             0
subject_id                   0
dtype: int64

In [152]:
# Fill missing values with 0 in this first cycle.
# Note: 'right_answer' holds letters, so its gaps become the integer 0 and the
# column ends up with mixed types.
cols_fill_na = [    'commented_by_professor', 'difficulty', 'discipline_id', 'examining_board_id',
                    'institute_id', 'knowledge_area_id', 'modality_id', 'nullified', 'outdated',
                    'product_id', 'publication_year', 'right_answer', 'scholarity_id'    ]

# Assign the filled columns back to the frame. Calling
# `frame[col].fillna(0, inplace=True)` mutates an intermediate Series, which
# is deprecated in pandas 2.x and leaves the frame untouched under
# Copy-on-Write.
df[cols_fill_na] = df[cols_fill_na].fillna(0)
df_submit[cols_fill_na] = df_submit[cols_fill_na].fillna(0)

## <font color='green'>1.5. Alterando os tipos de Variáveis</font>

In [153]:
# Columns cast to integers and then to object.
cols_to_int = [ 'commented_by_professor', 'difficulty', 'discipline_id', 'examining_board_id',
                'institute_id', 'knowledge_area_id', 'modality_id', 'nullified', 'outdated',
                'product_id', 'publication_year', 'scholarity_id', 'row'    ]

# Cast to int64 first (removes the float representation such as 238.0), then
# to object -- presumably so the target encoders treat the values as
# categories; confirm against the encoder's column-selection rules.
# `np.int64` is passed as the dtype itself, not as a scalar instance
# (`np.int64()`), which only worked through the scalar's `.dtype` attribute.
df[cols_to_int] = df[cols_to_int].astype(np.int64).astype(object)
df_submit[cols_to_int] = df_submit[cols_to_int].astype(np.int64).astype(object)

# <font color='orange'>2. FEATURE ENGINEERING</font>

# <font color='orange'>3. FILTRAGEM DOS DADOS</font>

# <font color='orange'>4. ANÁLISE EXPLORATÓRIA DOS DADOS</font>

## <font color='green'>4.1. Variável Resposta</font>

In [154]:
# Class balance of the target, shown as proportions rather than counts.
df[target_col].value_counts(normalize=True)

1    0.654281
0    0.345720
Name: acertou, dtype: float64

# <font color='orange'>5. PREPARAÇÃO DOS DADOS</font>

In [155]:
from category_encoders.target_encoder import TargetEncoder

In [156]:
# Features fed to the encoders and the model.
# Order matters: later cells slice this list by position ([0:7] and [7:-1]),
# and the last entry is the one left out of target encoding.
cols_selected = [
    'row',
    'commented_by_professor',
    'difficulty',
    'discipline_id',
    'examining_board_id',
    'institute_id',
    'knowledge_area_id',
    'modality_id',
    'nullified',
    'outdated',
    'product_id',
    'publication_year',
    'scholarity_id',
    'novo_question_id',
]



In [157]:
# ---
te1 = TargetEncoder()
te1.fit(df[cols_selected[0:7]], df[target_col])

# ---
te2 = TargetEncoder()
te2.fit(df[cols_selected[7:-1]], df[target_col])



In [158]:
# ---
df[cols_selected[0:7]] = te1.transform(df[cols_selected[0:7]])
df[cols_selected[7:-1]] = te2.transform(df[cols_selected[7:-1]])

# ---
df_submit[cols_selected[0:7]] = te1.transform(df_submit[cols_selected[0:7]])
df_submit[cols_selected[7:-1]] = te2.transform(df_submit[cols_selected[7:-1]])

# <font color='orange'>6. SELEÇÃO DE VARIÁVEIS</font>

# <font color='orange'>7. TREINAMENTO DE MODELOS</font>

In [126]:
from sklearn.model_selection import cross_validate

In [127]:
# Gradient-boosted trees with library defaults. The estimator's random state
# is pinned to the notebook seed so repeated fits are reproducible.
model = GradientBoostingClassifier(random_state=random_seed)

In [128]:
# 3-fold cross-validated F1 for the model. The call must run because the next
# cell displays `cv_results`; the recorded fit times show it takes several
# minutes on the full frame.
# NOTE(review): the target encoders were fitted on all of df beforehand, so
# these scores are likely optimistic.
cv_results = cross_validate(model, df[cols_selected], df[target_col], cv=3, scoring='f1')

In [133]:
# Show the cross-validation result dict (fit times, score times and the
# per-fold test scores).
cv_results

{'fit_time': array([227.38399839, 223.11022592, 217.23665261]),
 'score_time': array([1.34260583, 1.37035227, 1.37064481]),
 'test_score': array([0.7816661 , 0.77564845, 0.76989178])}

In [134]:
# Fit the model on the entire training frame; no hold-out split is made here.
model.fit(df[cols_selected], df[target_col])

In [159]:
# Preview the submission frame to check the encoded feature values.
df_submit.head()

Unnamed: 0,novo_user_id,acertou,created_at,row,commented_by_professor,difficulty,discipline_id,examining_board_id,institute_id,knowledge_area_id,modality_id,nullified,outdated,product_id,publication_year,right_answer,scholarity_id,novo_question_id,subject_id
0,1,,23/03/2022 21:46,0.654281,0.656691,0.650133,0.609048,0.596064,0.595023,0.667238,0.629218,0.654642,0.655232,0.657642,0.631144,B,0.64085,484766,"[1908.0, 6564.0]"
1,2,,25/03/2016 22:23,0.654281,0.656691,0.835415,0.640998,0.676633,0.703349,0.639369,0.629218,0.654642,0.655232,0.657642,0.660656,C,0.64085,159545,"[20423.0, 20867.0]"
2,3,,02/12/2017 23:21,0.654281,0.656691,0.320012,0.624068,0.624384,0.636194,0.645611,0.629218,0.654642,0.655232,0.657642,0.654476,C,0.64085,233145,"[13881.0, 13902.0, 13933.0]"
3,4,,29/09/2019 17:04,0.654281,0.652337,0.835415,0.680741,0.697314,0.709656,0.657894,0.730685,0.654642,0.655232,0.657642,0.657146,C,0.64085,108082,"[49.0, 8608.0]"
4,5,,11/09/2017 18:50,0.654281,0.656691,0.474593,0.668954,0.630929,0.640983,0.667238,0.629218,0.654642,0.655232,0.657642,0.663414,A,0.64085,151695,"[305.0, 10772.0]"


In [160]:
# Predict a class label for every submission row from the selected features.
y_hat = model.predict(df_submit[cols_selected])

In [161]:
# Store the predictions next to the rows they belong to.
df_submit['pred'] = y_hat

In [164]:
# Write only the 'pred' column (header kept, index omitted) as the submission file.
df_submit['pred'].to_csv('../data/submissions/submission_01_20221007.csv', index=False)

In [None]:
import pickle

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, even if pickling raises.
with open('../models/model_01.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# <font color='orange'>8. AJUSTE DOS HIPERPARÂMETROS</font>