## Tarefas de classificação na base de dados do ENEM

##### Alunos:
- Gabriel Fonseca (2111066)
- Yasmim Santos (2116925)
- Alejandro Elias (2111189)
- Pedro Lucas (2111131)

Base de dados escolhida - Exame Nacional do Ensino Médio (Enem): https://basedosdados.org/dataset/3e9c8804-c31c-4f48-9a45-d67f1c21a859

### Importando as dependências:

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### Lendo e visualizando os dados:

In [6]:
# Load the treated ENEM CSV. The dtype of every column listed below is pinned
# explicitly; columns not listed here are left to pandas' inference.
df_enem = pd.read_csv(
    "../data/out/enem-dados-tratados.csv",
    dtype={
        "id_inscricao": np.int64,
        "ensino": int,
        # Scores for the four objective areas and the essay.
        "nota_ciencias_natureza": float,
        "nota_ciencias_humanas": float,
        "nota_linguagens_codigos": float,
        "nota_matematica": float,
        "nota_redacao": float,
        # Questionnaire answers are read as strings.
        "q_formacao_pai": str,
        "q_formacao_mae": str,
        "q_renda_familia": str,
    },
)

# Bare last expression so the notebook renders the frame as a table.
df_enem

Unnamed: 0,id_inscricao,ensino,nota_ciencias_natureza,nota_ciencias_humanas,nota_linguagens_codigos,nota_matematica,nota_redacao,q_formacao_pai,q_formacao_mae,q_renda_familia,ano
0,150001892848,3,366.8,436.9,374.2,331.4,380.0,B,A,C,2015
1,150002421428,1,512.0,636.9,552.0,549.2,760.0,A,A,C,2015
2,150004396764,1,470.8,519.3,465.2,350.8,580.0,B,A,B,2015
3,150001657786,1,492.6,641.2,553.2,649.5,840.0,A,A,A,2015
4,150005415838,1,473.3,533.4,443.3,447.4,400.0,A,A,A,2015
...,...,...,...,...,...,...,...,...,...,...,...
357268,210054596750,1,450.6,403.1,443.3,479.8,0.0,E,E,B,2022
357269,210056286560,1,416.5,427.3,484.6,376.2,0.0,D,D,A,2022
357270,210057495281,1,462.1,421.7,432.1,530.9,0.0,C,D,B,2022
357271,210056812211,1,519.1,570.4,537.3,388.7,0.0,D,H,B,2022


### Preparando os dados para utilização no modelo:

In [7]:
mm_scaler = MinMaxScaler()

# Keep only the rows whose natural-sciences and human-sciences scores are both
# non-zero. `.copy()` turns the selection into an independent frame, so the
# column assignments below never write into a slice of the loaded frame (which
# can make pandas emit SettingWithCopyWarning).
has_nonzero_scores = (df_enem["nota_ciencias_natureza"] != 0.0) & (
    df_enem["nota_ciencias_humanas"] != 0.0
)
df_enem = df_enem.loc[has_nonzero_scores].copy()

# Plain average of the four objective-test scores.
df_enem["nota_objetiva"] = (
    df_enem["nota_ciencias_natureza"]
    + df_enem["nota_ciencias_humanas"]
    + df_enem["nota_linguagens_codigos"]
    + df_enem["nota_matematica"]
) / 4

# Min-max scaled copy of the average. The scaler is fitted on every remaining
# row, and it expects a 2-D input, hence the double brackets.
df_enem["nota_objetiva_scl"] = mm_scaler.fit_transform(df_enem[["nota_objetiva"]])

# Family-income answer letter (A..Q) -> coarser income class used as the label.
map_grupo_renda = {
    "A": "nenhuma_renda",
    "B": "muito_baixa_renda",
    "C": "muito_baixa_renda",
    "D": "muito_baixa_renda",
    "E": "muito_baixa_renda",
    "F": "baixa_renda",
    "G": "baixa_renda",
    "H": "baixa_renda",
    "I": "baixa_renda",
    "J": "media_renda",
    "K": "media_renda",
    "L": "media_renda",
    "M": "media_renda",
    "N": "alta_renda",
    "O": "alta_renda",
    "P": "alta_renda",
    "Q": "alta_renda",
}

# Answers that are not a key of the mapping become NaN in the new column.
df_enem["q_renda_familia_classe"] = df_enem["q_renda_familia"].map(map_grupo_renda)
df_enem

Unnamed: 0,id_inscricao,ensino,nota_ciencias_natureza,nota_ciencias_humanas,nota_linguagens_codigos,nota_matematica,nota_redacao,q_formacao_pai,q_formacao_mae,q_renda_familia,ano,nota_objetiva,nota_objetiva_scl,q_renda_familia_classe
0,150001892848,3,366.8,436.9,374.2,331.4,380.0,B,A,C,2015,377.325,0.293561,muito_baixa_renda
1,150002421428,1,512.0,636.9,552.0,549.2,760.0,A,A,C,2015,562.525,0.579518,muito_baixa_renda
2,150004396764,1,470.8,519.3,465.2,350.8,580.0,B,A,B,2015,451.525,0.408129,muito_baixa_renda
3,150001657786,1,492.6,641.2,553.2,649.5,840.0,A,A,A,2015,584.125,0.612870,nenhuma_renda
4,150005415838,1,473.3,533.4,443.3,447.4,400.0,A,A,A,2015,474.350,0.443372,nenhuma_renda
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357268,210054596750,1,450.6,403.1,443.3,479.8,0.0,E,E,B,2022,444.200,0.396819,muito_baixa_renda
357269,210056286560,1,416.5,427.3,484.6,376.2,0.0,D,D,A,2022,426.150,0.368949,nenhuma_renda
357270,210057495281,1,462.1,421.7,432.1,530.9,0.0,C,D,B,2022,461.700,0.423840,muito_baixa_renda
357271,210056812211,1,519.1,570.4,537.3,388.7,0.0,D,H,B,2022,503.875,0.488960,muito_baixa_renda


In [8]:
# Columns of df_enem used as model inputs.
features = [
    "nota_objetiva_scl",
]

X = df_enem[features].to_numpy()
Y = df_enem["q_renda_familia_classe"].to_numpy()

# Bare annotations: they give editors the array types without the no-op
# `name = name` re-assignments.
X_train: np.ndarray
X_test: np.ndarray
Y_train: np.ndarray
Y_test: np.ndarray

# 80% of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=5487
)

# "nota_objetiva_scl" was scaled with the minimum and maximum of the whole
# frame, so held-out rows took part in building the training feature. Fitting a
# second scaler on the training rows only removes that leak: min-max scaling is
# affine, so the result equals scaling the raw average with train-only
# statistics. Test values may consequently fall slightly outside [0, 1].
split_scaler = MinMaxScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)

pd.DataFrame(X_train).head()

Unnamed: 0,0
0,0.403883
1,0.423686
2,0.445881
3,0.297614
4,0.454605


### Realizando a classificação:

In [9]:
# The saga solver, the random forest and gradient boosting all draw random
# numbers while fitting. A fixed seed makes the reported accuracies
# reproducible across kernel restarts (same value as the train/test split).
RANDOM_STATE = 5487

# Hyper-parameters of each estimator, kept as plain dicts.
logreg_params = {'C': 100, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
rfc_params = {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200}
gbc_params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}

logreg = LogisticRegression(**logreg_params, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(**rfc_params, random_state=RANDOM_STATE)
gbc = GradientBoostingClassifier(**gbc_params, random_state=RANDOM_STATE)

# Report label -> estimator; the labels are the column names of the summary.
models = {
    "Acurácia Reg. Logística": logreg,
    "Acurácia RFC": rfc,
    "Acurácia GBC": gbc,
}

# Fit every estimator on the training split and record its accuracy on the
# held-out split (`score` returns mean accuracy for classifiers).
accuracies = {}
for label, model in models.items():
    model.fit(X_train, Y_train)
    accuracies[label] = model.score(X_test, Y_test)

# Individual names kept so other cells can still refer to each result.
result_logreg = accuracies["Acurácia Reg. Logística"]
result_rfc = accuracies["Acurácia RFC"]
result_gbc = accuracies["Acurácia GBC"]

# One-row summary table with the accuracies formatted as percentages.
pd.DataFrame({label: [f"{accuracy:.2%}"] for label, accuracy in accuracies.items()})

Unnamed: 0,Acurácia Reg. Logística,Acurácia RFC,Acurácia GBC
0,75.00%,75.00%,75.00%
