In [1]:
# Pacote para exploração e análise de dados
import pandas as pd

# Pacote com métodos numéricos e representações matriciais
import numpy as np

# Pacotes do scikit-learn para pré-processamento de dados
# "SimpleImputer" é uma transformação para preencher valores faltantes em conjuntos de dados
from sklearn.impute import SimpleImputer

# importacao do tensorflow (tem que ser o 2.1)
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# regressor linear
from sklearn.linear_model import LinearRegression

## Lendo os dados

In [2]:
#<< INSIRA O DATASET COMO UM PANDAS DATAFRAME NESTA CÉLULA! >>>
df_data_1 = pd.read_csv("../Data/dataset_desafio_2.csv")
df_data_1.head()

Unnamed: 0,MATRICULA,NOME,REPROVACOES_DE,REPROVACOES_EM,REPROVACOES_MF,REPROVACOES_GO,NOTA_DE,NOTA_EM,NOTA_MF,NOTA_GO,INGLES,H_AULA_PRES,TAREFAS_ONLINE,FALTAS,PERFIL
0,502375,Márcia Illiglener,0,0,0,0,6.2,5.8,4.6,5.9,0.0,2,4,3,EXATAS
1,397093,Jason Jytereoman Izoimum,0,0,0,0,6.0,6.2,5.2,4.5,1.0,2,4,3,EXATAS
2,915288,Bartolomeu Inácio da Gama,0,0,0,0,7.3,6.7,7.1,7.2,0.0,5,0,3,HUMANAS
3,192652,Fernanda Guedes,1,3,1,1,0.0,0.0,0.0,0.0,1.0,4,4,4,DIFICULDADE
4,949491,Alessandre Borba Gomes,1,3,1,1,0.0,0.0,0.0,0.0,1.0,5,2,5,DIFICULDADE


## Removendo colunas não usadas

In [3]:
df_data_2 = df_data_1.drop(columns = ['NOME', 'MATRICULA'])
df_data_2.head()

Unnamed: 0,REPROVACOES_DE,REPROVACOES_EM,REPROVACOES_MF,REPROVACOES_GO,NOTA_DE,NOTA_EM,NOTA_MF,NOTA_GO,INGLES,H_AULA_PRES,TAREFAS_ONLINE,FALTAS,PERFIL
0,0,0,0,0,6.2,5.8,4.6,5.9,0.0,2,4,3,EXATAS
1,0,0,0,0,6.0,6.2,5.2,4.5,1.0,2,4,3,EXATAS
2,0,0,0,0,7.3,6.7,7.1,7.2,0.0,5,0,3,HUMANAS
3,1,3,1,1,0.0,0.0,0.0,0.0,1.0,4,4,4,DIFICULDADE
4,1,3,1,1,0.0,0.0,0.0,0.0,1.0,5,2,5,DIFICULDADE


## Completando dados faltantes em NOTA_GO com valores da regressão dos demais dados

In [4]:
features = ['REPROVACOES_DE', 'REPROVACOES_EM', "REPROVACOES_MF", "REPROVACOES_GO",
    "NOTA_DE", "NOTA_EM", "NOTA_MF", "H_AULA_PRES", "TAREFAS_ONLINE", "FALTAS"]

# Definição da variável-alvo
target = ["PERFIL"]

df_data_3 = df_data_2.query('NOTA_GO == NOTA_GO')

Xrl = df_data_3[features]

yrl = df_data_3["NOTA_GO"]

reg = LinearRegression()
reg.fit(Xrl, yrl)

df_data_2.loc[df_data_2["NOTA_GO"].isnull(), 'NOTA_GO'] = reg.predict(df_data_2[features])[df_data_2["NOTA_GO"].isnull()]

df_data_2.head()

Unnamed: 0,REPROVACOES_DE,REPROVACOES_EM,REPROVACOES_MF,REPROVACOES_GO,NOTA_DE,NOTA_EM,NOTA_MF,NOTA_GO,INGLES,H_AULA_PRES,TAREFAS_ONLINE,FALTAS,PERFIL
0,0,0,0,0,6.2,5.8,4.6,5.9,0.0,2,4,3,EXATAS
1,0,0,0,0,6.0,6.2,5.2,4.5,1.0,2,4,3,EXATAS
2,0,0,0,0,7.3,6.7,7.1,7.2,0.0,5,0,3,HUMANAS
3,1,3,1,1,0.0,0.0,0.0,0.0,1.0,4,4,4,DIFICULDADE
4,1,3,1,1,0.0,0.0,0.0,0.0,1.0,5,2,5,DIFICULDADE


In [5]:
df_data_2[df_data_2['NOTA_GO'].isnull()].count()['NOTA_GO']

0

## Completando valores faltantes com 0

In [6]:
# cria o preprocessamento
si = SimpleImputer(
    missing_values=np.nan,  # os valores faltantes são do tipo ``np.nan`` (padrão Pandas)
    strategy='constant',  # a estratégia escolhida é a alteração do valor faltante por uma constante
    fill_value=0,  # a constante que será usada para preenchimento dos valores faltantes é um int64=0.
    verbose=0,
    copy=True
)

# Aplicamos o SimpleImputer ``si`` ao conjunto de dados df_data_2 (resultado da primeira transformação)
si.fit(X=df_data_2)

# Reconstrução de um novo DataFrame Pandas com o conjunto imputado (df_data_3)
df_data_3 = pd.DataFrame.from_records(
    data=si.transform(
        X=df_data_2
    ),  # o resultado SimpleImputer.transform(<<pandas dataframe>>) é lista de listas
    columns=df_data_2.columns  # as colunas originais devem ser conservadas nessa transformação
)

df_data_3['PERFIL'] = df_data_3['PERFIL'].astype('category')

df_data_3['COD_PERFIL'] = df_data_3['PERFIL'].cat.codes

categorias = dict( enumerate(df_data_3['PERFIL'].cat.categories ) )

print(categorias)
df_data_3 = df_data_3.drop(columns = ['PERFIL'])
df_data_3.head(20)

{0: 'DIFICULDADE', 1: 'EXATAS', 2: 'EXCELENTE', 3: 'HUMANAS', 4: 'MUITO_BOM'}


Unnamed: 0,REPROVACOES_DE,REPROVACOES_EM,REPROVACOES_MF,REPROVACOES_GO,NOTA_DE,NOTA_EM,NOTA_MF,NOTA_GO,INGLES,H_AULA_PRES,TAREFAS_ONLINE,FALTAS,COD_PERFIL
0,0,0,0,0,6.2,5.8,4.6,5.9,0.0,2,4,3,1
1,0,0,0,0,6.0,6.2,5.2,4.5,1.0,2,4,3,1
2,0,0,0,0,7.3,6.7,7.1,7.2,0.0,5,0,3,3
3,1,3,1,1,0.0,0.0,0.0,0.0,1.0,4,4,4,0
4,1,3,1,1,0.0,0.0,0.0,0.0,1.0,5,2,5,0
5,0,0,0,0,7.3,7.4,7.6,6.5,1.0,5,3,5,3
6,0,0,0,0,5.8,6.0,7.3,5.1,1.0,5,2,6,0
7,0,0,0,0,4.9,5.0,5.9,4.6,0.0,2,2,6,0
8,0,0,0,0,4.4,4.8,4.7,4.6,1.0,3,4,4,0
9,0,0,0,0,6.4,5.4,5.0,5.5,1.0,3,5,3,1


   ## Separando o treino e teste

In [7]:
train, test = train_test_split(df_data_3, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

## Criando o dataset do tf

In [8]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(df, target, shuffle=True, batch_size=32):
    dataframe = df.copy()
    labels = dataframe.pop(target)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    if batch_size is not None:
        ds = ds.batch(batch_size)
    return ds

In [9]:
batch_size = 32
train_ds = df_to_dataset(train, target = 'COD_PERFIL', batch_size=batch_size)
val_ds = df_to_dataset(val, target = 'COD_PERFIL', shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, target = 'COD_PERFIL', shuffle=False, batch_size=batch_size)

In [10]:
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of NOTA_EM:', feature_batch['NOTA_EM'])
    print('A batch of targets:', label_batch )

Every feature: ['REPROVACOES_DE', 'REPROVACOES_EM', 'REPROVACOES_MF', 'REPROVACOES_GO', 'NOTA_DE', 'NOTA_EM', 'NOTA_MF', 'NOTA_GO', 'INGLES', 'H_AULA_PRES', 'TAREFAS_ONLINE', 'FALTAS']
A batch of NOTA_EM: tf.Tensor(
[5.4 6.4 4.4 0.  5.4 5.8 5.8 5.2 5.1 5.4 4.8 6.  5.  7.5 7.4 7.  6.1 7.5
 7.1 7.4 0.  0.  5.9 5.  8.  0.  5.7 7.2 5.5 6.9 6.6 0. ], shape=(32,), dtype=float64)
A batch of targets: tf.Tensor([1 0 3 0 1 0 0 1 0 3 0 3 1 3 4 4 1 3 1 3 0 0 1 0 2 0 1 4 0 3 1 0], shape=(32,), dtype=int32)


## Agora é necessário configurar as colunas que serão usadas

### Colunas numéricas

In [11]:
feature_columns = []

# numeric cols
for header in ['REPROVACOES_DE', 'REPROVACOES_EM', "REPROVACOES_MF", "REPROVACOES_GO",
    "NOTA_DE", "NOTA_EM", "NOTA_MF", "H_AULA_PRES", "TAREFAS_ONLINE", "FALTAS"]:
      feature_columns.append(feature_column.numeric_column(header))

## Criando o modelo

In [12]:
classificacoes = len(categorias)

In [17]:
# camada de entrada
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# criando o modelo 
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.2),
  layers.Dense(classificacoes, activation = tf.nn.softmax)
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [18]:
model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

Train for 400 steps, validate for 100 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2979a7f6b38>

In [19]:
perda_teste, acuracia_teste = model.evaluate(test_ds)
print('Perda do teste: ', perda_teste)
print('Acuracia do teste: ', acuracia_teste)

Perda do teste:  0.5185059700012207
Acuracia do teste:  0.79725
