## PROCESSANDO AS QUESTÕES
#### Utilizando GloVe 100 Dim - Português
#### Dataset: Questão sobre cientificidade da Psicologia - PUC GO - Prof. Weber Martins, PhD


In [1]:
from __future__ import print_function 

import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

import ml_metrics as metrics

from keras.models import load_model
from keras.models import model_from_json


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### definir variáveis globais

In [2]:
# Global configuration for the notebook.
# BASE_DIR is only used to build GLOVE_DIR; FNAME is opened as given (relative to the working directory).
BASE_DIR = ''
# Directory that holds the pre-trained GloVe file 'glove_s100.txt'.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove_s100')
# CSV with the answers ('resposta' column) and their grades ('nota' column).
FNAME = 'preprocessado_sem_stopwords.csv'
# Every tokenized answer is padded/truncated to this many tokens; also the model's input length.
MAX_SEQUENCE_LENGTH = 100
# Upper bound on the vocabulary size kept by the Tokenizer and on the embedding matrix rows.
MAX_NUM_WORDS = 10000
# Width of each embedding vector (second dimension of the embedding matrix).
EMBEDDING_DIM = 100
# Fraction of the samples held out as the validation set.
VALIDATION_SPLIT = 0.3

#### indexando word vectors

In [3]:
# Build `embeddings_index`: a dict mapping each word in the GloVe file to its
# float32 coefficient vector.
print('Indexing word vectors.')
embeddings_index = {}
skipped_lines = 0
with open(os.path.join(GLOVE_DIR, 'glove_s100.txt'), encoding="utf8") as f:
    # The first line of the file is a header, not a vector: discard it once
    # instead of testing a flag on every iteration.
    next(f, None)
    for line in f:
        # Strip the trailing newline and split from the right, so that a token
        # which itself contains spaces cannot spill into the coefficient columns.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Blank or truncated line: it cannot yield an EMBEDDING_DIM-wide vector.
            skipped_lines += 1
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

if skipped_lines:
    print('Skipped %s malformed lines.' % skipped_lines)
print('Found %s word vectors.' % len(embeddings_index))
        

Indexing word vectors.
Found 934963 word vectors.


In [4]:
# Number of words that have a pre-trained vector (same figure printed by the previous cell).
len(embeddings_index)

934963

In [5]:
print('Processing text dataset')

# Read the answers file and pull out the two columns used downstream:
# 'resposta' -> the answer text, 'nota' -> the grade given to it.
df = pd.read_csv(FNAME, encoding="iso-8859-1")
texts = df['resposta'].tolist()
labels = df['nota'].tolist()

Processing text dataset


In [6]:
# Vectorize the text samples into a 2D integer tensor and one-hot encode the grades.
# The Tokenizer is created with num_words=MAX_NUM_WORDS so that only the most
# frequent words of the corpus are kept.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# Fit the tokenizer vocabulary on the answers.
tokenizer.fit_on_texts(texts)
# Each word of each entry in `texts` becomes its integer token.
sequences = tokenizer.texts_to_sequences(texts)

# word_index: dictionary of words and their uniquely assigned integers.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad sequences to the same length: zeros are added when a sequence is shorter
# than the maximum length, and it is truncated when longer.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# to_categorical converts a class vector (integers) to a binary class matrix;
# np.asarray converts the input list to an array.
# NOTE: `labels` is rebound here from the list of grades to the one-hot matrix,
# so this cell should be run only once after the dataset-loading cell.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 1338 unique tokens.
Shape of data tensor: (242, 100)
Shape of label tensor: (242, 11)


In [7]:
# Shuffle the samples and compute how many of them go to the validation set.
# A fixed seed makes the shuffle (and therefore the train/validation split)
# identical on every fresh run, so results of different runs are comparable.
SPLIT_SEED = 42
rng = np.random.RandomState(SPLIT_SEED)
indices = rng.permutation(data.shape[0])  # shuffled row indices 0..n-1
# NOTE: `data` and `labels` are rebound to their shuffled versions; run this
# cell once after the tokenization cell, otherwise the rows are shuffled again.
data = data[indices]
labels = labels[indices]
# Number of validation samples, truncated to an integer.
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [8]:
# Split the (already shuffled) samples: the last `num_validation_samples` rows
# form the validation set, everything before them is the training set.
# An explicit split index is used instead of negative slicing because
# `data[:-0]` is empty and `data[-0:]` is the whole array, which would swap the
# two sets when `num_validation_samples` is 0.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [9]:
print('Preparing embedding matrix.')

# One row per token id, capped at MAX_NUM_WORDS rows; row 0 is never assigned
# by the loop below because it only visits ids present in `word_index`.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, token_id in word_index.items():
    if token_id < MAX_NUM_WORDS:
        pretrained = embeddings_index.get(token)
        # Tokens without a pre-trained vector keep their all-zeros row.
        if pretrained is not None:
            embedding_matrix[token_id] = pretrained

# Wrap the pre-trained vectors in an Embedding layer; trainable=False keeps
# them fixed during training.
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


Preparing embedding matrix.


In [10]:
print('Training model.')

# Build and compile a 1D convnet with global max pooling on top of the frozen
# embedding layer. Training itself happens in the next cell.
# The width of the softmax output is taken from the one-hot labels instead of
# being hard-coded, so it always matches what to_categorical produced.
num_classes = y_train.shape[1]

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# Three Conv1D stages (256 filters, kernel size 4); the first two are followed
# by max pooling with pool size 4, the last by global max pooling.
x = Conv1D(256, 4, activation='relu')(embedded_sequences)
x = MaxPooling1D(4)(x)
x = Conv1D(256, 4, activation='relu')(x)
x = MaxPooling1D(4)(x)
x = Conv1D(256, 4, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
#x = Dense(256, activation='relu')(x)
preds = Dense(num_classes, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])


Training model.


In [11]:
# Train the model, validating on the held-out set after every epoch.
# The returned History object is kept in `history` so that the per-epoch
# metrics can be inspected later instead of being discarded.
BATCH_SIZE = 500  # larger than the 170 training samples of the recorded run, i.e. one batch per epoch there
EPOCHS = 120
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(x_val, y_val))

Train on 170 samples, validate on 72 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120


Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120


<keras.callbacks.History at 0x1997ebd5ef0>

# Testando o modelo

In [12]:
# Run the trained model on the validation inputs; the next cells take the
# arg-max of each row of `predicted` to obtain the predicted class.
predicted = model.predict(x_val)

In [13]:
# Turn each row of predictions into a one-hot row: 1 at the position of the
# largest value, 0 everywhere else.
aux = np.array(predicted)
winning_class = aux.argmax(axis=1)
row_ids = np.arange(len(aux))
np_predicted = np.zeros_like(aux)
np_predicted[row_ids, winning_class] = 1

#### visualizando as notas esperadas e previstas pelo modelo

In [14]:
# Print, side by side, the expected one-hot grade and the one-hot grade chosen
# by the model for every validation sample.
print("             expected                           predicted")
for expected_row, predicted_row in zip(y_val, np_predicted):
    print(str(expected_row), predicted_row)

             expected                           predicted
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0

### MAPEAMENTO REVERSO DAS RESPOSTAS PARA INSPEÇÃO DAS NOTAS DADAS PELA REDE

In [15]:
# Invert the tokenizer vocabulary: integer token -> word.
reverse_word_map = {token_id: token for token, token_id in tokenizer.word_index.items()}

In [16]:
# Decode every validation sequence back into its list of words. Token ids that
# are not in `reverse_word_map` (such as the 0 used for padding) are dropped.
Resposta = [
    [reverse_word_map[token_id] for token_id in sequence if token_id in reverse_word_map]
    for sequence in x_val
]

# Class indices (position of the 1 in each one-hot row) for the expected and
# the predicted grades.
Esperado = np.argmax(y_val, axis=1)
Obtido = np.argmax(np_predicted, axis=1)

In [2]:
# Show one decoded validation answer with its expected and predicted grade.
# Needs Resposta, Esperado and Obtido from the reverse-mapping cells, so those
# cells must have been executed in the current kernel first.
sample_idx = 15
print(Resposta[sample_idx])
print("Nota Esperada: ", Esperado[sample_idx])
print("Nota Obtida: ", Obtido[sample_idx])

NameError: name 'Resposta' is not defined

In [18]:
# Show another decoded validation answer with its expected and predicted grade.
sample_idx = 6
print(Resposta[sample_idx])
print("Nota Esperada: ", Esperado[sample_idx])
print("Nota Obtida: ", Obtido[sample_idx])

['cientificidade', 'psicologia', 'decorrente', 'multideterminação', 'método', 'forma', 'validação', 'teoria']
Nota Esperada:  5
Nota Obtida:  0


### QUADRATIC WEIGHTED KAPPA METRICS

In [19]:
# Compute the quadratic weighted kappa between predicted and expected grades.
# The class vector is stored under its own name: rebinding `predicted` here
# would replace the model's output matrix, and re-running the earlier cells
# that call `.argmax(1)` on it or iterate over its rows would then fail.
predicted_classes = np.argmax(np_predicted, axis=1)
expected = np.argmax(y_val, axis=1)
print("predicted: ", predicted_classes)
print("expected: ", expected)
qwk = metrics.quadratic_weighted_kappa(predicted_classes, expected)
print("Quadratic Weighted Kappa:", qwk)

predicted:  [6 6 6 5 6 6 0 5 4 3 3 5 5 3 3 5 2 5 7 3 5 5 5 2 6 3 5 6 4 3 4 6 5 1 4 8 1
 1 3 4 6 4 5 6 5 5 5 1 5 0 6 4 5 0 2 3 3 6 6 4 5 2 6 6 5 6 5 6 4 5 7 9]
expected:  [7 4 6 3 7 6 5 3 4 2 3 4 3 3 2 3 2 6 5 5 5 8 6 5 3 2 5 7 6 2 4 4 4 1 4 3 4
 0 8 3 6 6 2 6 4 6 4 3 3 8 3 5 2 6 1 4 4 2 4 6 1 6 6 4 5 1 3 8 6 6 6 5]
Quadratic Weighted Kappa: 0.16246684350132623


# Registro de performance

#### run 1, qwk: 0.10436893203883502, val_acc = 0.1250  , epochs = 30, vsplit = 0.1
#### run 2, qwk: 0.2349468713105075, val_acc = 0.2292  , epochs = 30, vsplit = 0.2
#### run 3, qwk: 0.35749318801089935, val_acc = 0.1250  , epochs = 30, vsplit = 0.3
#### run 4, qwk: 0.5113365155131265, val_acc = 0.2361  , epochs = 50, vsplit = 0.3
####  <font color='red'> run 5, qwk: 0.5361608651571477, val_acc = 0.2500  , epochs = 100, vsplit = 0.3 </font>
#### run 6, qwk: 0.46097530711006296, val_acc = 0.1667  , epochs = 120, vsplit = 0.3


## Sobre o dataset:

#### tamanho de cada resposta

In [20]:
# Word count of every answer; answers made of a single word are printed as
# they are found, then the full list of counts is printed.
tamanho = []
for answer in texts:
    word_count = len(answer.split())
    tamanho.append(word_count)
    if word_count == 1:
        print(answer)
print(tamanho)

resposta
respondida
[20, 12, 17, 17, 10, 27, 24, 6, 21, 15, 20, 15, 15, 14, 36, 12, 25, 34, 25, 13, 17, 4, 11, 31, 27, 17, 15, 23, 18, 21, 10, 10, 37, 15, 25, 13, 34, 10, 21, 29, 22, 29, 22, 23, 11, 26, 18, 13, 34, 17, 33, 17, 28, 18, 22, 25, 19, 28, 22, 21, 13, 20, 40, 32, 16, 14, 31, 26, 38, 37, 43, 19, 19, 8, 21, 38, 27, 28, 8, 26, 18, 46, 23, 9, 33, 35, 13, 31, 20, 17, 14, 11, 30, 12, 25, 8, 29, 17, 26, 32, 21, 22, 27, 26, 10, 5, 23, 20, 12, 26, 16, 15, 24, 15, 12, 21, 34, 9, 11, 16, 18, 21, 17, 29, 15, 21, 23, 16, 31, 18, 13, 11, 35, 21, 17, 18, 19, 21, 10, 16, 25, 16, 24, 8, 22, 20, 16, 19, 10, 15, 20, 11, 8, 23, 22, 14, 21, 14, 17, 22, 22, 22, 17, 25, 18, 22, 22, 21, 23, 22, 1, 23, 22, 7, 22, 18, 22, 23, 22, 23, 22, 22, 23, 21, 14, 20, 5, 22, 21, 12, 17, 11, 21, 22, 17, 21, 20, 20, 24, 1, 24, 21, 20, 19, 24, 23, 17, 21, 19, 23, 20, 17, 22, 12, 20, 14, 25, 20, 20, 26, 21, 8, 22, 20, 9, 22, 14, 15, 25, 23, 19, 10, 3, 22, 20, 21, 17, 23, 21, 25, 22, 16]


#### tamanho mínimo, tamanho máximo, tamanho médio

In [21]:
# Shortest answer, in words.
min(tamanho)

1

In [22]:
# Longest answer, in words.
max(tamanho)

46

In [23]:
# Average answer length in words, truncated to an integer (word counts are
# never negative, so floor division gives the same result as int(a / b)).
print(sum(tamanho) // len(tamanho))

19
