## PROCESSANDO AS QUESTÕES
#### Utilizando GloVe 100 Dim - Português
#### Dataset: Questão sobre cientificidade da Psicologia - PUC GO - Prof. Weber Martins, PhD

#### 2 labels classificando por notas agrupadas da seguinte forma:
##### notas menores que 6 (0 a 5) e notas maiores ou iguais a 6 (6 a 10)

In [0]:
#imports


from __future__ import print_function 

import gensim
from gensim import utils
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import ml_metrics as metrics

##### edit
from keras.models import load_model
from keras.models import model_from_json
 
####

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### definir variáveis globais

In [0]:
# Root directory for the input files; the empty string makes every path
# relative to the notebook's working directory.
BASE_DIR = ''
# Directory holding the pre-trained GloVe vectors.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove_s100')
# CSV file with the answers and their grades.
FNAME = 'preprocessado_sem_stopwords.csv'
# NOTE(review): in this notebook this value is only used as the width of the
# network input, where it coincides with EMBEDDING_DIM.
MAX_SEQUENCE_LENGTH = 100
# NOTE(review): not referenced by any cell visible in this notebook.
MAX_NUM_WORDS = 10000
# Dimensionality of the word vectors (glove_s100 -> 100).
EMBEDDING_DIM = 100
# Fraction of the samples held out for validation.
VALIDATION_SPLIT = 0.3

#### Carregar o word embedding no gensim


In [0]:
# Load the pre-trained vectors from a text (non-binary) word2vec-format file.
# NOTE: the name `model` is rebound to the Keras network further down, so the
# word vectors are no longer reachable after the model-building cell has run.
model = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(GLOVE_DIR, 'glove_s100.txt'), binary=False)

#### carregando o dataset (respostas e notas)

In [0]:
print('Processing text dataset')

# The CSV is read as ISO-8859-1 (Latin-1).
df = pd.read_csv(FNAME, encoding = "iso-8859-1")
# 'resposta' holds the answer text and 'nota' the grade given to it.
texts = df['resposta'].values.tolist()
labels = df['nota'].values.tolist()

Processing text dataset


In [0]:
# Binarise the grades: grades below NOTA_CORTE become class 0, the rest class 1.
# The classes are rebuilt from the original `df['nota']` column instead of
# mutating `labels` in place, so re-running this cell gives the same result
# (the in-place version turned every label into 0 on a second run).
NOTA_CORTE = 6
labels = [0 if nota < NOTA_CORTE else 1 for nota in df['nota'].values.tolist()]

# Number of answers in each class.
quantidade = {'0_5': labels.count(0), '6_10': labels.count(1)}
print(quantidade)

{'0_6': 169, '6_10': 73}


### Tokenize

In [0]:
def preprocess(text):
    """Tokenise `text` and keep only the purely alphabetic tokens.

    Tokenisation is delegated to Keras' `text_to_word_sequence` with its
    default settings; every token for which `isalpha()` is false is dropped.
    Returns the remaining tokens as a list, in their original order.
    """
    tokens = text_to_word_sequence(text)
    return list(filter(lambda token: token.isalpha(), tokens))

In [0]:
# One token list per answer, in the same order as `texts` and `labels`.
corpus = [preprocess(text) for text in texts]

### Remove empty docs

In [0]:
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels in lockstep using a predicate on documents.

    Parameters
    ----------
    corpus : list
        Tokenised documents.
    texts : list or None
        Raw texts aligned with `corpus`; passed through untouched when None.
    labels : list
        Labels aligned with `corpus`.
    condition_on_doc : callable
        Takes a doc and returns a truthy value when the doc must be kept.

    Returns
    -------
    tuple
        `(corpus, texts, labels)` restricted to the kept documents.

    Prints how many documents were removed.
    """
    number_of_docs = len(corpus)

    # Evaluate the predicate exactly once per document. Besides avoiding up to
    # three evaluations of a possibly expensive check, this guarantees that
    # corpus, texts and labels are filtered by the very same decisions.
    keep = [bool(condition_on_doc(doc)) for doc in corpus]

    if texts is not None:
        texts = [text for (text, kept) in zip(texts, keep) if kept]

    labels = [label for (label, kept) in zip(labels, keep) if kept]
    corpus = [doc for (doc, kept) in zip(corpus, keep) if kept]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts, labels)

In [0]:
# Drop the answers whose token list is empty after preprocessing.
corpus, texts, labels = filter_docs(corpus, texts, labels, lambda doc: (len(doc) != 0))

0 docs removed


### Remove words that are not in the model and documents that don't have a representation

In [0]:
def document_vector(word2vec_model, doc):
    """Return the mean embedding of the in-vocabulary words of `doc`.

    Parameters
    ----------
    word2vec_model
        Object exposing a `vocab` container and list-based vector lookup
        (`word2vec_model[list_of_words]`).
    doc : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Average over axis 0 of the vectors of the known words.

    Raises
    ------
    ValueError
        If no token of `doc` is in the vocabulary, since the mean of zero
        vectors is undefined.
    """
    # remove out-of-vocabulary words
    known_words = [word for word in doc if word in word2vec_model.vocab]
    if not known_words:
        # Fail with an explicit message instead of handing an empty list to
        # the model lookup / np.mean.
        raise ValueError(
            "document has no in-vocabulary words; "
            "filter it out with has_vector_representation() first"
        )
    return np.mean(word2vec_model[known_words], axis=0)

In [0]:
def has_vector_representation(word2vec_model, doc):
    """Tell whether `doc` contains at least one word known to the model.

    Returns True as soon as a word of `doc` is found in
    `word2vec_model.vocab`, and False when none is (including an empty doc).
    """
    for word in doc:
        if word in word2vec_model.vocab:
            return True
    return False

In [0]:
# Drop the answers in which no token has a vector in the embedding model.
corpus, texts, labels = filter_docs(corpus, texts, labels, lambda doc: has_vector_representation(model, doc))

0 docs removed


In [0]:
# One averaged embedding per document, in corpus order.
x = [document_vector(model, doc) for doc in corpus]

In [0]:
# Convert the list of document vectors into a single array.
X = np.array(x) #list to array

In [0]:
# np.asarray converts the list of integer class ids to an array, and
# to_categorical turns that class vector into a binary class matrix.
# NOTE: `labels` is rebound from a list of ints to that matrix, so this cell
# should not be re-run without first re-running the cells that build `labels`.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (242, 100)
Shape of label tensor: (242, 2)


In [0]:
# split the data into a training set and a validation set
# NOTE: `labels` is overwritten with its shuffled version while `X` is left
# untouched, so re-running only this cell misaligns data and labels; re-run
# from the cell that builds `labels` instead.
indices = np.arange(X.shape[0])  # one integer index per sample
# Shuffle with a fixed seed so that the train/validation partition (and the
# metrics computed on it) is reproducible from one run to the next.
np.random.RandomState(42).shuffle(indices)
data = X[indices]  # samples and labels reordered by the same permutation
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])  # size of the validation set, truncated to an integer

In [0]:
# Index of the first validation sample. Slicing with an explicit positive
# index (instead of `[:-n]` / `[-n:]`) keeps the split correct when
# num_validation_samples is 0: `data[:-0]` would give an EMPTY training set
# and `data[-0:]` would put every sample in the validation set.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [0]:
print('Training model.')

# Feed-forward classifier over the averaged-embedding document vectors:
# one hidden ReLU layer followed by a softmax over the grade classes.
# The input width is taken from the training data itself; the previous
# MAX_SEQUENCE_LENGTH only worked because it happens to equal the embedding
# dimension (both are 100).
sequence_input = Input(shape=(x_train.shape[1],), dtype='float32')
hidden = Dense(200, activation='relu')(sequence_input)
# One output unit per column of the one-hot label matrix.
preds = Dense(y_train.shape[1], activation='softmax')(hidden)

# NOTE: this rebinds `model`, which until here referred to the gensim word
# vectors; the embeddings must be reloaded before re-running earlier cells.
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()



Training model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        (None, 100)               0         
_________________________________________________________________
dense_60 (Dense)             (None, 200)               20200     
_________________________________________________________________
dense_61 (Dense)             (None, 2)                 402       
Total params: 20,602
Trainable params: 20,602
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train for 100 epochs, evaluating on the held-out validation split after each
# one. batch_size=500 is larger than the training set reported in the output
# below (170 samples), so every epoch is a single full-batch update.
model.fit(x_train, y_train,
          batch_size=500,
          epochs=100,
          validation_data=(x_val, y_val))

Train on 170 samples, validate on 72 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x2512948d128>

# testando o modelo

In [0]:
# Model outputs (softmax layer) for every validation sample.
predicted = model.predict(x_val)

#### visualizando as notas esperadas e previstas pelo modelo

In [0]:
# Turn each row of model outputs into a one-hot row: start from an all-zero
# array of the same shape and set to 1 the column holding the row's maximum.
aux = np.array(predicted)
np_predicted = np.zeros_like(aux)
np_predicted[np.arange(len(aux)), aux.argmax(1)] = 1

In [0]:
# Show, side by side, the one-hot expected label and the one-hot prediction
# of every validation sample.
print("expected predicted")
for expected_row, predicted_row in zip(y_val, np_predicted):
    print(str(expected_row), " ", predicted_row)

expected predicted
[0. 1.]   [1. 0.]
[0. 1.]   [0. 1.]
[1. 0.]   [1. 0.]
[0. 1.]   [0. 1.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[0. 1.]   [1. 0.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[0. 1.]   [1. 0.]
[0. 1.]   [0. 1.]
[1. 0.]   [1. 0.]
[1. 0.]   [0. 1.]
[1. 0.]   [1. 0.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [0. 1.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [0. 1.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [0. 1.]
[0. 1.]   [1. 0.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [0. 1.]
[0. 1.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]   [1. 0.]
[1. 0.]  

### QUADRATIC WEIGHTED KAPPA METRICS

In [0]:


# Collapse the one-hot rows back to integer class ids.
# NOTE: `predicted` is rebound here from the model's output matrix to a 1-D
# vector of class ids, so the cell that builds `np_predicted` cannot be re-run
# afterwards without calling model.predict again.
predicted = np.argmax(np_predicted,axis = 1)
expected = np.argmax(y_val, axis = 1)
print(predicted)
print(expected)

# Agreement between predicted and expected classes on the validation set.
qwk = metrics.quadratic_weighted_kappa(predicted, expected) 

qwk

[0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0]
[1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1]


0.030172413793103536

# Registro de performance

#### run 1, qwk: 0.16000000000000003, val_acc = 0.7083  , epochs = 30, vsplit = 0.1
#### run 2, qwk: 0.00917431192660545, val_acc = 0.6250  , epochs = 30, vsplit = 0.2
#### run 3, qwk: 0.04166666666666674, val_acc = 0.6806  , epochs = 30, vsplit = 0.3
#### <font color = "red"> run 4, qwk: 0.2660944206008583, val_acc = 0.7361, epochs = 30, vsplit = 0.3 </font>
#### run 5, qwk: 0.10891089108910901, val_acc = 0.6528, epochs = 50, vsplit = 0.3
#### run 6, qwk: 0.030172413793103536, val_acc = 0.6528, epochs = 100, vsplit = 0.3


## Sobre o dataset :

#### tamanho de cada resposta

In [0]:
# Word count (whitespace-separated) of every answer; answers made of a single
# word are printed as they are found.
tamanho = []
for t in texts:
    n_palavras = len(t.split())
    tamanho.append(n_palavras)
    if n_palavras == 1:
        print(t)
print(tamanho)

#### tamanho mínimo, tamanho máximo, tamanho médio

In [0]:
# Shortest answer, in words.
min(tamanho)

In [0]:
# Longest answer, in words.
max(tamanho)

In [0]:
# Average answer length in words, truncated to an integer.
print(int(sum(tamanho)/len(tamanho)))