In [None]:
import functions as func
import pandas as pd
import numpy as np
import gensim
import re

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from collections import defaultdict
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# Preprocessing

In [None]:
# Directory that contains all the datasets
path = 'data'

# Load the datasets found under `path`; the result is indexed by file name
# in later cells (e.g. 'train.csv', 'test.csv', 'sample_submission.csv').
df_dict = func.readCSV_DATA(path)

In [None]:
# Raw training frame, looked up in df_dict under the key 'train.csv'.
df_train = df_dict['train.csv']

In [None]:
# Quick look at the first rows of the training frame before any preparation.
df_train.head()

In [None]:
# Split the label columns into question-level and answer-level targets.
# The first 11 columns are skipped -- presumably ids/text/metadata rather than
# targets (TODO confirm against the dataset schema).
#
# A plain prefix test replaces the original regex: `re.match('([a-z]*)_', col)`
# returns None for any column that does not start with a lowercase `prefix_`,
# and the following `.group(1)` then raised AttributeError. For every column
# the regex did match, `startswith('question_')` gives the same answer.
target_columns = df_train.columns[11:]

# Columns named `question_*` are question targets ...
question_lst = [column for column in target_columns
                if column.startswith('question_')]
# ... and, as before, every remaining column is treated as an answer target.
answer_lst = [column for column in target_columns
              if not column.startswith('question_')]

In [None]:
# Build the prepared training frame from the raw frame held in df_dict rather
# than from df_train itself, so re-running this cell never feeds an
# already-prepared frame back into prepareData. On a first run the input is the
# same object as before; this mirrors how the test set is prepared further down.
df_train = func.prepareData(df_dict['train.csv'])

# Training word2vec

In [None]:
# Tokenize every entry of text_all_clean; the resulting list of token lists
# is the training corpus for Word2Vec.
corpus = list(map(word_tokenize, df_train.text_all_clean))

In [None]:
# Dimensionality of the word vectors to learn.
embed_dim = 150

# Train Word2Vec on the tokenized corpus.
# NOTE(review): `size=` is the gensim < 4.0 keyword (renamed `vector_size` in
# gensim 4.0) -- confirm the gensim version this notebook is pinned to.
# NOTE(review): workers=10 is hard-coded; confirm it suits the target machine.
model = gensim.models.Word2Vec(corpus,
                               size=embed_dim,
                               window=5,
                               min_count=2,
                               workers=10)

# Persist the trained model so it can be reloaded from disk in the next cell.
model.save("word2vec.model")

In [None]:
# Reload the model saved as "word2vec.model" (presumably so the training cell
# can be skipped once the file exists -- confirm intended workflow).
model = gensim.models.Word2Vec.load("word2vec.model")

In [None]:
# Word -> vector lookup that is handed to func.word2vec_vectorizer below.
# Words missing from the model fall back to a zero vector. Its length is read
# from the loaded model instead of a hard-coded 150, so the fallback always
# matches the real embedding size even if the model was trained with another
# dimension.
vector_dim = model.wv.vector_size
vocab = defaultdict(lambda: np.zeros(vector_dim))
for w in tqdm(list(model.wv.vocab.keys())):
    vocab[w] = model.wv[w]

## Using as input for model

In [None]:
# Turn the training question_text_clean column into model inputs via
# func.word2vec_vectorizer, passing the tokenizer and the vocab lookup.
question_X = func.word2vec_vectorizer(df_train.question_text_clean, word_tokenize, vocab)

In [None]:
# Same vectorization for the training answer_text_clean column.
answer_X = func.word2vec_vectorizer(df_train.answer_text_clean, word_tokenize, vocab)

In [None]:
# Label frames: the question-target and answer-target columns collected earlier.
question_y = df_train[question_lst]
answer_y = df_train[answer_lst]

In [None]:
# Fixed seed so the 80/20 splits -- and every score computed from them -- are
# reproducible across kernel restarts. Without random_state, train_test_split
# shuffles differently on each run.
SPLIT_SEED = 42

# Hold out 20% of the rows for the question model.
X_train_question, X_test_question, y_train_question, y_test_question = train_test_split(
    question_X, question_y, test_size=0.2, random_state=SPLIT_SEED)

# Hold out 20% of the rows for the answer model (split independently of the
# question split, as in the original).
X_train_answer, X_test_answer, y_train_answer, y_test_answer = train_test_split(
    answer_X, answer_y, test_size=0.2, random_state=SPLIT_SEED)

## Training answer model

In [None]:
# One output per answer target column.
output_dim = len(answer_lst)
# Input width follows the embedding size of the loaded word2vec model instead
# of a hard-coded 150, so it cannot drift from the vectors actually in use.
# NOTE(review): assumes func.word2vec_vectorizer emits one vector of this size
# per text -- confirm.
input_dim = model.wv.vector_size

answer_model = func.create_model(output_dim, input_dim, 'binary_crossentropy')

# Fit on the training split; the held-out split is passed along as well
# (presumably as validation data -- confirm in func.train_model).
answer_model = func.train_model(answer_model, X_train_answer, y_train_answer, X_test_answer,
                                y_test_answer, batch_size=128, nb_epoch=140, verbose=0)

In [None]:
# Score the answer model on its training split with func.spearman_corr.
func.spearman_corr(y_train_answer, answer_model.predict(X_train_answer))

In [None]:
# Score the answer model on the held-out split.
# NOTE(review): X_test_answer / y_test_answer were also passed to
# func.train_model; if they drive validation or early stopping there, this is
# not a fully independent estimate -- confirm.
func.spearman_corr(y_test_answer, answer_model.predict(X_test_answer))

## Training question model

In [None]:
# One output per question target column.
output_dim = len(question_lst)
# Define the input width here as well: the original cell relied on `input_dim`
# leaking from the answer-model cell, so it failed if run on its own.
# NOTE(review): assumes func.word2vec_vectorizer emits one vector of this size
# per text -- confirm.
input_dim = model.wv.vector_size

question_model = func.create_model(output_dim, input_dim, 'binary_crossentropy')

# Fit on the training split; the held-out split is passed along as well
# (presumably as validation data -- confirm in func.train_model).
question_model = func.train_model(question_model, X_train_question, y_train_question,
                                  X_test_question, y_test_question, batch_size=128, nb_epoch=100, verbose=1)

In [None]:
# Score the question model on its training split with func.spearman_corr.
func.spearman_corr(y_train_question, question_model.predict(X_train_question))

In [None]:
# Score the question model on the held-out split.
# NOTE(review): X_test_question / y_test_question were also passed to
# func.train_model; if they drive validation or early stopping there, this is
# not a fully independent estimate -- confirm.
func.spearman_corr(y_test_question, question_model.predict(X_test_question))

## Submission

In [None]:
# Apply the same preparation step used for the training data to the test frame.
df_test = func.prepareData(df_dict['test.csv'])

In [None]:
# Vectorize the test questions with the same tokenizer and vocab lookup.
# NOTE(review): this rebinds `question_X`, which previously held the TRAINING
# features; re-running the train/test split cell after this one would pair
# test features with training labels. Consider a distinct name.
question_X = func.word2vec_vectorizer(df_test.question_text_clean, word_tokenize, vocab)

In [None]:
# Vectorize the test answers with the same tokenizer and vocab lookup.
# NOTE(review): this rebinds `answer_X`, which previously held the TRAINING
# features; re-running the train/test split cell after this one would pair
# test features with training labels. Consider a distinct name.
answer_X = func.word2vec_vectorizer(df_test.answer_text_clean, word_tokenize, vocab)

In [None]:
# Build the submission frame from the sample submission, the test-set feature
# matrices computed just above, and the two trained models.
sub = func.make_submission_df(df_dict['sample_submission.csv'], question_X, answer_X,
                         question_model, answer_model)

In [None]:
# Display the submission frame for a visual sanity check.
sub