In [None]:
import functions as func
import pandas as pd
import numpy as np
import re

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# Preprocessing

In [None]:
# A directory that contains all the datasets
path = 'data'

# NOTE(review): presumably returns a mapping from CSV file name to DataFrame
# (later cells index it with 'train.csv', 'test.csv', ...) — confirm in functions.py.
df_dict = func.readCSV_DATA(path)

In [None]:
# Training data, looked up by its file name in the loaded dataset mapping.
df_train = df_dict['train.csv']

In [None]:
# Split the label columns into question-level and answer-level targets by
# their name prefix. Original column order is preserved within each list.
# NOTE(review): assumes the label columns start at position 11 of train.csv — confirm.
label_columns = df_train.columns[11:]

# A plain prefix test selects the same columns as the earlier
# re.match('([a-z]*)_', ...) == 'question' check, but it cannot fail on a
# column name that has no "<lowercase>_" prefix (re.match returned None there
# and .group(1) raised AttributeError). Such columns fall into answer_lst,
# like every other non-question column.
question_lst = [column for column in label_columns
                if column.startswith('question_')]
answer_lst = [column for column in label_columns
              if not column.startswith('question_')]

In [None]:
# Run the project's preprocessing helper on the training frame.
# NOTE(review): later cells read 'question_text_clean' and 'answer_text_clean',
# so presumably prepareData adds those columns — confirm in functions.py.
# NOTE(review): df_train is rebound to the helper's output, so re-running this
# cell feeds already-prepared data back in — verify prepareData tolerates that.
df_train = func.prepareData(df_train)

# Extracting simple text features

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
%%time
question_vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                                      strip_accents='ascii',
                                      stop_words='english',
                                      min_df=3,
                                      max_df=int(df_train.shape[0]/30))

question_X = question_vectorizer.fit_transform(df_train['question_text_clean'])

In [None]:
%%time
answer_vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                                    strip_accents='ascii',
                                    stop_words='english',
                                    min_df=3,
                                    max_df=int(df_train.shape[0]/30))
answer_X = answer_vectorizer.fit_transform(df_train['answer_text_clean'])

In [None]:
# Target frames: one column per label, selected by the two column-name lists.
question_y = df_train[question_lst]
answer_y = df_train[answer_lst]

In [None]:
# Fixed seed so the 80/20 hold-out split (and every score computed from it)
# is reproducible across kernel restarts.
SPLIT_SEED = 42

# Both feature matrices have one row per training example, so using the same
# seed for both calls also puts the same rows in the question and answer
# hold-out sets.
X_train_question, X_test_question, y_train_question, y_test_question = train_test_split(
    question_X, question_y, test_size=0.2, random_state=SPLIT_SEED)

X_train_answer, X_test_answer, y_train_answer, y_test_answer = train_test_split(
    answer_X, answer_y, test_size=0.2, random_state=SPLIT_SEED)

# Train a single-layer sigmoid regression (logistic-regression style) as a neural network

In [None]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Activation

## Create model - Answer

In [None]:
# Training hyperparameters, read by the cell that fits the answer model.
batch_size = 128
nb_epoch = 20

# One input per TF-IDF vocabulary term, one output per answer label.
input_dim = len(answer_vectorizer.vocabulary_)
output_dim = len(answer_lst)

# A single dense layer with sigmoid outputs and no hidden layer.
answer_model = Sequential([
    Dense(output_dim, input_dim=input_dim, activation='sigmoid'),
])

In [None]:
# Mean absolute error is used both as the training loss and as the reported metric.
answer_model.compile(optimizer='adam', loss='mae', metrics=['mae'])

# Keras is given dense arrays: the sparse TF-IDF matrices are densified with
# .toarray() and the target frames are converted with .values.
# (DataFrame.as_matrix() was removed in pandas 1.0 and raises AttributeError.)
history = answer_model.fit(X_train_answer.toarray(), y_train_answer.values,
                           batch_size=batch_size,
                           epochs=nb_epoch,
                           verbose=1,
                           validation_data=(X_test_answer.toarray(), y_test_answer.values))

## Create model - Question

In [None]:
# Training hyperparameters, read by the cell that fits the question model.
batch_size = 64
nb_epoch = 100

# One input per TF-IDF vocabulary term, one output per question label.
input_dim = len(question_vectorizer.vocabulary_)
output_dim = len(question_lst)

# A single dense layer with sigmoid outputs and no hidden layer.
question_model = Sequential([
    Dense(output_dim, input_dim=input_dim, activation='sigmoid'),
])

In [None]:
# Mean absolute error is used both as the training loss and as the reported metric.
question_model.compile(optimizer='adam', loss='mae', metrics=['mae'])

# Keras is given dense arrays: the sparse TF-IDF matrices are densified with
# .toarray() and the target frames are converted with .values.
# (DataFrame.as_matrix() was removed in pandas 1.0 and raises AttributeError.)
history = question_model.fit(X_train_question.toarray(), y_train_question.values,
                             batch_size=batch_size,
                             epochs=nb_epoch,
                             verbose=1,
                             validation_data=(X_test_question.toarray(), y_test_question.values))

## Evaluate

In [None]:
print("MSE")
print("Treino: %.2f" % model.evaluate(X_train, y_train, verbose=0)[1])
print("Teste : %.2f" % model.evaluate(X_test, y_test, verbose=0)[1])

In [None]:
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

In [None]:
from scipy.stats import spearmanr

spears_train = []
spears_test = []

for i in range(y.shape[1]):  
    spears_test.append(spearmanr(y_hat_test[:,i], y_test[:,i]))
    spears_train.append(spearmanr(y_hat_train[:,i], y_train[:,i]))

In [None]:
print('Spearman Correlation')
print("Treino: %.2f" % np.mean([i.correlation for i in spears_train if ~np.isnan(i.correlation)]))
print("Teste : %.2f" % np.mean([i.correlation for i in spears_test if ~np.isnan(i.correlation)]))

# Submission

In [None]:
# Submission template and test data, looked up by file name.
sub = df_dict['sample_submission.csv']
df_test = df_dict['test.csv']

In [None]:
# Apply the same preprocessing helper that was used on the training frame.
df_test = func.prepareData(df_test)

# Reuse the vocabularies fitted on the training text: transform only, no refit.
question_X_test = question_vectorizer.transform(df_test['question_text_clean'])
answer_X_test = answer_vectorizer.transform(df_test['answer_text_clean'])

In [None]:
# Predict every label for the test rows; the sparse TF-IDF matrices are
# densified before being passed to the models.
answer_y_hat_test = answer_model.predict(answer_X_test.toarray())
question_y_hat_test = question_model.predict(question_X_test.toarray())

In [None]:
# Join the two prediction matrices column-wise: question labels first, then
# answer labels, matching the question_lst + answer_lst column order.
y_hat_test = np.concatenate([question_y_hat_test, answer_y_hat_test], axis=1)

In [None]:
# Write each prediction column into the submission frame under its label name.
# Column k of y_hat_test belongs to the k-th name of question_lst + answer_lst.
for position, label in enumerate(question_lst + answer_lst):
    sub[label] = y_hat_test[:, position]

In [None]:
# Preview the first rows of the filled-in submission frame.
sub.head()

In [None]:
# Write the submission file to the working directory, without the row index.
sub.to_csv("submission.csv", index = False)