In [1]:
import functions as func
import pandas as pd
import numpy as np
import re

import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used in later cells are hidden too — consider narrowing it to the specific
# categories that are known to be noisy.
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# Preprocessing

In [2]:
# Directory that contains all of the dataset files.
path = 'data'

# Load the data found under `path`. Later cells index the result by file name
# ('train.csv', 'test.csv', 'sample_submission.csv').
df_dict = func.readCSV_DATA(path)

In [3]:
# Training frame, looked up by its file name in the loaded datasets.
df_train = df_dict['train.csv']

In [4]:
# Split the target columns into question-level and answer-level groups,
# based on the lowercase prefix that precedes the first underscore.
question_lst = []
answer_lst = []

# Matches "<lowercase prefix>_" at the start of a column name.
prefix_pattern = re.compile('([a-z]*)_')

# The header is read from the raw training frame rather than from `df_train`,
# so that re-running this cell after the preprocessing cell cannot pick up any
# columns that step may add.
# NOTE(review): the first 11 columns are skipped positionally as non-target
# columns — confirm the offset against the CSV layout.
for column in df_dict['train.csv'].columns[11:]:
    match = prefix_pattern.match(column)
    if match is None:
        # No "<prefix>_" at the start of the name, so this is not a
        # question/answer target; skip it instead of failing on `None.group`.
        continue
    if match.group(1) == 'question':
        question_lst.append(column)
    else:
        answer_lst.append(column)

In [5]:
# Build the preprocessed training frame from the raw frame held in `df_dict`
# rather than from `df_train` itself, so re-running this cell does not feed
# already-prepared data back into `prepareData`. The test frame is prepared
# the same way in the submission section.
# NOTE(review): assumes `prepareData` returns a new frame and does not modify
# its argument in place — confirm in `functions.py`.
df_train = func.prepareData(df_dict['train.csv'])

# Extracting simple text features

In [6]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [7]:
%%time
# TF-IDF features for the cleaned question text.
# A term is kept only if it occurs in at least 3 documents and in at most
# 1/30th of the training rows (an integer `max_df` is an absolute count).
question_max_doc_count = df_train.shape[0] // 30
question_tfidf_options = dict(
    tokenizer=word_tokenize,
    strip_accents='ascii',
    stop_words='english',
    min_df=3,
    max_df=question_max_doc_count,
)
question_vectorizer = TfidfVectorizer(**question_tfidf_options)

# Learn the vocabulary and IDF weights, then vectorize the training questions.
question_X = question_vectorizer.fit_transform(df_train['question_text_clean'])

CPU times: user 5.94 s, sys: 3.62 ms, total: 5.94 s
Wall time: 6.06 s


In [8]:
%%time
# TF-IDF features for the cleaned answer text.
# A term is kept only if it occurs in at least 3 documents and in at most
# 1/30th of the training rows (an integer `max_df` is an absolute count).
answer_max_doc_count = df_train.shape[0] // 30
answer_tfidf_options = dict(
    tokenizer=word_tokenize,
    strip_accents='ascii',
    stop_words='english',
    min_df=3,
    max_df=answer_max_doc_count,
)
answer_vectorizer = TfidfVectorizer(**answer_tfidf_options)

# Learn the vocabulary and IDF weights, then vectorize the training answers.
answer_X = answer_vectorizer.fit_transform(df_train['answer_text_clean'])

CPU times: user 5.51 s, sys: 16 ms, total: 5.53 s
Wall time: 5.53 s


In [9]:
# Target frames: the question-level and answer-level label columns of the
# prepared training data.
question_y, answer_y = df_train[question_lst], df_train[answer_lst]

In [10]:
# Fixed seed so the 80/20 hold-out split is identical on every run; without it
# the reported train/test scores change each time the notebook is executed.
# Both feature matrices have one row per training example, so using the same
# seed for both calls also holds out the same rows for the two models.
SPLIT_SEED = 42

X_train_question, X_test_question, y_train_question, y_test_question = train_test_split(
    question_X, question_y, test_size=0.2, random_state=SPLIT_SEED)

X_train_answer, X_test_answer, y_train_answer, y_test_answer = train_test_split(
    answer_X, answer_y, test_size=0.2, random_state=SPLIT_SEED)

# Train a Linear Regression in NN format

## Create model - Answer

In [11]:
# Size the network from the data: one output per answer target column and one
# input per term in the fitted answer vocabulary.
output_dim, input_dim = len(answer_lst), len(answer_vectorizer.vocabulary_)

answer_model = func.create_model(output_dim, input_dim, 'binary_crossentropy')

# Fit on the training split; the held-out split is handed to the trainer too.
answer_model = func.train_model(
    answer_model,
    X_train_answer, y_train_answer,
    X_test_answer, y_test_answer,
    batch_size=128, nb_epoch=60, verbose=0,
)

In [12]:
# Spearman score of the answer model on the rows it was trained on.
# The model is fed a dense copy of the sparse TF-IDF matrix.
answer_train_pred = answer_model.predict(X_train_answer.toarray())
func.spearman_corr(y_train_answer, answer_train_pred)

0.5736139166916241

In [13]:
# Spearman score of the answer model on the held-out rows.
# The model is fed a dense copy of the sparse TF-IDF matrix.
answer_test_pred = answer_model.predict(X_test_answer.toarray())
func.spearman_corr(y_test_answer, answer_test_pred)

0.233128979166548

## Create model - Question

In [14]:
# Size the network from the data: one output per question target column and
# one input per term in the fitted question vocabulary.
output_dim, input_dim = len(question_lst), len(question_vectorizer.vocabulary_)

question_model = func.create_model(output_dim, input_dim, 'binary_crossentropy')

In [15]:
# Fit the question model on the training split. With verbose=1 the training
# log is printed; it reports the held-out split as validation data
# ("Train on ... samples, validate on ... samples").
question_model = func.train_model(question_model, X_train_question, y_train_question, X_test_question,
                                  y_test_question, batch_size=64, nb_epoch=100, verbose=1)

Train on 4863 samples, validate on 1216 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [18]:
# Spearman score of the question model on the rows it was trained on.
# The model is fed a dense copy of the sparse TF-IDF matrix.
question_train_pred = question_model.predict(X_train_question.toarray())
func.spearman_corr(y_train_question, question_train_pred)

0.6636870012671393

In [17]:
# Spearman score of the question model on the held-out rows.
# The model is fed a dense copy of the sparse TF-IDF matrix.
question_test_pred = question_model.predict(X_test_question.toarray())
func.spearman_corr(y_test_question, question_test_pred)

0.31949589982895704

# Submission

In [28]:
# Apply the same preparation step used for the training frame to the raw test
# frame, which is looked up by file name.
df_test = func.prepareData(df_dict['test.csv'])

In [29]:
# Vectorize the test text with the vectorizers fitted on the training data
# (`transform` only — the vocabularies are not re-fitted).
# NOTE(review): this rebinds `question_X` / `answer_X`, which previously held
# the training matrices; re-running the train/test split cell after this one
# would pair test features with training targets.
question_X = question_vectorizer.transform(df_test['question_text_clean'])
answer_X = answer_vectorizer.transform(df_test['answer_text_clean'])

In [30]:
# Build the submission frame from the sample submission, dense copies of the
# test feature matrices, and the two trained models.
sample_submission = df_dict['sample_submission.csv']
sub = func.make_submission_df(
    sample_submission,
    question_X.toarray(),
    answer_X.toarray(),
    question_model,
    answer_model,
)