In [7]:
import functions as func
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Preprocessing

In [2]:
# Directory that contains all of the datasets.
path = 'data'

# func.readCSV_DATA is a project helper (not shown here); later cells index
# its result by file name, e.g. df_dict['train.csv'].
df_dict = func.readCSV_DATA(path)

In [3]:
# Pass the raw training table through the project's preparation helper;
# later cells read the 'text_concat_filter' column of the result.
df_train = func.prepareData(df_dict['train.csv'])

# Extracting simple text features

In [4]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [5]:
%%time
# Upper document-frequency bound as an absolute document count:
# one thirtieth of the number of training rows.
max_doc_count = df_train.shape[0] // 30

vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    strip_accents='ascii',
    stop_words='english',
    min_df=3,
    max_df=max_doc_count,
)

# Learn the vocabulary and build the TF-IDF matrix in a single pass.
X = vectorizer.fit_transform(df_train['text_concat_filter'])

CPU times: user 13.2 s, sys: 21.8 ms, total: 13.3 s
Wall time: 15.1 s


In [9]:
# Target frame: positional columns 11..40 (30 columns) of the prepared data.
# NOTE(review): presumably these are the 30 label columns; the slice is
# position-based, so confirm it against the output of func.prepareData.
y = df_train.iloc[:,11:41]

In [10]:
# Hold out 15% of the rows for validation. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Train a single-layer sigmoid regression as a neural network

In [12]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Activation

In [13]:
# Convert everything to dense NumPy arrays before handing it to Keras:
# the TF-IDF matrices are densified and the target DataFrames become ndarrays.
# DataFrame.as_matrix() is deprecated and no longer exists in pandas >= 1.0;
# DataFrame.to_numpy() is the supported replacement.
# Note: this rebinds the split variables in place, so re-run the
# train/test split cell before re-running this one.
X_train = X_train.toarray()
X_test = X_test.toarray()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

## Create model

In [14]:
# One output unit per target column, one input per vocabulary term.
output_dim = y.shape[1]
input_dim = len(vectorizer.vocabulary_)

# Training hyper-parameters consumed by the fit cell.
batch_size = 128
nb_epoch = 20

# A single fully connected layer with a sigmoid activation.
model = Sequential([
    Dense(output_dim, input_dim=input_dim, activation='sigmoid'),
])

In [15]:
# Minimise mean squared error with Adam; MSE is also registered as a metric,
# so model.evaluate(...) returns it at index 1.
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

# Fit on the training split and score the held-out split after every epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=nb_epoch,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 5167 samples, validate on 912 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Evaluate

In [16]:
# evaluate() returns [loss, metric]; index 1 is the MSE metric.
train_mse = model.evaluate(X_train, y_train, verbose=0)[1]
test_mse = model.evaluate(X_test, y_test, verbose=0)[1]

print("MSE")
print("Treino: %.2f" % train_mse)
print("Teste : %.2f" % test_mse)

MSE
Treino: 0.05
Teste : 0.06


In [17]:
# Model predictions for the training and held-out splits; the next cell
# compares them column by column with the true targets.
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

In [18]:
from scipy.stats import spearmanr

# Spearman rank correlation between prediction and ground truth,
# one result object per target column, for each split.
n_targets = y.shape[1]
spears_test = [
    spearmanr(y_hat_test[:, k], y_test[:, k]) for k in range(n_targets)
]
spears_train = [
    spearmanr(y_hat_train[:, k], y_train[:, k]) for k in range(n_targets)
]

In [19]:
def _mean_valid_correlation(results):
    """Average the `.correlation` of each result, skipping NaN entries."""
    valid = [r.correlation for r in results if not np.isnan(r.correlation)]
    return np.mean(valid)


print('Spearman Correlation')
print("Treino: %.2f" % _mean_valid_correlation(spears_train))
print("Teste : %.2f" % _mean_valid_correlation(spears_test))

Spearman Correlation
Treino: 0.41
Teste : 0.19


# Submission

In [20]:
# Work on a copy of the sample submission so that writing prediction columns
# into `sub` later does not modify the frame stored in df_dict.
sub = df_dict['sample_submission.csv'].copy()
# Raw test table; it is prepared and vectorised in the following cells.
df_test = df_dict['test.csv']

In [21]:
# Run the test table through the same preparation helper as the training data.
# NOTE(review): this rebinds df_test, so re-running the cell would call
# prepareData on data that has already been prepared.
df_test = func.prepareData(df_test)

# Vectorise with the already-fitted vectorizer (transform only, no refit).
# NOTE(review): this overwrites the X_test produced by the train/validation
# split, so the evaluation cells above no longer line up with y_test if they
# are re-run after this point.
X_test = vectorizer.transform(df_test['text_concat_filter'])

In [22]:
# Predict on the densified test-set features.
# NOTE(review): this reuses the name y_hat_test, replacing the validation
# predictions computed earlier in the notebook.
y_hat_test = model.predict(X_test.toarray())

In [23]:
# Copy each prediction column into the submission column of the same name;
# the column order follows the target frame `y`.
target_names = list(y.columns)
for col_index, col in enumerate(target_names):
    sub[col] = y_hat_test[:, col_index]

In [24]:
# Preview the first rows of the submission frame after filling in predictions.
sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.872079,0.66974,0.164625,0.680046,0.696886,0.679883,0.672172,0.652595,0.290822,...,0.833562,0.870048,0.699514,0.87974,0.881179,0.843958,0.320865,0.162392,0.616357,0.866328
1,46,0.794487,0.561225,0.176894,0.65769,0.726701,0.740977,0.530179,0.461946,0.281977,...,0.730334,0.798691,0.618893,0.811722,0.818366,0.766045,0.631856,0.224015,0.422958,0.78823
2,70,0.834753,0.57826,0.167457,0.646698,0.729259,0.681578,0.600121,0.560264,0.324556,...,0.796926,0.846578,0.651904,0.857214,0.859079,0.821498,0.348282,0.193854,0.629352,0.840291
3,132,0.82071,0.540696,0.153079,0.716076,0.75722,0.793849,0.567747,0.436264,0.278183,...,0.753776,0.838405,0.65753,0.848023,0.852265,0.806483,0.587206,0.18779,0.518986,0.833823
4,200,0.795944,0.648624,0.217606,0.626658,0.663419,0.692278,0.63158,0.610963,0.384792,...,0.75195,0.797245,0.6398,0.804321,0.804819,0.746243,0.389154,0.230377,0.561255,0.78716


In [25]:
# Write the submission to disk without the DataFrame index column.
sub.to_csv("submission.csv", index = False)