In [None]:
import re
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# Preprocessing

In [None]:
# Read the training set from the working directory into `df`.
train_csv_path = 'train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Build one document per row: title, body and answer joined by blank lines.
text_parts = zip(df['question_title'], df['question_body'], df['answer'])
df['text_concat'] = ['\n\n'.join(parts) for parts in text_parts]

# Replace each digit run, together with whatever follows it up to the next
# space, comma or period, with the placeholder token `_num_`.
# NOTE(review): `[^ ,.]*` also matches newlines, so a number at the very end of
# one field can absorb the beginning of the next field - confirm this is intended.
number_pattern = re.compile('[0-9]+[^ ,.]*[0-9]*')
df['text_concat_filter'] = [number_pattern.sub('_num_', text) for text in df['text_concat']]

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
%%time
# TF-IDF features over the number-masked text. A term must occur in at least
# 3 documents; the integer `max_df` (row count divided by 30) is passed as an
# upper bound on document frequency.
max_doc_count = df.shape[0] // 30
tfidf_options = dict(
    tokenizer=word_tokenize,
    strip_accents='ascii',
    stop_words='english',
    min_df=3,
    max_df=max_doc_count,
)
vectorizer = TfidfVectorizer(**tfidf_options)
X = vectorizer.fit_transform(df['text_concat_filter'])

In [None]:
# Target frame: the columns at positions 11 through 40 of the training data.
# NOTE(review): positional slice - presumably the label columns; verify against train.csv.
target_start, target_stop = 11, 41
y = df.iloc[:, target_start:target_stop]

In [None]:
# Hold out 15% of the rows for validation. The split is seeded so the metrics
# reported further down are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Train a single-layer sigmoid regression as a neural network

In [None]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Activation

In [None]:
# Convert the sparse TF-IDF splits to dense arrays and the target frames to
# NumPy arrays before handing them to Keras.
# `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
# Run this cell once per split: after it runs, the names hold plain ndarrays,
# which have no `.toarray()` / `.to_numpy()` methods.
X_train, X_test, y_train, y_test = (
    X_train.toarray(),
    X_test.toarray(),
    y_train.to_numpy(),
    y_test.to_numpy(),
)

## Create model

In [None]:
# Training hyper-parameters used by the fit call.
batch_size = 128
nb_epoch = 20

# One output unit per target column, one input per vocabulary term.
output_dim = y.shape[1]
input_dim = len(vectorizer.vocabulary_)

# A single dense layer with a sigmoid activation maps the TF-IDF vector
# directly onto the targets.
model = Sequential([
    Dense(output_dim, input_dim=input_dim, activation='sigmoid'),
])

In [None]:
# Minimise mean squared error with Adam; MSE is also tracked as a metric.
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

# Fit on the training split and pass the held-out split as validation data.
fit_options = dict(batch_size=batch_size, epochs=nb_epoch, verbose=1)
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), **fit_options)

## Evaluate

In [None]:
# Report the MSE on both splits ("Treino" = train, "Teste" = test).
# Index 1 of evaluate()'s result is the metric requested in compile(); index 0 is the loss.
print("MSE")
train_mse = model.evaluate(X_train, y_train, verbose=0)[1]
print("Treino: %.2f" % train_mse)
test_mse = model.evaluate(X_test, y_test, verbose=0)[1]
print("Teste : %.2f" % test_mse)

In [None]:
# Model predictions for the training and held-out splits, in that order.
y_hat_train, y_hat_test = (model.predict(split) for split in (X_train, X_test))

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between predictions and labels, one result per
# target column, for the held-out split and for the training split.
n_targets = y.shape[1]
spears_test = [spearmanr(y_hat_test[:, col], y_test[:, col]) for col in range(n_targets)]
spears_train = [spearmanr(y_hat_train[:, col], y_train[:, col]) for col in range(n_targets)]

In [None]:
# Mean Spearman correlation across target columns ("Treino" = train,
# "Teste" = test). np.nanmean skips columns whose correlation is NaN, which is
# what the explicit isnan filter did before. NumPy is imported at the top of
# the notebook; previously `np` was used here without ever being imported.
print('Spearman Correlation')
print("Treino: %.2f" % np.nanmean([result.correlation for result in spears_train]))
print("Teste : %.2f" % np.nanmean([result.correlation for result in spears_test]))

# Submission

In [None]:
# Load the submission template and the test set from the working directory.
submission_template_path = "sample_submission.csv"
test_csv_path = 'test.csv'
sub = pd.read_csv(submission_template_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# Prepare the test text the same way as the training text: join title, body
# and answer with blank lines, then mask numbers with `_num_`.
test_parts = zip(df_test['question_title'], df_test['question_body'], df_test['answer'])
df_test['text_concat'] = ['\n\n'.join(parts) for parts in test_parts]

number_pattern = re.compile('[0-9]+[^ ,.]*[0-9]*')
df_test['text_concat_filter'] = [number_pattern.sub('_num_', text) for text in df_test['text_concat']]

# transform (not fit_transform): reuse the already-fitted vectorizer.
# Note that this rebinds X_test, which earlier held the validation features.
X_test = vectorizer.transform(df_test['text_concat_filter'])

In [None]:
# Densify the test-set features and predict. This rebinds y_hat_test, which
# earlier held the validation predictions.
submission_features = X_test.toarray()
y_hat_test = model.predict(submission_features)

In [None]:
# Copy each prediction column into the submission frame under the matching
# target column name (same order as the columns of `y`).
for position, target_name in enumerate(y.columns):
    sub[target_name] = y_hat_test[:, position]

In [None]:
# Preview the first five rows of the filled-in submission.
sub.head(n=5)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)