In [None]:
import warnings

import numpy as np
import pandas as pd

import functions as func

warnings.filterwarnings("ignore")

# Preprocessing

In [None]:
# Directory that contains all the dataset CSV files.
path = 'data'
# Load the datasets found under `path` via the project helper. The result is
# indexed by file name in later cells (e.g. 'train.csv'); presumably a dict of
# DataFrames — confirm against functions.readCSV_DATA.
df_dict = func.readCSV_DATA(path)

In [None]:
# Select the training table from the loaded datasets.
df_train = df_dict['train.csv']

In [None]:
# Project helper from `functions`; presumably shows example text rows as a
# quick sanity check — verify against its definition.
func.visualizeSampleText(df_train)

In [None]:
# Run the project's preparation step on the training data. Presumably this is
# what creates the 'text_concat_filter' column read later — TODO confirm.
# NOTE: this rebinds `df_train`, so re-running the cell feeds already-prepared
# data back into prepareData.
df_train = func.prepareData(df_train)

# Extracting simple text features

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
%%time
vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                             strip_accents='ascii',
                             stop_words='english',
                             min_df = 3,
                             max_df = int(df.shape[0]/30))
X = vectorizer.fit_transform(df['text_concat_filter'])

In [None]:
# Number of distinct terms kept in the fitted TF-IDF vocabulary.
len(vectorizer.vocabulary_)

In [None]:
# Target matrix: the 30 columns at positions 11..40 of the prepared training
# frame (`df_train`; there is no `df` in this notebook).
# NOTE(review): positional slice — presumably these are exactly the label
# columns; confirm against the train.csv schema.
y = df_train.iloc[:, 11:41]

In [None]:
# Hold out 15% of the rows for evaluation. The seed is fixed so the split, and
# therefore every metric reported below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

## Baseline model

### Train a linear regression for each class column.

In [None]:
from sklearn.linear_model import LinearRegression

from scipy.special import softmax
from scipy.stats import spearmanr

In [None]:
from tqdm import tqdm

In [None]:
# Fit one independent linear regressor per target column and record its
# Spearman rank correlation on the training and held-out splits.
spears_train = []
spears_test = []
models = []

for i in tqdm(range(y.shape[1])):
    target_train = y_train.iloc[:, i]
    target_test = y_test.iloc[:, i]

    reg = LinearRegression().fit(X_train, target_train)
    models.append(reg)

    # spearmanr returns a (correlation, pvalue) result. Keep only the
    # coefficient, otherwise averaging the list also averages in p-values.
    spears_train.append(spearmanr(reg.predict(X_train), target_train).correlation)
    spears_test.append(spearmanr(reg.predict(X_test), target_test).correlation)

In [None]:
# Mean Spearman coefficient across all target columns.
# Entries may be plain floats or full spearmanr result objects; extract the
# coefficient in the latter case so p-values never leak into the average.
train_corrs = [getattr(s, "correlation", s) for s in spears_train]
test_corrs = [getattr(s, "correlation", s) for s in spears_test]

print("Train spearman corr: %.2f" % np.mean(train_corrs))
print("Test spearman corr: %.2f" % np.mean(test_corrs))

### Submission test

In [None]:
# Select the test table used to build the submission.
df_test = df_dict['test.csv']

In [None]:
# Apply the same project preparation step to the test data.
# NOTE: rebinds `df_test`, so re-running this cell prepares it a second time.
df_test = func.prepareData(df_test)

# Reuse the already-fitted vectorizer (transform only, no refit) so the test
# features are expressed in the vocabulary learned from the training text.
X_sub = vectorizer.transform(df_test['text_concat_filter'])

In [None]:
def min_max(x):
    """Linearly rescale ``x`` onto the interval [0, 1].

    Parameters
    ----------
    x : array-like exposing ``.min()`` / ``.max()`` and elementwise arithmetic
        (e.g. a NumPy array or pandas Series).

    Returns
    -------
    Same container type as ``x`` with ``(x - min) / (max - min)`` applied.
    If every element of ``x`` is identical the range is zero; all-zero floats
    are returned instead of the NaNs a 0/0 division would produce.
    """
    mini = x.min()
    maxi = x.max()
    span = maxi - mini

    if span == 0:
        # Constant input: the numerator is already all zeros, so just force a
        # float result without dividing by zero.
        return (x - mini) * 0.0

    return (x - mini) / span

In [None]:
# One min-max-scaled prediction vector per target column, in the same order
# as the fitted `models` list.
y_hat_test = [min_max(models[k].predict(X_sub)) for k in range(y.shape[1])]

In [None]:
# Collect the submission columns: the question/answer id followed by one
# prediction vector per target column.
sub_dict = {'qa_id': df_test['qa_id']}

# Target names come from the training frame (`df_train`; no `df` exists in
# this notebook), using the same positional slice that defines the targets.
target_cols = df_train.iloc[:, 11:41].columns
for col, pred in zip(target_cols, y_hat_test):
    sub_dict[col] = pred

In [None]:
# Assemble the submission table: one column per key in sub_dict.
df_sub = pd.DataFrame(sub_dict)

In [None]:
# Display the submission frame for a visual check before writing it out.
df_sub

In [None]:
# Write the submission file. index=False keeps the DataFrame's row index out
# of the CSV; otherwise it is emitted as an extra unnamed first column.
df_sub.to_csv('submission.csv', index=False)

## Spearman correlation tests

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Experiment: how does the Spearman correlation between each target column and
# a copy of itself shifted by a constant (0.00 .. 0.49) behave?
shift_steps = list(range(0, 50))
spears = []

for e in shift_steps:
    offset = e / 100
    spears_e = []
    for col in range(y.shape[1]):
        target = y.iloc[:, col].tolist()
        # Distinct names for the column index and the element, so the
        # comprehension no longer shadows the outer loop variable.
        shifted = [value - offset for value in target]
        spears_e.append(spearmanr(shifted, target).correlation)
    spears.append(spears_e)

# Average over the target columns once and reuse it for both artists.
mean_corr = [np.mean(per_shift) for per_shift in spears]

plt.scatter(shift_steps, mean_corr)
plt.plot(shift_steps, mean_corr)
plt.xlabel("shift step e (offset = e / 100)")
plt.ylabel("mean Spearman correlation")
plt.show()