In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from scipy import sparse
from scipy import stats
# from sklearn import linear_model
from sklearn import tree
# from xgboost import XGBClassifier
from sklearn import metrics

In [2]:
def create_corpus(df):
    """Flatten every column of ``df`` into a single list of values.

    Columns are visited in their DataFrame order; all values of the first
    column come first, followed by all values of the second, and so on.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold the documents to collect.

    Returns
    -------
    list
        The column values concatenated end to end (empty if ``df`` has no
        columns).
    """
    documents = []
    for column_name in df.columns.tolist():
        # Append this column's values after those gathered so far.
        documents.extend(df[column_name].values.tolist())
    return documents

In [3]:
def create_vocabulary(corpus):
    """Fit a bag-of-words vectorizer on ``corpus`` and return it.

    Tokenisation is delegated to NLTK's ``word_tokenize``; ``token_pattern``
    is set to ``None`` because a custom tokenizer is supplied instead of
    the vectorizer's regex-based one.

    Parameters
    ----------
    corpus : iterable of str
        Documents from which the vocabulary is learned.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer, ready for ``transform`` calls.
    """
    # ``fit`` returns the vectorizer itself, so construction and fitting chain.
    return CountVectorizer(tokenizer=word_tokenize, token_pattern=None).fit(corpus)

In [4]:
# Read in the pre-processed training file.
fn = '../input/preprocessed-data/train_preprocessed.csv'
df = pd.read_csv(fn)

# Cross-validation settings, named once so the loop and the results array
# cannot drift apart.
N_FOLDS = 5   # the loop below selects rows by df.kfold == 0 .. N_FOLDS - 1
SEED = 42     # fixed seed so the fitted trees (and the scores) are reproducible

# Fit the vocabulary on both text columns of the full frame.
# NOTE: this includes the rows later used for validation; only the token set
# is learned here, no labels are involved.
corpus = create_corpus(df[['anchor_w_context', 'target_w_context']])
ctv = create_vocabulary(corpus)

# Array holding one row of results per fold.
np_headings = ['fold', 'accuracy', 'pearsons']
np_scores = np.zeros([N_FOLDS, len(np_headings)])

for fold in range(N_FOLDS):
    df_train = df[df.kfold != fold].reset_index(drop=True)  # train on every fold except the current one
    df_valid = df[df.kfold == fold].reset_index(drop=True)  # validate on the current fold

    # Transform each sentence into a token-count vector and place the anchor
    # and target vectors side by side as one feature row.
    tr_anchor = ctv.transform(df_train['anchor_w_context'])
    tr_target = ctv.transform(df_train['target_w_context'])
    x_train = sparse.hstack([tr_anchor, tr_target])

    va_anchor = ctv.transform(df_valid['anchor_w_context'])
    va_target = ctv.transform(df_valid['target_w_context'])
    x_valid = sparse.hstack([va_anchor, va_target])

    # Get the labels.
    # NOTE(review): a classifier needs discrete class labels and pearsonr needs
    # numeric values -- presumably 'score' was encoded accordingly during
    # preprocessing; confirm against the preprocessing notebook.
    y_train = df_train['score'].values.tolist()
    y_valid = df_valid['score'].values.tolist()

    # Without random_state the tree's tie-breaking between equally good splits
    # can differ from run to run, making the reported scores non-reproducible.
    model = tree.DecisionTreeClassifier(random_state=SEED)

    model.fit(x_train, y_train)

    preds = model.predict(x_valid)
    accuracy = metrics.accuracy_score(y_valid, preds)
    pearsons_c, p_value = stats.pearsonr(y_valid, preds)

    print(f"Fold: {fold}")
    print(f"Accuracy = {round(accuracy, 4)}")
    print(f"Pearsons Coefficient = {round(pearsons_c, 4)}")
    print(" ")

    # Store this fold's results.
    np_scores[fold, 0] = fold
    np_scores[fold, 1] = accuracy
    np_scores[fold, 2] = pearsons_c

print("Evaluation complete")
# Column-wise mean over folds. The first entry is the mean of the fold
# indices (not a metric); the second and third are mean accuracy and mean
# Pearson coefficient.
average = np.mean(np_scores, axis=0)
scores_df = pd.DataFrame(np_scores, columns=np_headings)
print(scores_df)
print(average)
