In [217]:
# using BEA dataset at https://github.com/sheffieldnlp/cwisharedtask2018-teaching
import pandas as pd
import numpy as np
import nltk
import spacy

from joblib import dump, load

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from scipy.stats import pearsonr, spearmanr

from nltk.stem import WordNetLemmatizer
from nltk import wordnet


In [218]:
ES_AVG_WORD_LEN = 6.2

In [219]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/eriwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Setup Features

In [220]:
PATH = '../../cwisharedtask2018-teaching/datasets/spanish/Spanish'
names = ['HIT ID', 'sent', 'start_offset', 'end_offset', 'target', 'native', 'non_native', 'native_diff', 'non_native_diff', 'bin', 'prob']
TO_KEEP = ['sent', 'target', 'bin', 'prob']

# see https://sites.google.com/view/cwisharedtask2018/datasets
dev_df = pd.read_csv(f'{PATH}_Dev.tsv', sep='\t', names=names)  # to be honest I dunno what this is for
train_df = pd.read_csv(f'{PATH}_Train.tsv', sep='\t', names=names)
test_df = pd.read_csv(f'{PATH}_Test.tsv', sep='\t', names=names)

dev_df = dev_df[TO_KEEP]
train_df = train_df[TO_KEEP]
test_df = test_df[TO_KEEP]

dev_df[:3]

Unnamed: 0,sent,target,bin,prob
0,Los Bronces de Riace conocidos también como Lo...,Los Bronces de Riace,1,0.1
1,Los Bronces de Riace conocidos también como Lo...,Bronces,1,0.2
2,Los Bronces de Riace conocidos también como Lo...,Riace,1,0.3


In [221]:
wnl = WordNetLemmatizer()

fdata = pd.read_csv('../features/span_40k_lemmas.txt', sep='\t')
fdata = dict(zip(fdata['lemma'],fdata['freq']))

In [222]:
def return_freq(lemma):
    try:
        return fdata[lemma]
    except:
        return 0

def get_wordnet_tag(word):
    treebank_tag = nltk.pos_tag([word])  # list weirdness to fix function input
    treebank_tag = treebank_tag[0][1]
    
    return treebank_tag

def add_statistical_features(df):
    df_out = df

    df_out['len_chars'] = df['target'].apply(len)
    df_out['len_chars'] = df['len_chars'] / ES_AVG_WORD_LEN

    df_out['len_tokens'] = df['target'].apply(str.split, sep=" ")
    df_out['len_tokens'] = df['len_tokens'].apply(len)

    df_out['lemma'] = df['target'].apply(wnl.lemmatize)
    df_out['pos_tag'] = df['target'].apply(get_wordnet_tag)
    df_out['lemma_freq'] = df['lemma'].apply(return_freq)    
    
    return df_out


dev_df = add_statistical_features(dev_df)
train_df = add_statistical_features(dev_df)
test_df = add_statistical_features(dev_df)
"""
dev_df.drop(dev_df[dev_df.len_tokens>1].index, inplace=True)
train_df.drop(train_df[train_df.len_tokens>1].index, inplace=True)
test_df.drop(test_df[test_df.len_tokens>1].index, inplace=True)
"""

dev_df

Unnamed: 0,sent,target,bin,prob,len_chars,len_tokens,lemma,pos_tag,lemma_freq
0,Los Bronces de Riace conocidos también como Lo...,Los Bronces de Riace,1,0.1,3.225806,4,Los Bronces de Riace,NN,0
1,Los Bronces de Riace conocidos también como Lo...,Bronces,1,0.2,1.129032,1,Bronces,NNS,0
2,Los Bronces de Riace conocidos también como Lo...,Riace,1,0.3,0.806452,1,Riace,NN,0
3,Los Bronces de Riace conocidos también como Lo...,griegas,1,0.1,1.129032,1,griegas,NN,0
4,Los Bronces de Riace conocidos también como Lo...,conocidos,0,0.0,1.451613,1,conocidos,NN,0
...,...,...,...,...,...,...,...,...,...
1617,"Esto no gusto en la cúpula del ejército, que e...",exigía,1,0.1,0.967742,1,exigía,NN,0
1618,"Esto no gusto en la cúpula del ejército, que e...",ejército,0,0.0,1.290323,1,ejército,NN,0
1619,"Esto no gusto en la cúpula del ejército, que e...",duras,0,0.0,0.806452,1,dura,NNS,0
1620,"Esto no gusto en la cúpula del ejército, que e...",condenas,0,0.0,1.290323,1,condenas,NNS,0


In [223]:
INDEP = ['len_chars', 'lemma_freq', 'len_tokens']
DEP = 'bin'

X_train = train_df.loc[:, INDEP] 
y_train = train_df.loc[:, DEP]
X_test = test_df.loc[:, INDEP]
y_test = test_df.loc[:, DEP]
print(y_test)

0       1
1       1
2       1
3       1
4       0
       ..
1617    1
1618    0
1619    0
1620    0
1621    0
Name: bin, Length: 1622, dtype: int64


In [224]:
model = RandomForestRegressor(n_estimators=120, max_depth=70)
model.fit(X_train, y_train)

In [225]:
def metrics(y_true, y_pred):
    ROUND = 5
    print(f"Pearson R: {pearsonr(y_true, y_pred).statistic.round(ROUND)}")
    print(f"Spearman R: {spearmanr(y_true, y_pred).statistic.round(ROUND)}")
    print(f"MAE: {mean_absolute_error(y_true, y_pred).round(ROUND)}")
    print(f"MSE: {mean_squared_error(y_true, y_pred).round(ROUND)}")

In [226]:
y_pred = model.predict(X_test)
y_true = np.array(y_test)
metrics(y_true, y_pred)

result = test_df
result['y_true'] = y_true
result['y_pred'] = y_pred
result

Pearson R: 0.59192
Spearman R: 0.53787
MAE: 0.31495
MSE: 0.15626


Unnamed: 0,sent,target,bin,prob,len_chars,len_tokens,lemma,pos_tag,lemma_freq,y_true,y_pred
0,Los Bronces de Riace conocidos también como Lo...,Los Bronces de Riace,1,0.1,3.225806,4,Los Bronces de Riace,NN,0,1,1.000000
1,Los Bronces de Riace conocidos también como Lo...,Bronces,1,0.2,1.129032,1,Bronces,NNS,0,1,0.283569
2,Los Bronces de Riace conocidos también como Lo...,Riace,1,0.3,0.806452,1,Riace,NN,0,1,0.201702
3,Los Bronces de Riace conocidos también como Lo...,griegas,1,0.1,1.129032,1,griegas,NN,0,1,0.283569
4,Los Bronces de Riace conocidos también como Lo...,conocidos,0,0.0,1.451613,1,conocidos,NN,0,0,0.353511
...,...,...,...,...,...,...,...,...,...,...,...
1617,"Esto no gusto en la cúpula del ejército, que e...",exigía,1,0.1,0.967742,1,exigía,NN,0,1,0.206251
1618,"Esto no gusto en la cúpula del ejército, que e...",ejército,0,0.0,1.290323,1,ejército,NN,0,0,0.319885
1619,"Esto no gusto en la cúpula del ejército, que e...",duras,0,0.0,0.806452,1,dura,NNS,0,0,0.201702
1620,"Esto no gusto en la cúpula del ejército, que e...",condenas,0,0.0,1.290323,1,condenas,NNS,0,0,0.319885


In [227]:
dump(model, '../lcp_model.joblib') 

['../lcp_model.joblib']