In [84]:
# using BEA dataset at https://github.com/sheffieldnlp/cwisharedtask2018-teaching
import pandas as pd
import numpy as np
import nltk
import spacy

from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from scipy.stats import pearsonr, spearmanr

from nltk.stem import WordNetLemmatizer
from nltk import wordnet


In [None]:
ES_AVG_WORD_LEN = 6.2

In [85]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/eriwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Setup Features

In [86]:
PATH = '../../cwisharedtask2018-teaching/datasets/spanish'
names = ['HIT ID', 'sent', 'start_offset', 'end_offset', 'target', 'native', 'non_native', 'native_diff', 'non_native_diff', 'bin', 'prob']
TO_KEEP = ['sent', 'target', 'bin', 'prob']

# see https://sites.google.com/view/cwisharedtask2018/datasets
dev_df = pd.read_csv(f'{PATH}/Spanish_Dev.tsv', sep='\t', names=names)  # to be honest I dunno what this is for
train_df = pd.read_csv(f'{PATH}/Spanish_Train.tsv', sep='\t', names=names)
test_df = pd.read_csv(f'{PATH}/Spanish_Test.tsv', sep='\t', names=names)

dev_df = dev_df[TO_KEEP]
train_df = train_df[TO_KEEP]
test_df = test_df[TO_KEEP]

dev_df[:3]

Unnamed: 0,sent,target,bin,prob
0,Los Bronces de Riace conocidos también como Lo...,Los Bronces de Riace,1,0.1
1,Los Bronces de Riace conocidos también como Lo...,Bronces,1,0.2
2,Los Bronces de Riace conocidos también como Lo...,Riace,1,0.3


In [87]:
wnl = WordNetLemmatizer()

fdata = pd.read_csv('../features/span_40k_lemmas.txt', sep='\t')
fdata = dict(zip(fdata['lemma'],fdata['freq']))

In [96]:
def return_freq(lemma):
    try:
        return fdata[lemma]
    except:
        return 0

def get_wordnet_tag(word):
    treebank_tag = nltk.pos_tag([word])  # list weirdness to fix function input
    treebank_tag = treebank_tag[0][1]
    
    return treebank_tag

def add_statistical_features(df):
    df_out = df

    df_out[''] = df['target'].apply(len)
    df_out['num_tokens'] = df['target'].apply(str.split, sep=" ")
    df_out['num_tokens'] = df['num_tokens'].apply(len)

    df_out['lemma'] = df['target'].apply(wnl.lemmatize)
    df_out['pos_tag'] = df['target'].apply(get_wordnet_tag)
    df_out['lemma_freq'] = df['lemma'].apply(return_freq)
    
    return df_out


dev_df = add_statistical_features(dev_df)
train_df = add_statistical_features(dev_df)
test_df = add_statistical_features(dev_df)
dev_df[:3]

TypeError: object of type 'method_descriptor' has no len()

In [89]:
INDEP = ['target_len', 'lemma_freq']
DEP = ['prob']

X_train = train_df.loc[:, INDEP] 
y_train = train_df.loc[:, DEP]
X_test = test_df.loc[:, INDEP]
y_test = test_df.loc[:, DEP]

In [90]:
model = RandomForestClassifier(n_estimators=120, max_depth=750)
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [91]:
def metrics(y_true, y_pred):
    ROUND = 5
    print(f"Pearson R: {pearsonr(y_true, y_pred).statistic.round(ROUND)}")
    print(f"Spearman R: {spearmanr(y_true, y_pred).statistic.round(ROUND)}")
    print(f"MAE: {mean_absolute_error(y_true, y_pred).round(ROUND)}")
    print(f"MSE: {mean_squared_error(y_true, y_pred).round(ROUND)}")

In [92]:
y_pred = model.predict(X_test)
y_true = np.array(y_test['prob'])
metrics(y_true, y_pred)

result = test_df
result['y_true'] = y_true
result['y_pred'] = y_pred
result

Pearson R: 0.42013
Spearman R: 0.49618
MAE: 0.07891
MSE: 0.01582


Unnamed: 0,sent,target,bin,prob,target_len,lemma,pos_tag,lemma_freq,y_true,y_pred
0,Los Bronces de Riace conocidos también como Lo...,Los Bronces de Riace,1,0.1,20,Los Bronces de Riace,NN,0,0.1,0.200000
1,Los Bronces de Riace conocidos también como Lo...,Bronces,1,0.2,7,Bronces,NNS,0,0.2,0.050000
2,Los Bronces de Riace conocidos también como Lo...,Riace,1,0.3,5,Riace,NN,0,0.3,0.033541
3,Los Bronces de Riace conocidos también como Lo...,griegas,1,0.1,7,griegas,NN,0,0.1,0.050000
4,Los Bronces de Riace conocidos también como Lo...,conocidos,0,0.0,9,conocidos,NN,0,0.0,0.076875
...,...,...,...,...,...,...,...,...,...,...
1617,"Esto no gusto en la cúpula del ejército, que e...",exigía,1,0.1,6,exigía,NN,0,0.1,0.041667
1618,"Esto no gusto en la cúpula del ejército, que e...",ejército,0,0.0,8,ejército,NN,0,0.0,0.072105
1619,"Esto no gusto en la cúpula del ejército, que e...",duras,0,0.0,5,dura,NNS,0,0.0,0.033541
1620,"Esto no gusto en la cúpula del ejército, que e...",condenas,0,0.0,8,condenas,NNS,0,0.0,0.072105


In [93]:
dump(model, 'lcp.joblib') 

['lcp.joblib']