# Bag-of-Words (BoW) baseline for sentence annotation

This is an experimental notebook. I built a naive bag-of-words model from the annotated sentences, then used it to evaluate predictive performance on an unseen dataset. This mirrors what the NLP team worked on, but avoids BERTje altogether. The purpose is to provide a baseline for verifying that the BERTje method performs at least as well as the most naive alternative.

In [2]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
import statsmodels
import torch
from matplotlib import pyplot as plt
from tqdm import tqdm as tqdm

# Render matplotlib figures inline in the notebook, using the high-resolution
# 'retina' figure format of the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Seaborn styling: apply the default seaborn theme first, then switch to the
# "ticks" axes style and the "paper" plotting context.
sns.set()
sns.set_style("ticks")
sns.set_context('paper')

In [None]:
# Load the training split (tab-separated) and preview the first rows.
# NOTE(review): absolute network path; consider a configurable data directory.
df_train = pd.read_csv(
    '//data2/Documents/a-proof/bow_data/covid_traindata.tsv',
    delimiter='\t',
)
df_train.head()

In [54]:
# Load the held-out test split (tab-separated) and report its (rows, columns).
# NOTE(review): absolute network path; consider a configurable data directory.
df_test = pd.read_csv(
    '//data2/Documents/a-proof/bow_data/covid_testdata.tsv',
    delimiter='\t',
)
df_test.shape

(17365, 6)

In [68]:
# Keep only the sentences that carry an annotation.
# `.copy()` turns each filtered result into an independent DataFrame, so that
# new columns can be assigned to it afterwards without pandas raising a
# SettingWithCopyWarning (boolean-mask indexing alone returns a frame that
# pandas still tracks as a possible copy of the original).
df_train = df_train[df_train.annotations.notna()].copy()
df_test = df_test[df_test.annotations.notna()].copy()

# Build the initial corpus from the training sentences only; the cell's
# displayed value is the number of training sentences.
corpus = df_train.sentence.values
len(corpus)

14718

In [73]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Dutch stop-word list shipped with NLTK.
stop_words = stopwords.words('dutch')

# Bag-of-words configuration:
#   * lower-case the text and strip accents to their ASCII form,
#   * drop Dutch stop words,
#   * min_df / max_df are floats, i.e. document-frequency proportions: terms
#     below 0.2% or above 90% of the documents are excluded from the vocabulary.
vectorizer = CountVectorizer(
    lowercase=True, stop_words=stop_words, strip_accents='ascii',
    min_df=0.002, max_df=0.9,
)

# The vocabulary is learned from the training corpus only; the test sentences
# are projected onto that same vocabulary.
X_train = vectorizer.fit_transform(corpus)
X_test = vectorizer.transform(df_test['sentence'].values)

print('Vocabulary size:', len(vectorizer.vocabulary_))
print('Train:', X_train.shape)
print('Test:', X_test.shape)

Vocabulary size: 479
Train: (14718, 479)
Test: (2983, 479)


In [59]:
# Keep a handle on the fitted vectorizer's vocabulary (a mapping from term to
# column index in the count matrices) and display it for inspection.
train_vocab = vectorizer.vocabulary_
train_vocab

{'passieve': 333,
 'lift': 259,
 'lopen': 271,
 'mogelijk': 292,
 'beweegadvies': 102,
 'starten': 389,
 'stoel': 392,
 'unit': 419,
 '30': 38,
 '25': 32,
 'min': 288,
 'doel': 159,
 'ft': 180,
 'plan': 342,
 'bed': 85,
 'week': 460,
 'respiratoire': 363,
 'insufficientie': 232,
 'covid': 134,
 'voorgeschiedenis': 445,
 'medicatie': 279,
 'beloop': 96,
 'opname': 322,
 'overname': 327,
 'aldaar': 65,
 '27': 34,
 '03': 3,
 'opgenomen': 320,
 'bekend': 94,
 'obesitas': 306,
 'hypertensie': 224,
 'hypercholesterolemie': 223,
 'depressie': 148,
 'gecompliceerd': 185,
 'hit': 217,
 'toename': 407,
 'adequaat': 54,
 'wv': 470,
 'icc': 226,
 'herstart': 211,
 '22': 29,
 '05': 5,
 'verdere': 432,
 'afname': 62,
 'consolidaties': 129,
 'icu': 227,
 'dd': 146,
 '19': 19,
 'pneumonie': 344,
 'behandeling': 91,
 'gedurende': 189,
 'dagen': 145,
 'waarna': 454,
 'langzaam': 255,
 'waarbij': 452,
 'patient': 334,
 'klinisch': 244,
 'heel': 207,
 'herstel': 212,
 'zien': 475,
 'verpleegafdeling': 434

In [118]:
from sklearn.decomposition import PCA

# Densify each count matrix exactly once and reuse the result; the training
# matrix was previously converted twice (once for fit, once for transform).
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 80%).
pca = PCA(n_components=0.80)

# Fit on the training data only, then project both splits with the same
# fitted transformation.
pca = pca.fit(X_train_dense)
_X_train = pca.transform(X_train_dense)
_X_test = pca.transform(X_test_dense)
_X_train.shape, _X_test.shape

((14718, 219), (2983, 219))

In [None]:
# Domain codes looked for in the annotation strings.
DOMAINS = ["STM", "INS", "BER", "FAC"]


def indicate_domain(s, d):
    """Return 1 if domain code `d` occurs in annotation `s`, otherwise 0.

    Parameters
    ----------
    s : object
        The annotation value. Anything that is not a `str` (e.g. NaN or None)
        yields 0.
    d : str
        The domain code to look for; must be one of `DOMAINS`.

    Returns
    -------
    int
        1 when `d` is a substring of `s`, else 0.

    Raises
    ------
    AssertionError
        If `d` is not listed in `DOMAINS`.

    Notes
    -----
    The match is a plain substring test, so a code embedded inside a longer
    token also counts as a hit.
    """
    assert d in DOMAINS
    return int(isinstance(s, str) and d in s)

# Add one binary indicator column per domain ("ind_<DOMAIN>") to both splits,
# derived from each row's annotation string.
for df in (df_train, df_test):
    for d in DOMAINS:
        # The domain code is forwarded to indicate_domain as its second argument.
        df[f"ind_{d}"] = df['annotations'].apply(indicate_domain, args=(d,))

# Show a few random rows to eyeball the new columns.
df_train.sample(5)

In [122]:
from collections import defaultdict

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.svm import SVC

# Column-oriented accumulator: each key becomes a column of the results table,
# with one entry appended per (target, model, feature set) combination.
results = defaultdict(list)

# One independent binary classification problem per domain indicator.
for target in ['ind_STM', 'ind_INS', 'ind_FAC', 'ind_BER']:
    print(target)
    y_train = df_train[target].values
    y_test = df_test[target].values
    for model in [LogisticRegression, SVC]:
        for pca_flag in [True, False]:
            # pca_flag selects the PCA-reduced features over the raw
            # bag-of-words count matrices.
            traindata = _X_train if pca_flag else X_train
            testdata = _X_test if pca_flag else X_test

            # Both estimators are used with their default hyper-parameters.
            mod = model().fit(traindata, y_train)
            preds = mod.predict(testdata)

            # NOTE(review): the accuracies reported here are very high while
            # recall is low, so accuracy alone is not an informative metric.
            acc = accuracy_score(y_test, preds)

            # zero_division=0 makes the "no positive predictions" case explicit:
            # precision/F1 are reported as 0.0 (the same value as before)
            # without emitting an UndefinedMetricWarning for every such fit.
            # The fourth return value (support) is not needed and is dropped.
            prec, rec, f1 = prfs(
                y_test, preds, average='binary', pos_label=1, zero_division=0
            )[:3]

            results['model'].append(type(mod).__name__)
            results['pca'].append(pca_flag)
            results['class'].append(target)
            results['accuracy'].append(acc)
            results['precision'].append(prec)
            results['recall'].append(rec)
            results['f1_score'].append(f1)

df_results = pd.DataFrame(results)
df_results

ind_STM
ind_INS
ind_FAC
ind_BER


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,pca,class,accuracy,precision,recall,f1_score
0,LogisticRegression,True,ind_STM,0.97184,0.633333,0.206522,0.311475
1,LogisticRegression,False,ind_STM,0.97117,0.578947,0.23913,0.338462
2,SVC,True,ind_STM,0.97117,0.607143,0.184783,0.283333
3,SVC,False,ind_STM,0.970164,0.555556,0.163043,0.252101
4,LogisticRegression,True,ind_INS,0.987261,0.666667,0.232558,0.344828
5,LogisticRegression,False,ind_INS,0.987261,0.631579,0.27907,0.387097
6,SVC,True,ind_INS,0.987596,0.875,0.162791,0.27451
7,SVC,False,ind_INS,0.987596,0.875,0.162791,0.27451
8,LogisticRegression,True,ind_FAC,0.984579,0.833333,0.185185,0.30303
9,LogisticRegression,False,ind_FAC,0.984244,0.684211,0.240741,0.356164


In [124]:
# Present the scores indexed by (class, model, pca) and rounded to 4 decimals.
(
    df_results
    .set_index(['class', 'model', 'pca'])
    .round(4)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,precision,recall,f1_score
class,model,pca,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ind_STM,LogisticRegression,True,0.9718,0.6333,0.2065,0.3115
ind_STM,LogisticRegression,False,0.9712,0.5789,0.2391,0.3385
ind_STM,SVC,True,0.9712,0.6071,0.1848,0.2833
ind_STM,SVC,False,0.9702,0.5556,0.163,0.2521
ind_INS,LogisticRegression,True,0.9873,0.6667,0.2326,0.3448
ind_INS,LogisticRegression,False,0.9873,0.6316,0.2791,0.3871
ind_INS,SVC,True,0.9876,0.875,0.1628,0.2745
ind_INS,SVC,False,0.9876,0.875,0.1628,0.2745
ind_FAC,LogisticRegression,True,0.9846,0.8333,0.1852,0.303
ind_FAC,LogisticRegression,False,0.9842,0.6842,0.2407,0.3562
