In [None]:
import joblib
import csv
from pathlib import Path
import numpy as np

We run 5-fold cross-validation on the baseline provided by the task organizers to obtain a point of reference for evaluating our models' performance.

In [None]:
# Directory holding the datasets, given relative to the current working directory.
data_dir = Path("../data")
# Tab-separated training file, given relative to `data_dir`.
train_path = "haspeede2_dev/haspeede2_dev_taskAB.tsv"

In [None]:
# Read the training file into a list of dicts, one per data row, keyed by the
# column names taken from the header line.
# newline='' is what the csv module asks for, so that it handles line endings
# (including newlines embedded in quoted fields) itself.
# The encoding is pinned so that decoding no longer depends on the platform
# default -- assumes the file is UTF-8; TODO confirm.
with open(data_dir / train_path, 'r', encoding='utf-8', newline='') as infile:
    train_set = list(csv.DictReader(infile, delimiter='\t'))

In [4]:
# Load the baseline estimator from its joblib dump in the working directory.
# NOTE(review): joblib dumps are pickle-based, so only load files that come
# from a trusted source.
with Path('baseline_svc_taskA.joblib').open('rb') as model_file:
    baseline = joblib.load(model_file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [5]:
# Show the loaded estimator's repr so its steps can be inspected.
baseline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('unigrams',
                                                 TfidfVectorizer(max_features=5000)),
                                                ('chars',
                                                 TfidfVectorizer(analyzer='char',
                                                                 max_features=5000,
                                                                 ngram_range=(2,
                                                                              5)))])),
                ('svm', LinearSVC())])

In [6]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# Five stratified folds, shuffled with a fixed seed so the split is reproducible.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics computed on every fold. The insertion order of this dict is also the
# order of the entries in the reports that iterate over it.
macro_options = dict(average='macro', zero_division=0)
scoring = {
    'accuracy': 'accuracy',
    'precision_macro': make_scorer(precision_score, **macro_options),
    'recall_macro': make_scorer(recall_score, **macro_options),
    'f1_macro': make_scorer(f1_score, **macro_options),
    # F1 of the class labelled 1 only.
    'f1_positive': make_scorer(f1_score, pos_label=1, zero_division=0),
}

In [7]:
# Split the records into two parallel lists: raw texts and integer labels.
# NOTE(review): the key 'text ' carries a trailing space; presumably it mirrors
# the column header in the TSV file -- confirm before "fixing" it.
train_X, train_y = [], []
for record in train_set:
    train_X.append(record['text '])
    train_y.append(int(record['hs']))

In [8]:
# Cross-validate the baseline on the training data using the splitter and
# metric set defined earlier.
scores = cross_validate(
    baseline,
    train_X,
    train_y,
    scoring=scoring,
    cv=splitter,
)

# Summarise the per-fold macro-F1 values, rounded to three decimals.
macro_f1_folds = scores['test_f1_macro']
print("Mean macro-f1", np.round(np.mean(macro_f1_folds), 3))
print("Standard deviation macro-f1", np.round(np.std(macro_f1_folds), 3))

Mean macro-f1 0.755
Standard deviation macro-f1 0.007


In [9]:
# Build one "mean (std)" string per metric, in the order the metrics appear in
# `scoring`; both numbers are rounded to three decimals.
results_report = []
for metric in scoring:
    fold_values = scores['test_' + metric]
    results_report.append(f"{np.mean(fold_values).round(3)} ({np.std(fold_values).round(3)})")
results_report

['0.765 (0.005)',
 '0.757 (0.005)',
 '0.754 (0.008)',
 '0.755 (0.007)',
 '0.706 (0.012)']

In [10]:
from sklearn.dummy import DummyClassifier
# Reference point: a classifier that always predicts the most frequent class,
# cross-validated with the same splitter and metric set as the baseline.
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_scores = cross_validate(
    dummy_clf,
    train_X,
    train_y,
    scoring=scoring,
    cv=splitter,
)

# One "mean (std)" string per metric, in the order the metrics appear in `scoring`.
dummy_results_report = []
for metric in scoring:
    fold_values = dummy_scores['test_' + metric]
    dummy_results_report.append(f"{np.mean(fold_values).round(3)} ({np.std(fold_values).round(3)})")

In [None]:
# Append one comma-separated row per model to the validation results table:
# identifier, description, then one "mean (std)" entry per metric.
# NOTE(review): the file is opened in append mode, so every re-run of this
# cell adds the same two rows again.
tables_dir = Path('../results/tables')
rows = [
    "baseline_svc,Linear SVC with TF-IDF of word unigrams and char 2-5-grams," + ','.join(results_report) + '\n',
    "baseline_mfc,Dummy classifier with most frequent class strategy," + ','.join(dummy_results_report) + '\n',
]
# The rows are built before the file is opened, and writelines() receives a
# list of complete lines instead of a bare string (which it would iterate
# character by character). The encoding is pinned so the output does not
# depend on the platform default.
with open(tables_dir / 'performance_validation.csv', 'a', encoding='utf-8') as outfile:
    outfile.writelines(rows)