A simple baseline for the shared task on Ideology and Power Detection.

See the [shared task web page](https://touche.webis.de/clef24/touche24-web/ideology-and-power-identification-in-parliamentary-debates.html)
for details.


In [1]:
import argparse
import os
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from data import get_data

In [11]:
task = "power"
pcode = "ba"
data_dir = "data"
teamname = "baseline"

In [15]:
%%time
# Train a simple logistic regression classifier
# with character n-gram features, print the evaluation
# scores on the validation set.
train_file = os.path.join(data_dir, task,
                          f"{task}-{pcode}-train.tsv")
if not os.path.exists(train_file):
    print(f"{pcode}: skipping, no training data.")
else:
    t_trn, y_trn, t_val, y_val = get_data(train_file)
    vec = TfidfVectorizer(sublinear_tf=True, analyzer="char",
                      ngram_range=(1,3))
    x_trn = vec.fit_transform(t_trn)
    x_val = vec.transform(t_val)
    m = LogisticRegression()
    m.fit(x_trn, y_trn)
    pred = m.predict(x_val)
    p, r, f, _ = precision_recall_fscore_support(
        y_val, pred, average='macro')
    print(f"{pcode}: {100*p:.4f} / {100*r:.4f} / {100*f:.4f}")

ba: 41.6988 / 50.0000 / 45.4737
CPU times: user 8.19 s, sys: 52.2 ms, total: 8.24 s
Wall time: 8.24 s


  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# if test file exists, create the submission file with predictions.
test_file = os.path.join(data_dir, task,
                          f"{task}-{pcode}-test.tsv")
if os.path.exists(test_file):
    id_test, t_test = get_data(test_file, testset=True)
    x_test = vec.transform(t_test)
    test_pred = m.predict_proba(x_test)
    pred_file = f"{teamname}-{task}-{pcode}-predictions.tsv"
    with open(pred_file, "wt") as fpred:
        for i, p in enumerate(test_pred):
            print(f"{id_test[i]}\t{p[1]}", file=fpred)
