# Import and Test Project

In [None]:
# Fetch the x-stance repository; the cells below use its scripts (evaluate.py, baselines) and data.
! git clone https://github.com/ZurichNLP/xstance.git

In [None]:
import os
# Work from inside the clone: the following cells use paths relative to it
# (data/, predictions/, evaluate.py). The path is relative, so this cell
# should be run only once per kernel session.
os.chdir('xstance')

In [None]:
# Extract the dataset archive into data/; later cells read data/train.jsonl and data/test.jsonl.
! unzip data/xstance-data-v1.0.zip -d data

In [None]:
# Sanity check of the evaluation script: score predictions/mbert_pred.jsonl against the test set.
# NOTE(review): this notebook never writes mbert_pred.jsonl, so presumably it ships with the repository.
! python evaluate.py \
  --gold data/test.jsonl \
  --pred predictions/mbert_pred.jsonl 

new_comments_defr
DE 76.83541377429334
FR 76.61281705054353

new_questions_defr
DE 68.46881591336131
FR 68.3831150794995

new_topics_defr
DE 68.90323152487849
FR 70.8982523359103

new_comments_it
IT 70.19234360410832



# Baseline models (Ridge and SVM)

## Import modules and preprocessing

In [None]:
import numpy as np, pandas as pd
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Parse the JSON Lines files (one record per line) into pandas DataFrames.
# Paths are relative to the repository root, which is the working directory at this point.
training = pd.read_json("data/train.jsonl", lines=True)
testing = pd.read_json("data/test.jsonl", lines=True)

In [None]:
# create FeatureHasher object; it is the single hasher used by both X_train_clean and X_test_clean.
# With these default settings the output has 1048576 columns (see the X_train shape printed further down).
v = FeatureHasher()


# clean data for X_train
def X_train_clean(df):
    """Turn the training frame into a hashed feature matrix.

    The identifier, author, topic and target columns are removed; every
    remaining column of a row becomes one entry of that row's dict, and the
    module-level FeatureHasher ``v`` encodes the list of dicts. The frame
    passed in is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``excluded``; a missing column
        makes ``drop`` raise ``KeyError``.

    Returns
    -------
    The matrix produced by ``v.fit_transform``, one row per row of ``df``.
    """
    excluded = ["id", "question_id", "label", "numerical_label", "author", "topic"]
    features = df.drop(columns=excluded)
    rows = features.to_dict(orient='records')
    return v.fit_transform(rows)


# clean data for X_test
def X_test_clean(df):
    """Turn the test frame into a hashed feature matrix.

    Removes the identifier, author, topic and target columns plus the
    test-only ``test_set`` column, converts each remaining row to a dict and
    encodes the dicts with the module-level FeatureHasher ``v``. The frame
    passed in is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``excluded``; a missing column
        makes ``drop`` raise ``KeyError``.

    Returns
    -------
    The matrix produced by ``v.transform``, one row per row of ``df``.
    """
    excluded = [
        "id", "question_id", "numerical_label", "label", "author", "topic",
        "test_set"
    ]
    rows = df.drop(columns=excluded).to_dict(orient='records')
    # Only transform here: a vectorizer must never be fitted on test data.
    # For the stateless FeatureHasher the output is the same as fit_transform,
    # but this stays correct if `v` is ever replaced by a stateful vectorizer.
    return v.transform(rows)


In [None]:
# vectorize and define train and test splits
# Targets are taken from the string `label` column, not from `numerical_label`.
y_train = training.label.values
y_test = testing.label.values
# Both matrices are produced by the same hasher object `v`.
X_train = X_train_clean(training)
X_test = X_test_clean(testing)

In [None]:
# Check training data: X_train should have exactly one row per entry of y_train
print("The shape of X_train: {}".format(X_train.shape))
print("The shape of y_train: {}".format(y_train.shape))

The shape of X_train: (45640, 1048576)
The shape of y_train: (45640,)


In [None]:
X_train

<45640x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 136920 stored elements in Compressed Sparse Row format>

In [None]:
y_train

array(['AGAINST', 'FAVOR', 'FAVOR', ..., 'AGAINST', 'AGAINST', 'AGAINST'],
      dtype=object)

## Model training and prediction

In [None]:
# define (linear) ridge classifier
ridge = RidgeClassifier(
    max_iter=1000
)  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier

# train the model
ridge.fit(X_train, y_train)

# Per-class precision/recall/F1 over the whole test frame at once, i.e. without the
# per-partition, per-language breakdown that evaluate.py prints.
classifier_res = classification_report(y_pred=ridge.predict(X_test),
                                       y_true=y_test)
print(classifier_res)


              precision    recall  f1-score   support

     AGAINST       0.58      0.21      0.31      8542
       FAVOR       0.54      0.86      0.66      9163

    accuracy                           0.54     17705
   macro avg       0.56      0.53      0.48     17705
weighted avg       0.56      0.54      0.49     17705



In [None]:
# define SVM: SGDClassifier with its default hinge loss is a linear SVM trained by SGD.
# random_state is fixed because SGD shuffles the training data; without a seed the
# scores below change from run to run.
svm = SGDClassifier(
    max_iter=1000,
    random_state=42,
)  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

# train the model (fit returns the estimator itself, not predictions, so the result is not stored)
svm.fit(X_train, y_train)

# Per-class precision/recall/F1 over the whole test frame at once.
classifier_res = classification_report(y_pred=svm.predict(X_test),
                                       y_true=y_test)
print(classifier_res)

              precision    recall  f1-score   support

     AGAINST       0.58      0.21      0.30      8542
       FAVOR       0.54      0.86      0.66      9163

    accuracy                           0.54     17705
   macro avg       0.56      0.53      0.48     17705
weighted avg       0.56      0.54      0.49     17705



In [None]:
# compute the test-set predictions of both models; they are written to disk in the next cell
ridge_pred = ridge.predict(X_test)
svm_pred = svm.predict(X_test)

In [None]:
# Write each model's predictions as JSON Lines: one {"label": ...} object per test example,
# in the same row order as `testing`.
# NOTE(review): only the label is stored, so evaluate.py presumably pairs predictions with
# gold examples by line position. Confirm before reordering or filtering rows.

# Ridge
ridge_pred = pd.DataFrame(ridge_pred, columns=['label'])
ridge_pred.to_json("predictions/ridge_pred.jsonl",
                   orient='records',
                   lines=True)

# SVM
svm_pred = pd.DataFrame(svm_pred, columns=['label'])
svm_pred.to_json("predictions/svm_pred.jsonl", orient='records', lines=True)

## Evaluation of Ridge and SVM

In [None]:
# evaluate Ridge using allennlp
os.chdir('xstance')
! python evaluate.py \
  --gold data/test.jsonl \
  --pred predictions/ridge_pred.jsonl 

new_comments_defr
DE 61.21553079380817
FR 67.00769932649966

new_questions_defr
DE 37.48958844018237
FR 40.357913286543315

new_topics_defr
DE 34.70408897808929
FR 47.658248306490144

new_comments_it
IT 36.30933912503878



In [None]:
# evaluate SVM with the repository's evaluate.py script (same gold file as for Ridge)
! python evaluate.py \
  --gold data/test.jsonl \
  --pred predictions/svm_pred.jsonl 

new_comments_defr
DE 61.49409370657138
FR 67.05354612414558

new_questions_defr
DE 37.48958844018237
FR 40.13281278176507

new_topics_defr
DE 34.70408897808929
FR 45.33830134445258

new_comments_it
IT 36.30933912503878



# fastText

In [None]:
# Install the fastText baseline's dependencies; the path is relative to the repository root.
! pip install -r fasttext_baseline/requirements.txt

In [None]:
import os
os.chdir('xstance/fasttext_baseline')
! wget http://www.statmt.org/europarl/v7/tools.tgz
! tar -xvf tools.tgz

## Training and Predicting

In [None]:
# Run the fastText baseline on ../data and write its predictions to predictions/mypred.jsonl.
# The M-BERT section further down writes to the same file and will overwrite it.
! python run.py --data-dir ../data --pred ../predictions/mypred.jsonl

## Evaluating

In [None]:
# Step back up from fasttext_baseline to the repository root, where evaluate.py and data/ live.
os.chdir('..')
! python evaluate.py \
  --gold data/test.jsonl \
  --pred predictions/mypred.jsonl 

new_comments_defr
DE 69.36650351625285
FR 71.44808401937091

new_questions_defr
DE 62.072051496361944
FR 62.69540264483374

new_topics_defr
DE 62.83243113804262
FR 63.36653804761529

new_comments_it
IT 47.57809069811202



# M-BERT

In [None]:
# Install the M-BERT baseline's dependencies; the path is relative to the repository root.
! pip install -r mbert_baseline/requirements.txt

In [None]:
# Install the pinned allennlp release; it provides the `allennlp` command used for training and prediction below.
! pip install allennlp==0.9.0

## Training

In [None]:
# os.chdir('mbert_baseline')
! allennlp train mbert.jsonnet \
    --include-package allennlp_xstance \
    -s mymodel

## Predicting

In [None]:
# Predict the test set with the model stored in mymodel/ and write the result to
# predictions/mypred.jsonl (the same file the fastText section wrote to).
# --cuda-device 0 selects the first GPU, so this cell needs a GPU runtime.
! allennlp predict mymodel ../data/test.jsonl \
    --include-package allennlp_xstance \
    --predictor xstance_predictor \
    --cuda-device 0 \
    --output-file ../predictions/mypred.jsonl

## Evaluating

In [None]:
# Step up one directory so that evaluate.py and data/ resolve; this expects the working
# directory to be mbert_baseline when the cell starts.
os.chdir('..')
! python evaluate.py \
  --gold data/test.jsonl \
  --pred predictions/mypred.jsonl 

new_comments_defr
DE 76.57372034075941
FR 78.1281406729466

new_questions_defr
DE 66.7194473755726
FR 68.87641217539657

new_topics_defr
DE 67.95964871077341
FR 69.37265503399688

new_comments_it
IT 70.70115967885815

