In [49]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import sklearn
import logging

# The INFO lines from simpletransformers (feature conversion, eval results)
# are wanted in the cell output; the `transformers` logger is held at
# WARNING so it does not drown them out.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

#%env CUDA_DEVICE_ORDER=PCI_BUS_ID
#%env CUDA_VISIBLE_DEVICES=2



In [69]:
def get_train_args(model_name):
    """Build the argument dict passed as ``args=`` to ``ClassificationModel``.

    Every run shares the same hyper-parameters; only the output locations
    depend on ``model_name``.

    Args:
        model_name: Run identifier (a string). It becomes the sub-directory
            of ``outputs_exp2/`` that the run writes to.

    Returns:
        A new dict on every call (including fresh ``wandb_kwargs`` and
        ``config`` dicts), so callers may modify it without affecting
        other runs.
    """
    run_dir = "outputs_exp2/" + model_name

    locations = {
        "output_dir": run_dir,
        "cache_dir": "cache/",
        "best_model_dir": run_dir + "/best_model/",
    }

    optimisation = {
        "fp16": False,
        "fp16_opt_level": "O1",
        "max_seq_length": 256,
        "train_batch_size": 8,
        "eval_batch_size": 8,
        "gradient_accumulation_steps": 1,
        "num_train_epochs": 5,
        "weight_decay": 0,
        "learning_rate": 5e-6,
        "adam_epsilon": 1e-9,
        "warmup_ratio": 0.06,
        "warmup_steps": 0,
        "max_grad_norm": 1.0,
        "do_lower_case": False,
    }

    evaluation_and_checkpoints = {
        "logging_steps": 50,
        "evaluate_during_training": True,
        "evaluate_during_training_steps": 0,
        "evaluate_during_training_verbose": True,
        "use_cached_eval_features": False,
        "save_eval_checkpoints": False,
        "save_steps": 0,
        "no_cache": True,
        "save_model_every_epoch": True,
        "tensorboard_dir": None,
    }

    # Every run starts from scratch: existing outputs are overwritten and
    # input features are rebuilt rather than read from a cache.
    rerun_policy = {
        "overwrite_output_dir": True,
        "reprocess_input_data": True,
    }

    runtime = {
        "n_gpu": 1,
        "silent": False,
        "use_multiprocessing": True,
    }

    tracking = {
        "wandb_project": None,
        "wandb_kwargs": {},
    }

    # Early stopping itself is switched off; the metric settings are still
    # part of the configuration handed to the library.
    early_stopping = {
        "use_early_stopping": False,
        "early_stopping_patience": 4,
        "early_stopping_delta": 0,
        "early_stopping_metric": "f1",
        "early_stopping_metric_minimize": False,
    }

    misc = {
        "manual_seed": 9721,
        "encoding": None,
        "config": {},
    }

    return {
        **locations,
        **optimisation,
        **evaluation_and_checkpoints,
        **rerun_policy,
        **runtime,
        **tracking,
        **early_stopping,
        **misc,
    }

In [2]:
import pickle
# Each pickle file holds several DataFrames dumped back to back, so they are
# read with consecutive `pickle.load` calls in the order they were written.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("data/distinct_sets/within-v2.pkl", "rb") as f:
    (
        distinct_train_df,
        distinct_test_df,
        overlap_train_df,
        overlap_test_df,
        rnd_train_df,
        rnd_test_df,
    ) = (pickle.load(f) for _ in range(6))

with open("acl2020_data_split.pkl", "rb") as f:
    (
        cross_train,
        cross_dev,
        cross_test,
        within_train,
        within_dev,
        within_test,
    ) = (pickle.load(f) for _ in range(6))

In [149]:
# dataset statistics
def _unique_arguments(df):
    """Return the set of distinct texts found in `argument1` or `argument2` of `df`."""
    return set(df.argument1.tolist() + df.argument2.tolist())


# exp 1
wd = pd.concat([within_train, within_dev])
print(len(wd), len(within_test))
# Pair counts per topic (train+dev vs. test) ...
for topic in ("abortion", "gay marriage"):
    print(len(wd[wd.tag == topic]), len(within_test[within_test.tag == topic]))
# ... followed by distinct-argument counts per topic.
for topic in ("abortion", "gay marriage"):
    print(
        len(_unique_arguments(wd[wd.tag == topic])),
        len(_unique_arguments(within_test[within_test.tag == topic])),
    )
print()
print(len(cross_train) + len(cross_dev), len(cross_test))
# NOTE(review): the pair count above includes cross_dev, but the distinct-argument
# count below looks at cross_train only -- confirm this is intended.
print(len(_unique_arguments(cross_train)), len(_unique_arguments(cross_test)))
print()

# exp 2
exp2_splits = (
    (rnd_train_df, rnd_test_df),
    (overlap_train_df, overlap_test_df),
    (distinct_train_df, distinct_test_df),
)
for train_split, test_split in exp2_splits:
    print(len(train_split), len(test_split))
print()
for train_split, test_split in exp2_splits:
    print(len(_unique_arguments(train_split)), len(_unique_arguments(test_split)))

# Share of "gay marriage" test arguments that also occur in train+dev.
s1_test = _unique_arguments(within_test[within_test.tag == "gay marriage"])
s1_overlap = _unique_arguments(wd[wd.tag == "gay marriage"]) & s1_test
print("overlap", len(s1_overlap) / len(s1_test))

57512 6391
36746 4094
20766 2297
9100 3248
4359 1583

61048 18724
9328 4297

57977 4381
57977 4381
57980 4378

13453 3649
13574 8760
10080 3494
overlap 0.9797852179406191


In [70]:
def precision_macro(y_true, y_pred):
    """Return the macro-averaged precision of `y_pred` against `y_true`."""
    return sklearn.metrics.precision_score(y_true=y_true, y_pred=y_pred, average='macro')
def recall_macro(y_true, y_pred):
    """Return the macro-averaged recall of `y_pred` against `y_true`."""
    return sklearn.metrics.recall_score(y_true=y_true, y_pred=y_pred, average='macro')
def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`."""
    return sklearn.metrics.f1_score(y_true=y_true, y_pred=y_pred, average='macro')

In [71]:
def reformat_df(data_df):
    """Convert a same-side DataFrame into the sentence-pair layout used for training.

    Keeps only `argument1`, `argument2` and `is_same_side`, renames them to
    `text_a`, `text_b` and `labels`, and casts the label to int.

    Args:
        data_df: DataFrame with at least the three columns named above.

    Returns:
        A new DataFrame with the same index; `data_df` is not modified.
    """
    renames = {"argument1": "text_a", "argument2": "text_b", "is_same_side": "labels"}
    pairs = data_df[list(renames)].rename(columns=renames)
    return pairs.assign(labels=pairs["labels"].astype(int))

In [72]:
# Experiment 1 uses the within-topic train/dev/test splits as loaded.
train_df, dev_df, test_df = (
    reformat_df(split) for split in (within_train, within_dev, within_test)
)

In [None]:
# (model type, pretrained checkpoint) pairs to fine-tune in the training loop.
# Commented-out entries are skipped in this run.
# NOTE(review): the XLNet checkpoint is published as 'xlnet-base-cased';
# 'xlnet-base-case' below looks like a typo -- fix it before re-enabling.
models_to_test = [
    #('bert', 'bert-base-cased'),
    #('roberta', 'roberta-base'),
    #('xlnet', 'xlnet-base-case'),
    ('distilbert', 'distilbert-base-cased'),
    ('albert', 'albert-base-v2'),
    ('longformer', 'allenai/longformer-base-4096')
]

In [None]:
for model_to_test in models_to_test:

    print(model_to_test)
    model_type, pretrained_name = model_to_test

    # Each model type gets its own argument dict and hence its own output directory.
    train_args = get_train_args(model_type)
    model_kwargs = dict(num_labels=2, use_cuda=True, cuda_device=0, args=train_args)
    macro_metrics = dict(precision=precision_macro, recall=recall_macro, f1=f1_macro)

    # Fine-tune from the pretrained checkpoint, evaluating on the dev split.
    model = ClassificationModel(model_type, pretrained_name, **model_kwargs)
    model.train_model(train_df, eval_df=dev_df, **macro_metrics)

    # Reload whatever was saved to `best_model_dir` and score it on the test split.
    model = ClassificationModel(model_type, train_args['best_model_dir'], **model_kwargs)
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, acc=sklearn.metrics.accuracy_score, **macro_metrics
    )
    print("***************************+")
    print(result)

In [None]:
# predict_df = [[row['text_a'], row['text_b']] for index, row in test_df[1:10].iterrows()]
# predictions, raw_outputs = model.predict(predict_df)
# print(predictions)
# print(raw_outputs)

## Evaluation

In [7]:
# Models whose saved best checkpoint is scored on the test split.
models_to_test = [
    ('bert', 'bert-base-cased'),
    ('roberta', 'roberta-base'),
    #('xlnet', 'xlnet-base-case'),
    ('distilbert', 'distilbert-base-cased'),
    ('albert', 'albert-base-v2'),
    #('longformer', 'allenai/longformer-base-4096')
]
eval_metrics = dict(
    acc=sklearn.metrics.accuracy_score,
    precision=precision_macro,
    recall=recall_macro,
    f1=f1_macro,
)
for model_to_test in models_to_test:
    print("***************************")
    print(model_to_test)
    model_type = model_to_test[0]
    train_args = get_train_args(model_type)
    # No training here: the model is loaded from the run's `best_model_dir`.
    model = ClassificationModel(
        model_type,
        train_args['best_model_dir'],
        num_labels=2,
        use_cuda=True,
        cuda_device=0,
        args=train_args,
    )
    result, model_outputs, wrong_predictions = model.eval_model(test_df, **eval_metrics)
    print(result)

***************************
('bert', 'bert-base-cased')


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=6391.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=799.0, style=ProgressStyle(descr…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7673335255474553, 'tp': 2991, 'tn': 2658, 'fp': 340, 'fn': 402, 'acc': 0.8838992332968236, 'precision': 0.8832780004826908, 'recall': 0.8840559193898201, 'f1': 0.8835831754471795, 'eval_loss': 0.4852882198835444}



{'mcc': 0.7673335255474553, 'tp': 2991, 'tn': 2658, 'fp': 340, 'fn': 402, 'acc': 0.8838992332968236, 'precision': 0.8832780004826908, 'recall': 0.8840559193898201, 'f1': 0.8835831754471795, 'eval_loss': 0.4852882198835444}
***************************
('roberta', 'roberta-base')


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=6391.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=799.0, style=ProgressStyle(descr…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7295700774033835, 'tp': 2826, 'tn': 2692, 'fp': 306, 'fn': 567, 'acc': 0.8634016585823815, 'precision': 0.8641595510928182, 'recall': 0.8654116006603871, 'f1': 0.8633415815665764, 'eval_loss': 0.37576449096280035}



{'mcc': 0.7295700774033835, 'tp': 2826, 'tn': 2692, 'fp': 306, 'fn': 567, 'acc': 0.8634016585823815, 'precision': 0.8641595510928182, 'recall': 0.8654116006603871, 'f1': 0.8633415815665764, 'eval_loss': 0.37576449096280035}
***************************
('distilbert', 'distilbert-base-cased')


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=6391.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=799.0, style=ProgressStyle(descr…




INFO:simpletransformers.classification.classification_model:{'mcc': 0.6765254079252396, 'tp': 2332, 'tn': 2909, 'fp': 89, 'fn': 1061, 'acc': 0.8200594586136755, 'precision': 0.8479919616038087, 'recall': 0.8288054596570619, 'f1': 0.8185806993452204, 'eval_loss': 0.31102553187997417}


{'mcc': 0.6765254079252396, 'tp': 2332, 'tn': 2909, 'fp': 89, 'fn': 1061, 'acc': 0.8200594586136755, 'precision': 0.8479919616038087, 'recall': 0.8288054596570619, 'f1': 0.8185806993452204, 'eval_loss': 0.31102553187997417}
***************************
('albert', 'albert-base-v2')


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=6391.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=799.0, style=ProgressStyle(descr…




INFO:simpletransformers.classification.classification_model:{'mcc': 0.7613398728160554, 'tp': 2954, 'tn': 2674, 'fp': 324, 'fn': 439, 'acc': 0.8806133625410734, 'precision': 0.8800688603970792, 'recall': 0.8812719630161143, 'f1': 0.8803837642989102, 'eval_loss': 0.5124524853964705}


{'mcc': 0.7613398728160554, 'tp': 2954, 'tn': 2674, 'fp': 324, 'fn': 439, 'acc': 0.8806133625410734, 'precision': 0.8800688603970792, 'recall': 0.8812719630161143, 'f1': 0.8803837642989102, 'eval_loss': 0.5124524853964705}


# Experiment 2: Eval on recompiled, more realistic data

In [73]:
from sklearn.model_selection import train_test_split

# (train, test) pairs for experiment 2, in the order: random, distinct, overlap.
# The position in this list becomes the run name ("exp_0", "exp_1", "exp_2").
datasets = [
    (reformat_df(train_split), reformat_df(test_split))
    for train_split, test_split in (
        (rnd_train_df, rnd_test_df),
        (distinct_train_df, distinct_test_df),
        (overlap_train_df, overlap_test_df),
    )
]

In [21]:
model_to_test = ('albert', 'albert-base-v2')

# Train and evaluate one model per (train, test) pair in `datasets`.
#
# The loop deliberately uses its own `split_*` names. Using `train_df`,
# `dev_df` and `test_df` here would overwrite the experiment-1 frames of the
# same name, and any cell that reads them afterwards (e.g. the "Evaluation"
# cell) would silently score against the last experiment-2 split instead.
for i, (split_train_df, split_test_df) in enumerate(datasets):

    # get training args -- the run is named after the position in `datasets`
    train_args = get_train_args("exp_" + str(i))

    print("***************************+")

    # Hold out 10% of the training pairs as a dev set; the fixed random_state
    # keeps the split reproducible across re-runs.
    split_train_df, split_dev_df = train_test_split(split_train_df, test_size=0.1, random_state=9721)
    print(len(split_train_df))
    print(len(split_dev_df))
    print(len(split_test_df))

    # Create a ClassificationModel
    model = ClassificationModel(model_to_test[0], model_to_test[1], num_labels=2, use_cuda=True, cuda_device=0, args=train_args)

    # Train the model
    model.train_model(split_train_df, eval_df=split_dev_df, precision=precision_macro, recall=recall_macro, f1=f1_macro)

    # Eval best model: reload from `best_model_dir` rather than using the
    # in-memory model left over from training.
    model = ClassificationModel(model_to_test[0], train_args['best_model_dir'], num_labels=2, use_cuda=True, cuda_device=0, args=train_args)
    result, model_outputs, wrong_predictions = model.eval_model(
        split_test_df, acc=sklearn.metrics.accuracy_score, precision=precision_macro, recall=recall_macro, f1=f1_macro
    )

    print(result)

***************************+
52179
5798
4381


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

HBox(children=(FloatProgress(value=0.0, max=52179.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7100312698535334, 'tp': 2719, 'tn': 2243, 'fp': 460, 'fn': 376, 'precision': 0.8558670811372984, 'recall': 0.8541662258831657, 'f1': 0.8548339352459647, 'eval_loss': 0.27323519166944354}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7399710086843452, 'tp': 2587, 'tn': 2448, 'fp': 255, 'fn': 508, 'precision': 0.8692102990232535, 'recall': 0.8707623373060627, 'f1': 0.8683272196313039, 'eval_loss': 0.29825163410767785}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7469215793018105, 'tp': 2610, 'tn': 2447, 'fp': 256, 'fn': 485, 'precision': 0.872630401689279, 'recall': 0.8742930280900119, 'f1': 0.8720962208213191, 'eval_loss': 0.36673127798102634}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7444852861108283, 'tp': 2739, 'tn': 2322, 'fp': 381, 'fn': 356, 'precision': 0.8724747946228528, 'recall': 0.8720106361805856, 'f1': 0.872226269749512, 'eval_loss': 0.5434511430191221}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7396496663002526, 'tp': 2703, 'tn': 2343, 'fp': 360, 'fn': 392, 'precision': 0.8695704644870874, 'recall': 0.870079376890513, 'f1': 0.869798147375935, 'eval_loss': 0.6797208085305816}
INFO:simpletransformers.classification.classification_model: Training of albert model complete. Saved to outputs_exp2/exp_0.





INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=4381.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=548.0, style=ProgressStyle(descr…




INFO:simpletransformers.classification.classification_model:{'mcc': 0.7627888997085139, 'tp': 2088, 'tn': 1776, 'fp': 253, 'fn': 264, 'acc': 0.8819904131476832, 'precision': 0.8812573812096389, 'recall': 0.8815315677774314, 'f1': 0.8813888402889491, 'eval_loss': 0.48301613785701275}


{'mcc': 0.7627888997085139, 'tp': 2088, 'tn': 1776, 'fp': 253, 'fn': 264, 'acc': 0.8819904131476832, 'precision': 0.8812573812096389, 'recall': 0.8815315677774314, 'f1': 0.8813888402889491, 'eval_loss': 0.48301613785701275}
***************************+
52182
5798
4378


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

HBox(children=(FloatProgress(value=0.0, max=52182.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.746293578716533, 'tp': 2516, 'tn': 2505, 'fp': 126, 'fn': 651, 'precision': 0.8730175463340077, 'recall': 0.8732760771626151, 'f1': 0.8659877894556105, 'eval_loss': 0.28043604125702304}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7586205963241343, 'tp': 2880, 'tn': 2225, 'fp': 406, 'fn': 287, 'precision': 0.8810969670596354, 'recall': 0.8775320055729596, 'f1': 0.8789309161061969, 'eval_loss': 0.308079354782942}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7836372664851883, 'tp': 2853, 'tn': 2323, 'fp': 308, 'fn': 314, 'precision': 0.8917438870611766, 'recall': 0.8918933936858593, 'f1': 0.8918176630218276, 'eval_loss': 0.32567066123269517}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7947099060512103, 'tp': 2852, 'tn': 2355, 'fp': 276, 'fn': 315, 'precision': 0.8968935888962326, 'recall': 0.8978168534621034, 'f1': 0.8973137833181175, 'eval_loss': 0.4421367796229857}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8031784988668221, 'tp': 2858, 'tn': 2373, 'fp': 258, 'fn': 309, 'precision': 0.9009945062361255, 'recall': 0.9021848747362247, 'f1': 0.9015185590831398, 'eval_loss': 0.5258311948583688}
INFO:simpletransformers.classification.classification_model: Training of albert model complete. Saved to outputs_exp2/exp_1.





INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=4378.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=548.0, style=ProgressStyle(descr…




INFO:simpletransformers.classification.classification_model:{'mcc': 0.3897261500444874, 'tp': 1101, 'tn': 1997, 'fp': 630, 'fn': 650, 'acc': 0.7076290543627227, 'precision': 0.6952437571980705, 'recall': 0.6944831350925253, 'f1': 0.6948475686323967, 'eval_loss': 1.6204442783798656}


{'mcc': 0.3897261500444874, 'tp': 1101, 'tn': 1997, 'fp': 630, 'fn': 650, 'acc': 0.7076290543627227, 'precision': 0.6952437571980705, 'recall': 0.6944831350925253, 'f1': 0.6948475686323967, 'eval_loss': 1.6204442783798656}
***************************+
52179
5798
4381


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

HBox(children=(FloatProgress(value=0.0, max=52179.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7566449546869678, 'tp': 2564, 'tn': 2503, 'fp': 160, 'fn': 571, 'precision': 0.8777556924305696, 'recall': 0.8788901126608896, 'f1': 0.8739080851345888, 'eval_loss': 0.2615384744510761}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.779632313875081, 'tp': 2677, 'tn': 2472, 'fp': 191, 'fn': 458, 'precision': 0.8885445375831227, 'recall': 0.8910919380176451, 'f1': 0.8879247425575012, 'eval_loss': 0.22578365563029615}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7742136496328526, 'tp': 2763, 'tn': 2382, 'fp': 281, 'fn': 372, 'precision': 0.8863055004451773, 'recall': 0.88790981139737, 'f1': 0.8868865206288201, 'eval_loss': 0.31709651809049255}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7722599249647313, 'tp': 2803, 'tn': 2339, 'fp': 324, 'fn': 332, 'precision': 0.886044148517693, 'recall': 0.8862157955226715, 'f1': 0.8861282550476464, 'eval_loss': 0.497128615981286}


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=6523.0, style=ProgressStyle(de…




INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.774281676843272, 'tp': 2789, 'tn': 2358, 'fp': 305, 'fn': 346, 'precision': 0.8867317637114017, 'recall': 0.8875503458403631, 'f1': 0.8870960143241717, 'eval_loss': 0.5791396513797347}
INFO:simpletransformers.classification.classification_model: Training of albert model complete. Saved to outputs_exp2/exp_2.





INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=4381.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=548.0, style=ProgressStyle(descr…




INFO:simpletransformers.classification.classification_model:{'mcc': 0.41935149500238844, 'tp': 1027, 'tn': 2105, 'fp': 341, 'fn': 908, 'acc': 0.7149052727687742, 'precision': 0.7246851120776829, 'recall': 0.6956690351383158, 'f1': 0.6965312166322621, 'eval_loss': 0.5707390998839825}


{'mcc': 0.41935149500238844, 'tp': 1027, 'tn': 2105, 'fp': 341, 'fn': 908, 'acc': 0.7149052727687742, 'precision': 0.7246851120776829, 'recall': 0.6956690351383158, 'f1': 0.6965312166322621, 'eval_loss': 0.5707390998839825}


In [54]:
import numpy as np
def gini(array):
    """Calculate the Gini coefficient of a collection of values.

    The input is flattened, shifted so that no value is negative, offset by a
    tiny epsilon so that no value is exactly zero, and sorted before the
    coefficient is computed.

    Args:
        array: numpy array (any shape, integer or float dtype) or any
            array-like that ``np.asarray`` accepts. It is never modified.

    Returns:
        The Gini coefficient as a numpy float.

    Raises:
        ValueError: if `array` contains no values.
    """
    # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif
    # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm

    # Work on a flattened float64 copy. The in-place shifts below would raise
    # a casting error on an integer array, and the copy also guarantees that
    # the caller's data is left untouched.
    values = np.asarray(array, dtype=np.float64).flatten()
    if values.size == 0:
        raise ValueError("gini() requires at least one value")

    lowest = values.min()
    if lowest < 0:
        values -= lowest  # values cannot be negative
    values += 0.0000001  # values cannot be 0
    values.sort()  # values must be sorted

    n = values.shape[0]  # number of array elements
    index = np.arange(1, n + 1)  # 1-based rank of each sorted element
    return np.sum((2 * index - n - 1) * values) / (n * np.sum(values))

# What's actually happening?

In [81]:
# Inequality of the pairs-per-debate distribution in each experiment-2 test
# split, printed in the order: random, overlap, distinct.
for split_df in (rnd_test_df, overlap_test_df, distinct_test_df):
    pairs_per_debate = split_df.debate_id.value_counts().to_list()
    print(gini(np.array(pairs_per_debate, dtype='float64')))

0.7126722835763067
0.31146269022452727
0.3458963324168394


In [161]:
# Number of training pairs per debate, largest debates first (top 20 shown).
within_train["debate_id"].value_counts().to_frame().head(20) #19258
#tmp_df = within_train[within_train["debate_id"] == "b67fc3fb-2019-04-17T11:47:41Z"]
#print(len(set(tmp_df.argument1.to_list() + tmp_df.argument2.to_list())))

Unnamed: 0,debate_id
b67fc3fb-2019-04-17T11:47:41Z,17187
d2f4b1cd-2019-04-17T11:47:27Z,9567
40f91664-2019-04-17T11:47:29Z,2796
475596d3-2019-04-17T11:47:21Z,1176
414eb72a-2019-04-19T12:45:01Z,1013
b44abf31-2019-04-19T12:47:38Z,200
ac9fa785-2019-04-15T20:24:42Z,118
b18694c3-2019-04-18T13:51:52Z,32
1ff5359b-2019-04-18T15:06:14Z,32
66496b5f-2019-04-18T11:11:37Z,31


In [155]:
# n * (n - 1) / 2 for n = 251, i.e. the number of unordered pairs among 251 items.
# NOTE(review): presumably 251 is a count of distinct arguments in one debate — confirm the source of this number.
251 * 250 / 2

31375.0

## rnd

In [75]:
# Load the weights stored under the best_model_dir of experiment 0 and
# evaluate them on the rnd test split with accuracy and macro P/R/F1.
model_to_test = ('albert', 'albert-base-v2')
i = 0
train_args = get_train_args(f"exp_{i}")
model = ClassificationModel(
    model_to_test[0],
    train_args['best_model_dir'],
    num_labels=2,
    use_cuda=True,
    cuda_device=0,
    args=train_args,
)
result, model_outputs, wrong_predictions = model.eval_model(
    reformat_df(rnd_test_df),
    acc=sklearn.metrics.accuracy_score,
    precision=precision_macro,
    recall=recall_macro,
    f1=f1_macro,
)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=4381.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=548.0, style=ProgressStyle(descr…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7627888997085139, 'tp': 2088, 'tn': 1776, 'fp': 253, 'fn': 264, 'acc': 0.8819904131476832, 'precision': 0.8812573812096389, 'recall': 0.8815315677774314, 'f1': 0.8813888402889491, 'eval_loss': 0.48301613785701275}





In [79]:
# Predicted class per row: index of the largest value along the last axis of model_outputs.
# NOTE(review): assumes model_outputs is shaped (n_examples, n_classes) — confirm.
predicted_labels = model_outputs.argmax(axis=-1)
# Shown so the count can be checked against the size of the evaluated rnd test split.
len(predicted_labels)

4381

In [135]:
# Class index 1 is mapped to True so it can be compared with `is_same_side`.
# predicted_labels is the argmax array from the previous cell, so the
# comparison is vectorised and stays aligned by position with the rows.
rnd_test_df["pred_label"] = predicted_labels == 1
rnd_test_df["agree"] = rnd_test_df["pred_label"] == rnd_test_df["is_same_side"]
# Correct predictions per debate, largest first. The column is selected
# explicitly: the first positional argument of GroupBy.sum is `numeric_only`,
# not a column name, so `.sum("agree")` only worked because the string is truthy.
df1 = (
    rnd_test_df.groupby("debate_id")[["agree"]]
    .sum()
    .sort_values("agree", ascending=False)
)
df1.head(20)

Unnamed: 0_level_0,agree
debate_id,Unnamed: 1_level_1
b67fc3fb-2019-04-17T11:47:41Z,1455
d2f4b1cd-2019-04-17T11:47:27Z,788
40f91664-2019-04-17T11:47:29Z,246
475596d3-2019-04-17T11:47:21Z,104
414eb72a-2019-04-19T12:45:01Z,54
b44abf31-2019-04-19T12:47:38Z,10
f0fec70b-2019-04-18T17:27:54Z,6
7fa981cc-2019-04-18T19:51:14Z,5
b1851c86-2019-04-18T18:48:45Z,4
c2b2fdca-2019-04-18T16:51:34Z,4


In [137]:
# Rows per debate in the rnd test split, most frequent first.
# The later df3 cell reads this count column as `debate_id` and groups by it,
# so the column name is pinned and the index name cleared explicitly:
# pandas >= 2.0 would otherwise name the column "count" and the index "debate_id".
df2 = (
    rnd_test_df["debate_id"]
    .value_counts()
    .rename("debate_id")
    .rename_axis(None)
    .to_frame()
)
df2.head(20)

Unnamed: 0,debate_id
b67fc3fb-2019-04-17T11:47:41Z,1471
d2f4b1cd-2019-04-17T11:47:27Z,790
40f91664-2019-04-17T11:47:29Z,247
475596d3-2019-04-17T11:47:21Z,104
414eb72a-2019-04-19T12:45:01Z,84
b44abf31-2019-04-19T12:47:38Z,21
ac9fa785-2019-04-15T20:24:42Z,6
a105b451-2019-04-18T16:00:52Z,6
f0fec70b-2019-04-18T17:27:54Z,6
ae62ea04-2019-04-18T19:35:43Z,6


In [138]:
# Align correct-prediction counts (df1) with row counts (df2) on the debate-id index.
df3 = df1.merge(df2, left_index=True, right_index=True)
# In the merged frame `debate_id` is the row-count column from df2, so the
# ratio is the per-debate accuracy.
df3["acc"] = df3["agree"] / df3["debate_id"]
df3.head(50)

Unnamed: 0,agree,debate_id,acc
b67fc3fb-2019-04-17T11:47:41Z,1455,1471,0.989123
d2f4b1cd-2019-04-17T11:47:27Z,788,790,0.997468
40f91664-2019-04-17T11:47:29Z,246,247,0.995951
475596d3-2019-04-17T11:47:21Z,104,104,1.0
414eb72a-2019-04-19T12:45:01Z,54,84,0.642857
b44abf31-2019-04-19T12:47:38Z,10,21,0.47619
f0fec70b-2019-04-18T17:27:54Z,6,6,1.0
7fa981cc-2019-04-18T19:51:14Z,5,5,1.0
b1851c86-2019-04-18T18:48:45Z,4,4,1.0
c2b2fdca-2019-04-18T16:51:34Z,4,5,0.8


In [139]:
# Mean per-debate accuracy grouped by debate size: in df3 the `debate_id`
# column holds the number of rows of each debate, not an identifier.
# The column is selected explicitly because the first positional argument of
# GroupBy.mean is `numeric_only`, not a column name.
df3.groupby("debate_id")[["acc"]].mean()

Unnamed: 0_level_0,acc
debate_id,Unnamed: 1_level_1
1,0.75228
2,0.713504
3,0.695652
4,0.71875
5,0.7
6,0.666667
21,0.47619
84,0.642857
104,1.0
247,0.995951


## distinct

In [109]:
# Load the weights stored under the best_model_dir of experiment 1 and
# evaluate them on the distinct test split with accuracy and macro P/R/F1.
model_to_test = ('albert', 'albert-base-v2')
i = 1
train_args = get_train_args(f"exp_{i}")
model = ClassificationModel(
    model_to_test[0],
    train_args['best_model_dir'],
    num_labels=2,
    use_cuda=True,
    cuda_device=0,
    args=train_args,
)
result, model_outputs, wrong_predictions = model.eval_model(
    reformat_df(distinct_test_df),
    acc=sklearn.metrics.accuracy_score,
    precision=precision_macro,
    recall=recall_macro,
    f1=f1_macro,
)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=4378.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=548.0, style=ProgressStyle(descr…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.3897261500444874, 'tp': 1101, 'tn': 1997, 'fp': 630, 'fn': 650, 'acc': 0.7076290543627227, 'precision': 0.6952437571980705, 'recall': 0.6944831350925253, 'f1': 0.6948475686323967, 'eval_loss': 1.6204442783798656}





In [110]:
# Predicted class per row: index of the largest value along the last axis of model_outputs.
# NOTE(review): assumes model_outputs is shaped (n_examples, n_classes) — confirm.
predicted_labels = model_outputs.argmax(axis=-1)
# Shown so the count can be checked against the size of the evaluated distinct test split.
len(predicted_labels)

4378

In [114]:
# Class index 1 is mapped to True so it can be compared with `is_same_side`.
# predicted_labels is the argmax array from the previous cell, so the
# comparison is vectorised and stays aligned by position with the rows.
distinct_test_df["pred_label"] = predicted_labels == 1
distinct_test_df["agree"] = distinct_test_df["pred_label"] == distinct_test_df["is_same_side"]
# Correct predictions per debate, largest first. The column is selected
# explicitly: the first positional argument of GroupBy.sum is `numeric_only`,
# not a column name, so `.sum("agree")` only worked because the string is truthy.
df1 = (
    distinct_test_df.groupby("debate_id")[["agree"]]
    .sum()
    .sort_values("agree", ascending=False)
)
df1.head(20)

Unnamed: 0_level_0,agree
debate_id,Unnamed: 1_level_1
33ca75e3-2019-04-18T18:42:44Z,17
92ad51a3-2019-04-18T17:27:45Z,13
b1853369-2019-04-18T18:17:49Z,13
895cd490-2019-04-18T15:36:44Z,11
11d51208-2019-04-18T15:14:47Z,11
b187e621-2019-04-18T11:10:43Z,11
fc305b97-2019-04-18T18:43:58Z,11
b185370b-2019-04-18T18:07:10Z,11
b1853e8d-2019-04-18T17:51:22Z,10
5a26ecda-2019-04-18T14:57:07Z,10


In [118]:
# Rows per debate in the distinct test split, most frequent first.
# The later df3 cell reads this count column as `debate_id` and groups by it,
# so the column name is pinned and the index name cleared explicitly:
# pandas >= 2.0 would otherwise name the column "count" and the index "debate_id".
df2 = (
    distinct_test_df["debate_id"]
    .value_counts()
    .rename("debate_id")
    .rename_axis(None)
    .to_frame()
)
df2.head(20)

Unnamed: 0,debate_id
b187e621-2019-04-18T11:10:43Z,18
b1853369-2019-04-18T18:17:49Z,17
33ca75e3-2019-04-18T18:42:44Z,17
92ad51a3-2019-04-18T17:27:45Z,17
f0197a2b-2019-04-18T16:51:04Z,14
b1861fe8-2019-04-18T15:14:30Z,14
6335507c-2019-04-18T15:08:16Z,11
6d03fb92-2019-04-18T17:16:08Z,11
c0717fda-2019-04-18T18:19:13Z,11
5c1ef67c-2019-04-18T13:55:20Z,11


In [133]:
# Align correct-prediction counts (df1) with row counts (df2) on the debate-id index.
df3 = df1.merge(df2, left_index=True, right_index=True)
# In the merged frame `debate_id` is the row-count column from df2, so the
# ratio is the per-debate accuracy.
df3["acc"] = df3["agree"] / df3["debate_id"]
df3.head(50)

Unnamed: 0,agree,debate_id,acc
33ca75e3-2019-04-18T18:42:44Z,17,17,1.0
92ad51a3-2019-04-18T17:27:45Z,13,17,0.764706
b1853369-2019-04-18T18:17:49Z,13,17,0.764706
895cd490-2019-04-18T15:36:44Z,11,11,1.0
11d51208-2019-04-18T15:14:47Z,11,11,1.0
b187e621-2019-04-18T11:10:43Z,11,18,0.611111
fc305b97-2019-04-18T18:43:58Z,11,11,1.0
b185370b-2019-04-18T18:07:10Z,11,11,1.0
b1853e8d-2019-04-18T17:51:22Z,10,11,0.909091
5a26ecda-2019-04-18T14:57:07Z,10,11,0.909091


In [131]:
# Mean per-debate accuracy grouped by debate size: in df3 the `debate_id`
# column holds the number of rows of each debate, not an identifier.
# The column is selected explicitly because the first positional argument of
# GroupBy.mean is `numeric_only`, not a column name.
df3.groupby("debate_id")[["acc"]].mean()

Unnamed: 0_level_0,acc
debate_id,Unnamed: 1_level_1
1,0.711864
2,0.707792
3,0.654472
4,0.742754
5,0.681928
6,0.74183
7,0.719048
8,0.679348
9,0.721368
10,0.692537


## Overlap

In [162]:
# Load the weights stored under the best_model_dir of experiment 2 and
# evaluate them on the overlap test split with accuracy and macro P/R/F1.
model_to_test = ('albert', 'albert-base-v2')
i = 2
train_args = get_train_args(f"exp_{i}")
model = ClassificationModel(
    model_to_test[0],
    train_args['best_model_dir'],
    num_labels=2,
    use_cuda=True,
    cuda_device=0,
    args=train_args,
)
result, model_outputs, wrong_predictions = model.eval_model(
    reformat_df(overlap_test_df),
    acc=sklearn.metrics.accuracy_score,
    precision=precision_macro,
    recall=recall_macro,
    f1=f1_macro,
)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=4381.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=548.0, style=ProgressStyle(descr…




INFO:simpletransformers.classification.classification_model:{'mcc': 0.41935149500238844, 'tp': 1027, 'tn': 2105, 'fp': 341, 'fn': 908, 'acc': 0.7149052727687742, 'precision': 0.7246851120776829, 'recall': 0.6956690351383158, 'f1': 0.6965312166322621, 'eval_loss': 0.5707390998839825}


In [163]:
# Predicted class per row: index of the largest value along the last axis of model_outputs.
# NOTE(review): assumes model_outputs is shaped (n_examples, n_classes) — confirm.
predicted_labels = model_outputs.argmax(axis=-1)
# Shown so the count can be checked against the size of the evaluated overlap test split.
len(predicted_labels)

4381

In [164]:
# Class index 1 is mapped to True so it can be compared with `is_same_side`.
# predicted_labels is the argmax array from the previous cell, so the
# comparison is vectorised and stays aligned by position with the rows.
overlap_test_df["pred_label"] = predicted_labels == 1
overlap_test_df["agree"] = overlap_test_df["pred_label"] == overlap_test_df["is_same_side"]
# Correct predictions per debate, largest first. The column is selected
# explicitly: the first positional argument of GroupBy.sum is `numeric_only`,
# not a column name, so `.sum("agree")` only worked because the string is truthy.
df1 = (
    overlap_test_df.groupby("debate_id")[["agree"]]
    .sum()
    .sort_values("agree", ascending=False)
)
df1.head(20)

Unnamed: 0_level_0,agree
debate_id,Unnamed: 1_level_1
b67fc3fb-2019-04-17T11:47:41Z,122
d2f4b1cd-2019-04-17T11:47:27Z,92
40f91664-2019-04-17T11:47:29Z,50
475596d3-2019-04-17T11:47:21Z,32
414eb72a-2019-04-19T12:45:01Z,19
b44abf31-2019-04-19T12:47:38Z,6
505b52ea-2019-04-18T18:45:44Z,5
b186fa3d-2019-04-18T13:12:02Z,5
ac9fa785-2019-04-15T20:24:42Z,5
34222b33-2019-04-18T15:09:25Z,5


In [165]:
# Rows per debate in the overlap test split, most frequent first.
# The later df3 cell reads this count column as `debate_id` and groups by it,
# so the column name is pinned and the index name cleared explicitly:
# pandas >= 2.0 would otherwise name the column "count" and the index "debate_id".
df2 = (
    overlap_test_df["debate_id"]
    .value_counts()
    .rename("debate_id")
    .rename_axis(None)
    .to_frame()
)
df2.head(20)

Unnamed: 0,debate_id
b67fc3fb-2019-04-17T11:47:41Z,125
d2f4b1cd-2019-04-17T11:47:27Z,93
40f91664-2019-04-17T11:47:29Z,50
475596d3-2019-04-17T11:47:21Z,32
414eb72a-2019-04-19T12:45:01Z,31
b44abf31-2019-04-19T12:47:38Z,13
ac9fa785-2019-04-15T20:24:42Z,9
81e76103-2019-04-18T14:03:25Z,5
b186fa3d-2019-04-18T13:12:02Z,5
b186eebc-2019-04-18T13:35:03Z,5


In [166]:
# Align correct-prediction counts (df1) with row counts (df2) on the debate-id index.
df3 = df1.merge(df2, left_index=True, right_index=True)
# In the merged frame `debate_id` is the row-count column from df2, so the
# ratio is the per-debate accuracy.
df3["acc"] = df3["agree"] / df3["debate_id"]
df3.head(50)

Unnamed: 0,agree,debate_id,acc
b67fc3fb-2019-04-17T11:47:41Z,122,125,0.976
d2f4b1cd-2019-04-17T11:47:27Z,92,93,0.989247
40f91664-2019-04-17T11:47:29Z,50,50,1.0
475596d3-2019-04-17T11:47:21Z,32,32,1.0
414eb72a-2019-04-19T12:45:01Z,19,31,0.612903
b44abf31-2019-04-19T12:47:38Z,6,13,0.461538
505b52ea-2019-04-18T18:45:44Z,5,5,1.0
b186fa3d-2019-04-18T13:12:02Z,5,5,1.0
ac9fa785-2019-04-15T20:24:42Z,5,9,0.555556
34222b33-2019-04-18T15:09:25Z,5,5,1.0


In [167]:
# Mean per-debate accuracy grouped by debate size: in df3 the `debate_id`
# column holds the number of rows of each debate, not an identifier.
# The column is selected explicitly because the first positional argument of
# GroupBy.mean is `numeric_only`, not a column name.
df3.groupby("debate_id")[["acc"]].mean()

Unnamed: 0_level_0,acc
debate_id,Unnamed: 1_level_1
1,0.708839
2,0.688525
3,0.699519
4,0.689655
5,0.727273
9,0.555556
13,0.461538
31,0.612903
32,1.0
50,1.0
