In [383]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, load_metric
from sklearn.metrics import f1_score, recall_score, precision_score
from utils import numerical_df

### Hyperparameters

In [2]:

df_path = './climate_twitter_tweets.csv'
df_org = pd.read_csv(df_path)
## Remove empty strings
df = df_org[df_org.tweet != '']
df = df[df.tweet.notnull()]

In [3]:
df[['labels', 'claim_expert1', 'evidence_expert1']].describe()

Unnamed: 0,labels,claim_expert1,evidence_expert1
count,212.0,212.0,212.0
mean,0.877358,0.849057,0.415094
std,0.328802,0.358841,0.493905
min,0.0,0.0,0.0
25%,1.0,1.0,0.0
50%,1.0,1.0,0.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [4]:
df_org[['labels', 'claim_expert1', 'evidence_expert1']].describe()

Unnamed: 0,labels,claim_expert1,evidence_expert1
count,300.0,300.0,300.0
mean,0.866667,0.846667,0.42
std,0.340503,0.360911,0.494383
min,0.0,0.0,0.0
25%,1.0,1.0,0.0
50%,1.0,1.0,0.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [5]:
f1_score(df_org.labels, np.ones(300), average='weighted'), f1_score(df_org.claim_expert1, np.ones(300), average='weighted'),  f1_score(df_org.evidence_expert1, np.ones(300), average='weighted')

(0.8047619047619048, 0.7763658243080627, 0.24845070422535212)

In [6]:
recall_score(df_org.labels, np.ones(300), average='weighted'), recall_score(df_org.claim_expert1, np.ones(300), average='weighted'),  recall_score(df_org.evidence_expert1, np.ones(300), average='weighted')

(0.8666666666666667, 0.8466666666666667, 0.42)

In [7]:
precision_score(df_org.labels, np.ones(300), average='weighted'), precision_score(df_org.claim_expert1, np.ones(300), average='weighted'),  precision_score(df_org.evidence_expert1, np.ones(300), average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.7511111111111112, 0.7168444444444445, 0.17639999999999997)

In [8]:
f1_score(df.labels, np.ones(212), average='weighted'), f1_score(df.claim_expert1, np.ones(212), average='weighted'),  f1_score(df.evidence_expert1, np.ones(212), average='weighted')

(0.8200436142979046, 0.7797458606083943, 0.24352201257861641)

In [9]:
recall_score(df.labels, np.ones(212), average='weighted'), recall_score(df.claim_expert1, np.ones(212), average='weighted'),  recall_score(df.evidence_expert1, np.ones(212), average='weighted')

(0.8773584905660378, 0.8490566037735849, 0.41509433962264153)

In [10]:
precision_score(df.labels, np.ones(212), average='weighted'), precision_score(df.claim_expert1, np.ones(212), average='weighted'),  precision_score(df.evidence_expert1, np.ones(212), average='weighted')

(0.7697579209683162, 0.7208971164115343, 0.17230331078675687)

In [39]:
filtered = pd.read_csv('filtered_annotated.csv')
dev = pd.read_csv('dev.csv')
test = pd.read_csv('test.csv')
tweets = pd.read_csv('scraped_tweets.csv')

In [19]:
dev = dev.assign(evidence_type=pd.Series(['']*100).values)
test = test.assign(evidence_type=pd.Series(['']*100).values)

In [22]:
filtered_1 = filtered.sample(frac = 0.5) 
filtered_2 = filtered.drop(filtered_1.index) 

In [24]:
filtered_2

Unnamed: 0,index,tweet,label,argumentative,evidence,claim,procon,arg_type,evidence_type,topic
1,1,<MENTION> Doing a whole foods keto diet. Coffe...,1,1,0.0,0.0,-1.0,implicit,,Sustainable diets are healty
4,4,<MENTION> <MENTION> <MENTION> Because russian ...,0,1,0.0,0.0,0.0,unrelated,,Sustainable diets are healty
9,9,"<MENTION> Oh wow, I think they have a differen...",0,lacks context,0.0,0.0,0.0,,,Sustainable diets are healty
11,11,<MENTION> Beans. But you have to have some mea...,1,1,0.0,0.0,1.0,implicit,,Sustainable diets are healty
13,13,<MENTION> Tbh….\nI prefer the actual fruit..\n...,0,lacks context,0.0,0.0,0.0,,,Sustainable diets are healty
...,...,...,...,...,...,...,...,...,...,...
265,265,<MENTION> a better diet. start with better bre...,0,1,0.0,0.0,0.0,unrelated,,Sustainable diets are healty
266,266,<MENTION> But some olive oil on your door hinges,0,0,0.0,0.0,0.0,,,Sustainable diets are healty
268,268,"<MENTION> <MENTION> <MENTION> <MENTION> Sorry,...",0,0,0.0,0.0,0.0,,,Sustainable diets are healty
274,274,<MENTION> In the 60's and 70's in SE. QLD a sc...,0,0,0.0,0.0,0.0,,,Sustainable diets are healty


In [97]:
dev_full = pd.concat([dev, filtered_1]).drop(columns=['label'])
test_full = pd.concat([test, filtered_1]).drop(columns=['label'])

In [98]:
def get_ids(df):
    lacks_ids = df[df.id.isna()]
    ids = pd.merge(left=lacks_ids, right=tweets, left_on='tweet', right_on='tweet', how='inner')
    df.id.update(pd.Series(ids.id_y.values, index=lacks_ids.index))
    return df

In [99]:
test_full = get_ids(test_full)
dev_full = get_ids(dev_full)
dev_full.lang = 'en'
test_full.lang = 'en'

In [266]:
test_full.reset_index(drop=True, inplace=True)
dev_full.reset_index(drop=True, inplace=True)

In [299]:
sum(test_full[test_full.argumentative == '1'].arg_type.isna())

0

In [336]:
test_full.to_csv('full_testing_set.csv', index = False)
dev_full.to_csv('full_development_set.csv', index = False)

In [375]:
def numerical_df(df):
    df = df.copy()
    idx = df[df.argumentative == 'lacks context'].index
    df.argumentative.update(pd.Series(np.zeros(len(idx)), index=idx))
    df.argumentative = pd.to_numeric(df.argumentative)
    df.claim = pd.to_numeric(df.claim)
    df.evidence = pd.to_numeric(df.evidence)
    df.procon = pd.to_numeric(df.procon)
    return df

def score(df, label = 0, avg = 'weighted', score_fun = f1_score):
    df = numerical_df(df)
    
    return (score_fun(df.argumentative, [label]*len(df), average=avg), 
           score_fun(df.claim, [label]*len(df), average=avg),  
           score_fun(df.evidence, [label]*len(df), average=avg),
           score_fun(df.procon, [label]*len(df), average=avg))

In [391]:
label = 0

df = pd.DataFrame([score(dev_full, label = label, score_fun = f1_score), score(test_full, label = label, score_fun = f1_score)])
df.columns = ['Argumentative f1', 'Claim f1', 'Evidence f1', 'Procon f1']
df

Unnamed: 0,Argumentative f1,Claim f1,Evidence f1,Procon f1
0,0.434843,0.54403,0.846612,0.603894
1,0.44493,0.587398,0.834603,0.654107


In [392]:
df = pd.DataFrame([score(dev_full, label = label, score_fun = recall_score), score(test_full, label = label, score_fun = recall_score)])
df.columns = ['Argumentative recall', 'Claim recall', 'Evidence recall', 'Procon recall']
df

Unnamed: 0,Argumentative recall,Claim recakk,Evidence precision,Procon recall
0,0.5875,0.675,0.895833,0.720833
1,0.595833,0.708333,0.8875,0.758333


In [393]:
df = pd.DataFrame([score(dev_full, label = label, score_fun = precision_score) , score(test_full, label = label, score_fun = precision_score)])
df.columns = ['Argumentative precision', 'Claim precision', 'Evidence precision', 'Procon precision']
df

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Argumentative precision,Claim precision,Evidence precision,Procon precision
0,0.345156,0.455625,0.802517,0.519601
1,0.355017,0.501736,0.787656,0.575069


In [394]:
numerical_df(dev_full)[['argumentative', 'claim', 'evidence', 'procon']].describe()

Unnamed: 0,argumentative,claim,evidence,procon
count,240.0,240.0,240.0,240.0
mean,0.4125,0.325,0.104167,0.0625
std,0.493313,0.469354,0.306115,0.525749
min,0.0,0.0,0.0,-1.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0
