In [None]:
##### from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import pipeline
import tweepy
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate
import nltk
from utils import numerical_df
from baselines import xgboost_baseline, majority_class_baseline, random_class_baseline, ibm_baseline, bert_baseline, bm25_baseline, qa_model_baseline
from sklearn.metrics import f1_score as f1, precision_score as ps, recall_score as rs
from sklearn.model_selection import KFold
from rank_bm25 import BM25Okapi

from sentence_transformers import SentenceTransformer, util

import emoji

### Hyperparameters

In [None]:
# Load the annotated dataset from a CSV path relative to the notebook's working directory.
# `df_org` is the unmodified frame; later cells derive their filtered `df` from it.
df_path = './annotated-dataset.csv'
df_org = pd.read_csv(df_path)

In [None]:
## Keep only the rows whose tweet text is present and non-empty
df = df_org[(df_org.tweet != '') & df_org.tweet.notnull()]

# Earlier experiment calls, left disabled:
#bert_baseline(df, tasks=current_classes, use_topic=True, batch_size=5, epochs = 1)
#display(ibm_baseline(df_arg, ['argumentative']))

# Work on a re-indexed frame in which the '<MENTION>' placeholder is replaced by '@user'.
df_temp = df.reset_index(drop=True).assign(
    tweet=lambda frame: frame.tweet.map(lambda text: text.replace('<MENTION>', '@user'))
)

# Cross-validated fine-tuning baseline (implemented in the project's `baselines` module).
display(
    bert_baseline(
        df_temp,
        fold = 3,
        model_name = 'cardiffnlp/twitter-roberta-base',
        tasks = ['argumentative', 'claim', 'evidence', 'procon'],
        learning_rate = 5e-5,
        batch_size=7,
        epochs = 5,
    )
)

In [None]:
## Remove empty strings
# Take an explicit copy: the filtered frame is modified in place below, and writing
# through `.loc` into a slice of `df_org` triggers pandas' SettingWithCopy ambiguity.
df = df_org[(df_org.tweet != '') & df_org.tweet.notnull()].copy()

# Binarise the 0/1 tasks at a 0.5 threshold. Both masks are computed before any
# assignment so the result does not depend on the order of the two writes.
# Missing values match neither mask and are left untouched.
for column in ['argumentative', 'claim', 'evidence']:
    is_positive = df[column] >= 0.5
    is_negative = df[column] < 0.5
    df.loc[is_positive, column] = 1
    df.loc[is_negative, column] = 0

# Stance is reduced to its sign: negative -> -1, positive -> 1, exact zeros are kept as 0.
is_con = df.procon < 0
is_pro = df.procon > 0
df.loc[is_con, 'procon'] = -1
df.loc[is_pro, 'procon'] = 1


In [None]:
# Column sums of the three 0/1 label columns (i.e. the positive counts once the labels are binarised).
sum(df.argumentative), sum(df.claim), sum(df.evidence)

In [None]:
# Task subset without 'procon'. In the cells visible here it is only referenced from commented-out calls.
current_classes = ['argumentative', 'claim', 'evidence']

In [None]:
# Majority-class baseline on all four tasks.
# NOTE(review): on a fresh top-to-bottom run this resolves to the function imported from
# `baselines`; a later cell redefines `majority_class_baseline` locally.
display(majority_class_baseline(df, ['argumentative', 'claim', 'evidence', 'procon']))

In [None]:

possible_tasks = ['argumentative', 'claim', 'evidence','procon']
def calc_scores(preds_set, labels_set, task = None):
    """Average F1 / precision / recall over several (prediction, label) splits.

    Parameters
    ----------
    preds_set : sequence of array-like
        Predicted labels, one entry per split.
    labels_set : sequence of array-like
        Ground-truth labels, aligned with ``preds_set``.
    task : str, optional
        When given, a leading column holding the task name is prepended.

    Returns
    -------
    numpy.ndarray
        String array with one row per averaging mode ('binary', 'macro',
        'micro'). Columns are ``[averaging, f1, precision, recall]``, or
        ``[task, averaging, f1, precision, recall]`` when ``task`` is given.
        Scores are the mean over splits, rounded to two decimals.
    """
    average_sets = []

    for average in ['binary', 'macro', 'micro']:
        averages = []
        for preds, labels in zip(preds_set, labels_set):
            # scikit-learn metrics take (y_true, y_pred). Passing the predictions
            # first leaves F1 unchanged but swaps precision and recall.
            averages.append((f1(labels, preds, average=average, zero_division = 0),
                             ps(labels, preds, average=average, zero_division = 0),
                             rs(labels, preds, average=average, zero_division = 0)))
        average_sets.append((average, *np.round(np.mean(averages, axis=0), 2)))

    scores = np.array(average_sets)
    if task is None:
        return scores
    # One task-name cell per averaging row.
    return np.hstack([[[task]] * len(scores), scores])


def dummy_class_baseline(df, tasks = possible_tasks, fold = 10, strategy="stratified"):
    """Cross-validated scores of a scikit-learn ``DummyClassifier`` per task.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet`` column, an ``argumentative`` column and one
        column per requested task.
    tasks : str or list of str
        Task name(s); each must be one of ``possible_tasks``.
    fold : int
        Number of ``KFold`` splits (unshuffled).
    strategy : str
        Passed straight to ``DummyClassifier``. Note that 'stratified' draws
        random predictions and no seed is set here, so results vary per run.

    Returns
    -------
    pandas.DataFrame
        Three rows per task (binary / macro / micro averaging) with columns
        ``Task, Averaging, F1, Precision, Recall``.

    Raises
    ------
    ValueError
        If ``tasks`` is neither a string nor a list, or names an unknown task.
    """
    if isinstance(tasks, str):
        tasks = [tasks]

    # These errors used to be *returned*, which let callers continue with an
    # exception object instead of a result frame.
    if not isinstance(tasks, list):
        raise ValueError('Tasks is not a string or a list.')

    if not all(item in possible_tasks for item in tasks):
        raise ValueError(f'Tasks can only be or contain the following elements {possible_tasks} but found {tasks}')

    res = []

    for task in tasks:
        kf = KFold(n_splits=fold)

        if task == 'procon':
            # Stance is only scored on rows that carry a stance (non-zero label).
            data = df[task].to_numpy()
            data = data[data != 0]
        elif task != 'argumentative':
            # Claim / evidence are only scored on argumentative rows.
            data = df[task].to_numpy()[(df.argumentative > .5).to_numpy()]
        else:
            # Each distinct tweet is counted once for the argumentative task.
            data = df.drop_duplicates(subset=['tweet']).argumentative.to_numpy()

        data = np.array(data).round().astype(int)
        fold_preds = []
        fold_labels = []
        for train_index, test_index in kf.split(data):
            dummy_clf = DummyClassifier(strategy=strategy)
            y_train, y_test = data[train_index], data[test_index]
            # DummyClassifier ignores the features, so the labels double as X.
            dummy_clf.fit(y_train, y_train)

            fold_preds.append(dummy_clf.predict(y_test))
            fold_labels.append(y_test)

        res.append(calc_scores(fold_preds, fold_labels, task))

    res = pd.DataFrame(np.concatenate(res))
    res.columns = ['Task', 'Averaging', 'F1', 'Precision', 'Recall']
    return res

def random_class_baseline(df, tasks = possible_tasks, fold = 10):
    """Run ``dummy_class_baseline`` with the 'stratified' strategy and return a captioned Styler."""
    scores = dummy_class_baseline(df, tasks, fold, strategy = 'stratified')
    caption = f'Random class results with {fold} fold split using weighted approach'
    return scores.style.set_caption(caption)

def majority_class_baseline(df, tasks = possible_tasks, fold = 10):
    """Run ``dummy_class_baseline`` with the 'prior' strategy and return a captioned Styler."""
    scores = dummy_class_baseline(df, tasks, fold, strategy = 'prior')
    caption = f'Majority class results with {fold} fold split'
    return scores.style.set_caption(caption)

# Last expression of the cell: renders the result of the locally defined majority-class baseline.
majority_class_baseline(df, ['argumentative', 'claim', 'evidence', 'procon'])

In [None]:
# Random-class baseline on all four tasks (uses whichever `random_class_baseline` was defined last).
display(random_class_baseline(df, ['argumentative', 'claim', 'evidence', 'procon']))

In [None]:
# Re-indexed working frame with the '<MENTION>' placeholder replaced by '@user'.
df_temp = df.reset_index(drop=True).assign(
    tweet=lambda frame: frame.tweet.map(lambda text: text.replace('<MENTION>', '@user'))
)

# XGBoost baseline (implemented in the project's `baselines` module) on all four tasks.
display(
    xgboost_baseline(
        df_temp,
        fold = 3,
        model_name = 'cardiffnlp/twitter-roberta-base',
        tasks = ['argumentative', 'claim', 'evidence', 'procon'],
    )
)

In [None]:
# BM25 baseline (from the project's `baselines` module) on the claim and evidence tasks.
# The two commented-out lines are earlier variants that are currently disabled.
#bm25_baseline(df, current_classes)
#display(ibm_baseline(df_arg, ['argumentative']))
display(bm25_baseline(df, ['claim', 'evidence']))

In [None]:
# QA-model baseline on claim and evidence; a copy of `df` is passed in, so the notebook's frame is not handed over directly.
qa_model_baseline(df.copy(), ['claim', 'evidence'], model_cutoff = 0.5)

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence

import xgboost as xgb
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import nltk
from utils import numerical_df
from sklearn.metrics import f1_score as f1, precision_score as ps, recall_score as rs
from debater_python_api.api.debater_api import DebaterApi

from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import transformers
from transformers import TrainingArguments, Trainer
from datasets import Dataset, load_metric
from transformers import pipeline

from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util

import json
import io
from sklearn.dummy import DummyClassifier

possible_tasks = ['argumentative', 'claim', 'evidence','procon']



def calc_scores(preds_set, labels_set, task = None):
    """Average F1 / precision / recall over several (prediction, label) splits.

    Parameters
    ----------
    preds_set : sequence of array-like
        Predicted labels, one entry per split.
    labels_set : sequence of array-like
        Ground-truth labels, aligned with ``preds_set``.
    task : str, optional
        When given, a leading column holding the task name is prepended.

    Returns
    -------
    numpy.ndarray
        String array with one row per averaging mode ('binary', 'macro',
        'micro'). Columns are ``[averaging, f1, precision, recall]``, or
        ``[task, averaging, f1, precision, recall]`` when ``task`` is given.
        Scores are the mean over splits, rounded to two decimals.
    """
    average_sets = []

    for average in ['binary', 'macro', 'micro']:
        averages = []
        for preds, labels in zip(preds_set, labels_set):
            # scikit-learn metrics take (y_true, y_pred). Passing the predictions
            # first leaves F1 unchanged but swaps precision and recall.
            averages.append((f1(labels, preds, average=average, zero_division = 0),
                             ps(labels, preds, average=average, zero_division = 0),
                             rs(labels, preds, average=average, zero_division = 0)))

        average_sets.append((average, *np.round(np.mean(averages, axis=0), 2)))

    scores = np.array(average_sets)
    if task is None:
        return scores
    # One task-name cell per averaging row.
    return np.hstack([[[task]] * len(scores), scores])


def ibm_baseline(df, tasks = possible_tasks):
    """Score the IBM Project Debater API clients against the annotated labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tweet``, ``topic`` and ``argumentative`` columns plus
        one label column per requested task.
    tasks : str or list of str
        Task name(s); each must be one of ``possible_tasks``.

    Returns
    -------
    pandas.io.formats.style.Styler
        Captioned table with columns ``Task, Averaging, F1, Precision, Recall``.

    Raises
    ------
    ValueError
        If ``tasks`` is neither a string nor a list, or names an unknown task.

    Notes
    -----
    Reads the API key from ``./credentials.json`` (key ``debater_api_key``)
    and sends the tweets to the remote service.
    """
    if not isinstance(tasks, (str, list)):
        raise ValueError("task must be list or str")

    if isinstance(tasks, str):
        tasks = [tasks]

    if not all(elem in possible_tasks for elem in tasks):
        raise ValueError(
            f"task must only contain any of the following strings: {possible_tasks}, but found: {tasks}"
        )

    credentials_path = './credentials.json'

    with io.open(credentials_path) as f_in:
        credentials = json.load(f_in)

    api_key = credentials['debater_api_key']
    debater_api = DebaterApi(apikey=api_key)
    clients = {
        "claim": debater_api.get_claim_detection_client(),
        "evidence": debater_api.get_evidence_detection_client(),
        "procon": debater_api.get_pro_con_client(),
        "argumentative": debater_api.get_argument_quality_client(),
    }

    argumentative_mask = (df.argumentative == 1).to_numpy()

    sentence_topic_dicts = [{'sentence' : tweet, 'topic' : topic}
                            for tweet, topic in zip(df.tweet, df.topic)]

    res = []

    for task in tasks:
        print('Gathering results for', task)
        label = df[task].to_numpy()
        client = clients[task]

        if task == 'procon':
            # Stance is only scored on rows that carry a stance (non-zero label).
            mask = label != 0
        elif task == 'argumentative':
            # Use every row: restricting this task to argumentative rows would
            # leave only positive labels and make the scores meaningless.
            mask = np.ones(len(label), dtype=bool)
        else:
            # Claim / evidence are only scored on argumentative rows.
            mask = argumentative_mask

        data = [d for d, m in zip(sentence_topic_dicts, mask) if m]
        label = label[mask]
        raw_scores = client.run(data)

        if task == 'procon':
            scores = np.array([1 if s > 0 else -1 for s in raw_scores])
        else:
            scores = np.array([1 if s >= .5 else 0 for s in raw_scores])

            # Quick console diagnostics; undefined ratios are reported as NaN
            # instead of dividing by zero.
            tp = int(((label == 1) & (scores == label)).sum())
            fp = int(((label == 0) & (scores != label)).sum())
            n_positive = label.sum()
            recall = tp / n_positive if n_positive else float('nan')
            precision = tp / (tp + fp) if (tp + fp) else float('nan')
            print(scores.sum(), n_positive, recall, precision)

        res.append(calc_scores([scores], [label], task))

    res = pd.DataFrame(np.concatenate(res))
    res.columns =  ['Task', 'Averaging', 'F1', 'Precision', 'Recall']
    res = res.style.set_caption('Results from using imbs project debater api for 0 shot evalutaion')
    return res


In [None]:
# ibm_baseline(df, ['evidence'])  # Only run this if you have access to IBM's Project Debater API models