In [1]:
%gui asyncio
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import pipeline
import tweepy
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
import xgboost as xgb
from sklearn.model_selection import cross_validate
import nltk
import asyncio
from ipywidgets import widgets as w
from IPython.display import clear_output
import time 


In [2]:
# --- Notebook configuration -------------------------------------------------
# Transformer checkpoint used to embed the tweets.
model_name = "bert-base-cased"

# Location of the scraped tweets (CSV with a `tweet` column).
df_path = './scraped_tweets.csv'

# General hyper-parameters.
batch_size = 5
epochs = 1
seed = 42

# True: read the cached embeddings from disk; False: recompute and re-save them.
load = True

In [3]:

# Read the scraped tweets and keep only the rows that carry text:
# neither missing values nor empty strings.
df = pd.read_csv(df_path)
has_text = df.tweet.notnull() & (df.tweet != '')
df = df[has_text]

In [4]:
# `data` ends up as an object array with one row per tweet: column 0 holds the
# document embedding, column 1 the tweet text. Computing the embeddings is the
# slow path, so the result is written to disk and re-read when `load` is set.
if not load:
    # Embed every tweet with the configured transformer model.
    embedding = TransformerDocumentEmbeddings(model_name, embeddings_storage_mode='none')

    tweet_embeddings = []
    for i, tweet in enumerate(df.tweet):
        print('\r', i, end='')  # single-line progress counter
        embedded_sentence = embedding.embed(Sentence(tweet))[0]
        tweet_embeddings.append(
            (embedded_sentence.get_embedding().cpu().detach().numpy(), tweet)
        )

    data = np.array(tweet_embeddings, dtype=object)
    # np.save appends the `.npy` suffix, matching the path read in the other branch.
    np.save('scraped_tweets_embeddings', data)
else:
    # NOTE(review): allow_pickle=True is needed for the object array but executes
    # pickled content on load; only use it with a cache file you produced yourself.
    data = np.load('./scraped_tweets_embeddings.npy', allow_pickle=True)

In [5]:
def gen_table(numRows):
    """Build the widget grid used to annotate a batch of tweets.

    The grid has ``numRows + 2`` rows and two columns: a header row, one row
    per tweet (text on the left, 0/1 label toggle on the right) and a final
    row holding the "Done" button.

    Args:
        numRows: Number of tweet rows to create.

    Returns:
        A tuple ``(grid, rows, header, done)`` where ``rows`` is a list of
        ``(tweet_html, label_toggle, vector_slot)`` triples. ``vector_slot`` is
        a plain dict whose ``'value'`` entry starts as ``''`` and is meant to
        carry the embedding of the tweet currently shown in that row.
    """
    last_row = numRows + 1
    grid = w.GridspecLayout(numRows + 2, 2)

    # Header at the top, "Done" button at the bottom.
    summary = w.HTML()
    finish_button = w.Button(
        description='Done',
        disabled=False,
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click if done with annotation',
        icon='check',  # (FontAwesome names without the `fa-` prefix)
    )
    grid[0, 0] = summary
    grid[last_row, 0] = finish_button

    # One (text, label, vector slot) triple per annotatable tweet.
    annotation_rows = []
    for grid_row in range(1, last_row):
        label_toggle = w.ToggleButtons(
            options=[0, 1],
            description='Label:',
            disabled=False,
            button_style='',
        )
        text_cell = w.HTML()
        vector_slot = {'value': ''}  # filled later with the tweet's embedding

        grid[grid_row, 0] = text_cell
        grid[grid_row, 1] = label_toggle
        annotation_rows.append((text_cell, label_toggle, vector_slot))

    return grid, annotation_rows, summary, finish_button
    
    
    

In [7]:
def annotate_filter(data):
    """Set up an interactive, model-assisted annotation loop over ``data``.

    Each round shows one batch of tweets in a widget grid. The batch mixes the
    tweets the current classifier scores highest and lowest (a random pick
    while nothing has been annotated yet) and pre-selects the predicted label.
    Clicking "Done" stores the batch with its (possibly corrected) labels,
    refits the classifier on everything annotated so far and loads the next
    batch. Shown tweets are removed from the local pool; the caller's array is
    not modified.

    Args:
        data: 2-D object array with one row per tweet; column 0 is the
            embedding vector and column 1 the tweet text.

    Returns:
        A tuple ``(gs, train)``. ``gs`` is the widget grid to display and
        ``train`` is a dict that the "Done" callback keeps updating:
        ``train['x']`` becomes an ``(n, 2)`` object array of
        ``[vector, tweet text]`` rows and ``train['y']`` the matching labels.
        Both are empty lists until the first batch has been confirmed.
    """
    gs, rows, header, done = gen_table(10)
    train = {"x": [], "y": []}

    def combined(_):
        # Button callback: persist the current batch, then show the next one.
        save()
        iterate()

    def save():
        # Collect only the rows that currently hold a tweet. Unfilled rows keep
        # the '' placeholder in their vector slot and must not be stored.
        batch = []
        labels = []
        for tweet_widget, label_widget, vector_slot in rows:
            vector = vector_slot['value']
            if vector is None or isinstance(vector, str):
                continue
            batch.append((vector, tweet_widget.value))  # Vector / Tweet
            labels.append(label_widget.value)  # Label

        if batch:
            # Fill the object array cell by cell: implicit conversion of ragged
            # (ndarray, str) tuples is rejected by recent NumPy releases.
            new_x = np.empty((len(batch), 2), dtype=object)
            for idx, (vector, text) in enumerate(batch):
                new_x[idx, 0] = vector
                new_x[idx, 1] = text
            new_y = np.array(labels)

            # `train` is mutated in place (never rebound) so the dict handed
            # back to the caller always reflects the latest annotations.
            if len(train['x']) > 0:
                train['x'] = np.concatenate((train['x'], new_x))
                train['y'] = np.concatenate((train['y'], new_y))
            else:
                train['x'] = new_x
                train['y'] = new_y
        header.value = f'Number of positively found tweets: {sum(train["y"])}'

    def iterate():
        nonlocal data  # the pool shrinks after every batch
        n_remaining = len(data)

        if n_remaining > 0 and len(train['x']) > 0:
            # NOTE(review): tree_method="gpu_hist" needs a CUDA-enabled XGBoost
            # build - confirm this matches the target environment.
            model = xgb.XGBRFClassifier(n_estimators=10, max_depth=2, learning_rate=0.01, objective='binary:logistic', eval_metric='logloss', tree_method="gpu_hist")
            # `np.float` was removed from NumPy; the builtin `float` is the
            # same float64 dtype.
            model.fit(np.array(list(train['x'][:, 0]), dtype=float), train['y'])

            converted = np.array(list(data[:, 0]), dtype=float)
            # Pool indices ordered from lowest to highest positive probability.
            order = model.predict_proba(converted)[:, 1].argsort()
            preds = model.predict(converted)
        else:
            # Nothing annotated yet (or nothing left): random order, label 0.
            preds = np.zeros(n_remaining)
            order = np.random.permutation(n_remaining)

        # Half of the batch from the low end and half from the high end of the
        # ranking. Sizes are capped by the pool so the two slices never overlap
        # when fewer tweets remain than there are rows.
        n_show = min(len(rows), n_remaining)
        n_low = n_show // 2
        n_high = n_show - n_low
        low_sort = order[:n_low]  # Low prediction
        high_sort = order[n_remaining - n_high:]  # High prediction
        sort = np.concatenate((high_sort, low_sort))

        # NOTE(review): the tweet text is rendered by an HTML widget, so markup
        # inside a tweet is interpreted - confirm that is acceptable.
        for row, tweet, prediction in zip(rows, data[sort], preds[sort]):
            row[0].value = tweet[1]  # Sets the tweet text
            row[1].value = int(prediction)  # Sets the label (options are the ints 0/1)
            row[2]['value'] = tweet[0]  # Saves the vector

        # Blank any rows that could not be filled so stale tweets from the
        # previous batch are neither displayed nor saved a second time.
        for tweet_widget, _, vector_slot in rows[n_show:]:
            tweet_widget.value = ''
            vector_slot['value'] = ''

        data = np.delete(data, sort, axis=0)

    done.on_click(combined)
    iterate()  # We iterate once to pull in the first set of tweets
    return gs, train

In [8]:
# Build the annotation UI over the embedded tweets. `annotated` is the dict that
# the "Done" callback keeps filling, so it only has content after interaction.
gs, annotated = annotate_filter(data)

In [9]:
# Display the annotation grid; click "Done" after labelling each batch.
gs

GridspecLayout(children=(HTML(value='', layout=Layout(grid_area='widget001')), Button(description='Done', icon…

In [41]:
# Assemble the annotated tweets and their labels into a two-column frame.
# The frame is built column-wise so the labels keep an integer dtype instead of
# being coerced to `object` by stacking them with the tweet strings.
# Until the first batch has been confirmed, annotated['x'] is still an empty
# list that cannot be sliced with [:, 1]; an empty frame with the same columns
# is produced in that case.
# NOTE: this rebinds `df`, which earlier held the raw scraped tweets.
has_annotations = len(annotated['x']) > 0
df = pd.DataFrame({
    'tweet': annotated['x'][:, 1] if has_annotations else [],
    'label': np.asarray(annotated['y'], dtype=int),
})

In [43]:
# Persist the annotated tweets; index=False leaves the row index out of the file.
df.to_csv('filtered_tweets.csv', index=False)