In [1]:
import io
import json
import os

import nltk
import numpy as np
import pandas as pd
import torch
import tweepy
import xgboost as xgb
from datasets import load_dataset, Dataset, load_metric
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

### Configuration: Twitter credentials and hyperparameters

In [2]:


# --- Twitter API credentials -------------------------------------------------
# The secrets live in a JSON file outside the notebook and are read at run time.
credentials_path = './../credentials.json'

with io.open(credentials_path) as credentials_file:
    credentials = json.load(credentials_file)

access_token, access_token_secret = (
    credentials["access_token"],
    credentials["access_token_secret"],
)
api_key, api_secret, bearer_token = (
    credentials["api_key"],
    credentials["api_secret"],
    credentials["bearer_token"],
)

# The scraping cell passes these to tweepy.OAuthHandler; they are simply the
# API key pair under the names that call uses.
consumer_key, consumer_secret = api_key, api_secret

# --- Experiment settings -----------------------------------------------------
# NOTE(review): batch_size, seed, metric and epochs are not read by any cell
# visible in this notebook section — confirm whether they are still needed.
batch_size = 5
seed = 42
model_name = "bert-base-german-cased"
# Annotation column used as the classification target. Other values noted by the
# author: 'claim_expert1', 'evidence_expert1', 'argumentative_expert1'.
task = 'argumentative_expert2'
metric = load_metric('accuracy')
epochs = 1
# CSV that holds the annotations together with the scraped tweet texts.
df_path = './climate_twitter_tweets.csv'

In [None]:
# One-off scraping step: look up the text of every annotated tweet id through
# the Twitter API and cache annotations + text at `df_path`.
# Leave this off when the cached CSV already exists; set it to True to re-scrape.
scrape_tweets = False

if scrape_tweets:
    df = pd.read_csv('./global_expert_annotations.csv')
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    api = tweepy.API(auth)

    # Intermediate dumps are written next to the final CSV.
    checkpoint_dir = os.path.dirname(df_path)
    checkpoint_every = 5000

    tweets = []
    failed_ids = []
    for id_ in df['tweet_id']:
        try:
            tweet = api.get_status(int(id_), tweet_mode='extended')
        except Exception:
            # Best effort: an id that cannot be fetched (or parsed) is skipped,
            # but remembered so the number of misses is visible afterwards.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            failed_ids.append(id_)
            continue
        tweets.append(tweet._json)

        # Checkpoint only right after a successful fetch, so a run of failures
        # (or an empty list) never triggers a dump.
        if len(tweets) % checkpoint_every == 0:
            file_path = os.path.join(checkpoint_dir, 'tweets_' + str(len(tweets)) + '.json')
            with io.open(file_path, mode='w') as f_out:
                json.dump(tweets, f_out)
            print(len(tweets))

    print('fetched', len(tweets), 'tweets;', len(failed_ids), 'ids could not be retrieved')

    # Attach the text to its annotation row; ids without a fetched tweet get ''.
    text_by_id = {tweet['id']: tweet['full_text'] for tweet in tweets}
    df = df.assign(tweet=df['tweet_id'].map(text_by_id).fillna(''))
    df.to_csv(df_path, index=False)

In [102]:

# Load the cached tweets and keep only the rows that actually carry text:
# neither missing nor an empty string.
df = pd.read_csv(df_path)
has_text = df.tweet.notnull() & (df.tweet != '')
df = df[has_text]

In [103]:
# Document-level embedder built on the transformer checkpoint named in `model_name`.
embedding = TransformerDocumentEmbeddings(model=model_name)

In [104]:
def embed_tweet(text):
    """Embed one tweet text and return its document embedding as a NumPy array."""
    embedded = embedding.embed(Sentence(text))[0]
    # Move to host memory and drop the autograd graph before converting.
    return embedded.get_embedding().cpu().detach().numpy()


# One embedding per remaining tweet, in DataFrame order.
tweet_embeddings = [embed_tweet(text) for text in df.tweet]

In [105]:
# Targets: the annotation column selected by `task`, in the same row order as
# the embeddings (both are derived from the filtered `df`).
label = df.loc[:, task].to_numpy()
# Features: the per-tweet embedding arrays combined into a single ndarray.
data = np.asarray(tweet_embeddings)

In [106]:
# Minimal starting estimator (one tree of depth one). It is the `estimator`
# referenced by the commented-out grid search further down.
model = xgb.XGBRFClassifier(
    n_estimators=1,
    max_depth=1,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method="gpu_hist",
)

In [107]:
# Search space for the grid search. The single-valued entries pin settings that
# are not being tuned. Regularisation knobs the author left disabled:
#   gamma / reg_alpha / reg_lambda: [0, 0.1, 0.2, 0.4, 0.8, 1.0]
param_grid = dict(
    learning_rate=[0.01, 0.03, 0.06],
    max_depth=[1, 3, 5, 6, 7, 8, 9, 10],
    n_estimators=[1, 2, 5, 7, 10, 15, 20, 25, 30],
    objective=['binary:logistic'],
    eval_metric=['auc'],
    tree_method=["gpu_hist"],
)

In [108]:
from sklearn.model_selection import GridSearchCV

In [109]:
# Disabled hyperparameter search over `param_grid` (3-fold CV, macro-F1 scoring).
# NOTE(review): if re-enabled as written, the third line rebinds `df` to the
# search results and thereby replaces the tweet DataFrame — use a different
# name before uncommenting.
#clf0 = GridSearchCV(estimator=model, scoring='f1_macro', param_grid=param_grid, n_jobs=10, verbose=1, cv=3)
#clf0.fit(data, label)
#df = pd.DataFrame(clf0.cv_results_)
#df

In [110]:
# Chosen configuration, scored with 10-fold cross-validation on the embeddings.
model = xgb.XGBRFClassifier(
    n_estimators=15,
    max_depth=1,
    learning_rate=0.01,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method="gpu_hist",
)
cv_scoring = ('f1_weighted', 'precision', 'recall')
cv_results = cross_validate(model, data, label, cv=10, scoring=cv_scoring)



In [111]:
# Mean over the folds, in the order (weighted F1, precision, recall).
tuple(
    cv_results[score_key].mean()
    for score_key in ('test_f1_weighted', 'test_precision', 'test_recall')
)

(0.8400990607537462, 0.8866704640388852, 0.9777777777777779)

In [67]:
# Show the parameter dictionary of the estimator currently bound to `model`.
# NOTE(review): this cell's execution count (67) is lower than that of the cell
# before it (111), so the recorded output may come from an earlier `model` —
# re-run after Restart & Run All to confirm.
model.get_params()

{'colsample_bynode': 0.8,
 'learning_rate': 0.01,
 'reg_lambda': 1e-05,
 'subsample': 0.8,
 'use_label_encoder': True,
 'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'max_delta_step': None,
 'max_depth': 1,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 15,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'scale_pos_weight': None,
 'tree_method': 'gpu_hist',
 'validate_parameters': None,
 'verbosity': None,
 'eval_metric': 'auc'}