## Setup

In [None]:
!pip install transformers --quiet
!pip install datasets --quiet
!pip install tensorflow --quiet
!pip install sklearn --quiet
!pip install tensorflow-addons --quiet
!pip install germansentiment --quiet
!pip install transformers[sentencepiece] --quiet
!pip install wandb --quiet

[K     |████████████████████████████████| 3.4 MB 14.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 24.2 MB/s 
[K     |████████████████████████████████| 67 kB 3.0 MB/s 
[K     |████████████████████████████████| 895 kB 44.5 MB/s 
[K     |████████████████████████████████| 596 kB 45.4 MB/s 
[K     |████████████████████████████████| 306 kB 29.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 51.5 MB/s 
[K     |████████████████████████████████| 243 kB 73.1 MB/s 
[K     |████████████████████████████████| 133 kB 67.7 MB/s 
[K     |████████████████████████████████| 144 kB 67.7 MB/s 
[K     |████████████████████████████████| 271 kB 70.9 MB/s 
[K     |████████████████████████████████| 160 kB 70.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 21.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 28.4 MB/s 
[K     |████████████████████████████████| 1.7 MB 29.3 MB/s 
[K     |████████████████████████████████| 180 kB 53.7 MB/s 
[K     |█████████████████

In [None]:
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
# A from-import that spans several lines must be parenthesised (or use a
# backslash); otherwise the trailing comma is a SyntaxError.
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TFAutoModelForSequenceClassification,
)
from sklearn.metrics import confusion_matrix
from huggingface_hub import notebook_login
from transformers.keras_callbacks import PushToHubCallback
from transformers import create_optimizer
from tensorflow.keras.callbacks import ModelCheckpoint
import time
from germansentiment import SentimentModel
import torch
from transformers import pipeline
import sklearn
import wandb
from wandb.integration.keras import WandbCallback
from sklearn.metrics import confusion_matrix
import seaborn as sns


In [None]:
# Read the labelled articles into a pandas DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='df_wirtschaft_labeled.csv')
df.head()

In [None]:
# Distribution of the reference labels for titles and bodies.
df[["label_title", "label_body"]].hist()

In [None]:
# Materialise the "title" and "body_512" columns as plain Python lists (the
# model calls below take lists) and print the first three entries of each.
input_titles_raw = list(df["title"])
input_bodies_raw = list(df["body_512"])
print(input_titles_raw[:3])
print(input_bodies_raw[:3])

In [None]:
# create english test lists for title and body
# helsinki-nlp opus
model_checkpoint = "Helsinki-NLP/opus-mt-de-en"
translator = pipeline("translation", model=model_checkpoint)
%time
input_titles_en_raw =[x["translation_text"] for x in translator(input_titles_raw)]
%time
input_bodies_en_raw =[x["translation_text"] for x in translator(input_bodies_raw)]

In [None]:
# german-sentiment-bert
# https://huggingface.co/oliverguhr/german-sentiment-bert
model = SentimentModel()
%time
df["gsb_title"] = model.predict_sentiment(input_titles_raw)
%time
df["gsb_body"]  = model.predict_sentiment(input_bodies_raw)
df["gsb_title"] = df["gsb_title"].str.replace("negative", "-1").str.replace("neutral", "0").str.replace("positive",
                                                                                                        "1").astype(int)
df["gsb_body"] = df["gsb_body"].str.replace("negative", "-1").str.replace("neutral", "0").str.replace("positive",
                                                                                                      "1").astype(int)

In [None]:
# https://huggingface.co/mdraw/german-news-sentiment-bert
model_f = SentimentModel('mdraw/german-news-sentiment-bert')
%time df["gsb_f_title"] = model_f.predict_sentiment(input_titles_raw)
%time
df["gsb_f_body"] = model_f.predict_sentiment(input_bodies_raw)
df["gsb_f_title"] = df["gsb_f_title"].str.replace("negative", "-1").str.replace("neutral", "0").str.replace("positive",
                                                                                                            "1").astype(
    int)
df["gsb_f_body"] = df["gsb_f_body"].str.replace("negative", "-1").str.replace("neutral", "0").str.replace("positive",
                                                                                                          "1").astype(
    int)

In [None]:
# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
# Tokenizer and classifier come from the same hub checkpoint; nlptown() reads
# them through the module-level names `tokenizer` and `model`.
nlptown_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(nlptown_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(nlptown_checkpoint)


def nlptown(input_raw):
    """Classify texts with the nlptown model and return sentiments as -1/0/1.

    Uses the module-level ``tokenizer`` and ``model``. Other cells of this
    notebook rebind both names, so call this while they still refer to the
    nlptown pair.

    Args:
        input_raw: Text or list of texts, passed unchanged to ``tokenizer``.

    Returns:
        list[int]: One entry per input row: -1 when the highest-scoring class
        index is 0 or 1, 0 when it is 2, and 1 for any higher index.
    """
    encoded = tokenizer(input_raw, padding=True, truncation=True, return_tensors="pt")
    # Inference only: without no_grad PyTorch records an autograd graph for the
    # whole batch, which costs memory and is never used.
    with torch.no_grad():
        outputs = model(**encoded)
    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    class_ids = outputs.logits.argmax(dim=-1).tolist()
    return [-1 if idx < 2 else 0 if idx < 3 else 1 for idx in class_ids]

%time
df["nlptown_bert_title"]  = nlptown(input_titles_raw)
split_bodies = [input_bodies_raw[i:i + 20] for i in range(0, len(input_bodies_raw), 20)]
%time
df["nlptown_bert_body"]  =[item for sublist in split_bodies for item in nlptown(sublist)]

In [None]:
# Inspect the prediction columns added so far; head() keeps the stored output
# small while still showing every column.
df.head()

In [None]:
# https://huggingface.co/deepset/bert-base-german-cased-sentiment-Germeval17
# Tokenizer and classifier are loaded from the same hub checkpoint.
deepset_checkpoint = "deepset/bert-base-german-cased-sentiment-Germeval17"
tokenizer_deepset = AutoTokenizer.from_pretrained(deepset_checkpoint)
model_deepset = AutoModelForSequenceClassification.from_pretrained(deepset_checkpoint)


def deepset(input_raw, tokenizer, model):
    """Classify texts with a PyTorch sequence classifier and return -1/0/1.

    Args:
        input_raw: Text or list of texts, passed unchanged to ``tokenizer``.
        tokenizer: Callable producing the keyword inputs for ``model``
            (called with padding, truncation and ``return_tensors="pt"``).
        model: Callable returning an object with a ``logits`` tensor.

    Returns:
        list[int]: One entry per input row: -1 for class index 0, 0 for class
        index 1 and 1 for any higher index.
    """
    # NOTE(review): assumes the checkpoint orders its classes
    # negative, neutral, positive - confirm against the model config.
    encoded = tokenizer(input_raw, padding=True, truncation=True, return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = model(**encoded)
    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    class_ids = outputs.logits.argmax(dim=-1).tolist()
    return [-1 if idx < 1 else 0 if idx < 2 else 1 for idx in class_ids]

%time
df["deepset_title"]  = deepset(input_titles_raw, tokenizer_deepset, model_deepset)
split_bodies = [input_bodies_raw[i:i + 20] for i in range(0, len(input_bodies_raw), 20)]
%time
df["deepset_body"]  =[item for sublist in split_bodies for item in deepset(sublist, tokenizer_deepset, model_deepset)]

In [None]:
# roberta_en_3_classes
# Tokenizer (t_r) and classifier (m_r) are loaded from the same hub checkpoint.
roberta_checkpoint = "j-hartmann/sentiment-roberta-large-english-3-classes"
t_r = AutoTokenizer.from_pretrained(roberta_checkpoint)
m_r = AutoModelForSequenceClassification.from_pretrained(roberta_checkpoint)


def roberta_en(input_raw, tokenizer, model):
    """Classify (translated) texts with a PyTorch classifier and return -1/0/1.

    Args:
        input_raw: Text or list of texts, passed unchanged to ``tokenizer``.
        tokenizer: Callable producing the keyword inputs for ``model``
            (called with padding, truncation and ``return_tensors="pt"``).
        model: Callable returning an object with a ``logits`` tensor.

    Returns:
        list[int]: One entry per input row: -1 for class index 0, 0 for class
        index 1 and 1 for any higher index.
    """
    # predict on translation
    encoded = tokenizer(input_raw, padding=True, truncation=True, return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = model(**encoded)
    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    class_ids = outputs.logits.argmax(dim=-1).tolist()
    return [-1 if idx < 1 else 0 if idx < 2 else 1 for idx in class_ids]

%time
df["roberta_en_title"]  = roberta_en(input_titles_en_raw, t_r, m_r)
chunk_size = 5
split_bodies = [input_bodies_raw[i:i + 5] for i in range(0, len(input_bodies_en_raw), 5)]
%time
df["roberta_en_body"]  =[item for sublist in split_bodies for item in roberta_en(sublist, t_r, m_r)]

In [None]:
# distilbert: tokenizer of the base (uncased) checkpoint
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# helsinki-nlp opus: translation pipeline for the English -> German direction
model_checkpoint = "Helsinki-NLP/opus-mt-en-de"
translator_en_de = pipeline(task="translation", model=model_checkpoint)

In [None]:
# distilbert_untrained
checkpoint = 'distilbert-base-uncased'
t_d = AutoTokenizer.from_pretrained(checkpoint)
# distilbert_en() maps class indices 0/1/2 to -1/0/1, so the classification
# head needs three outputs; with the library default of two labels the
# "positive" class could never be predicted.
m_d = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)


def distilbert_en(input_raw, tokenizer, model):
    """Classify texts with a TensorFlow sequence classifier and return -1/0/1.

    Args:
        input_raw: Text or list of texts, passed unchanged to ``tokenizer``.
        tokenizer: Callable producing the model input (called with padding,
            truncation and ``return_tensors="tf"``).
        model: Callable taking the tokenizer output and returning an object
            with a ``logits`` attribute of one score row per input.

    Returns:
        list[int]: One entry per input row: -1 for class index 0, 0 for class
        index 1 and 1 for any higher index.
    """
    encoded = tokenizer(input_raw, padding=True, truncation=True, return_tensors="tf")
    outputs = model(encoded)
    # One vectorised arg-max on the host replaces indexing every tensor element
    # from Python; softmax is monotonic, so the logits give the same class.
    class_ids = np.asarray(outputs.logits).argmax(axis=-1).tolist()
    return [-1 if idx < 1 else 0 if idx < 2 else 1 for idx in class_ids]

%time
df["distilbert_untrained_en_title"]  = distilbert_en(input_titles_en_raw, t_d, m_d)
chunk_size = 5
split_bodies = [input_bodies_raw[i:i + 5] for i in range(0, len(input_bodies_en_raw), 5)]
%time
df["distilbert_untrained_en_body"]  =[item for sublist in split_bodies for item in distilbert_en(sublist, t_d, m_d)]

In [None]:
# distilbert_untrained WITH GERMAN TEXT
checkpoint = 'distilbert-base-uncased'
t_d = AutoTokenizer.from_pretrained(checkpoint)
m_d = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)


def distilbert_en(input_raw, tokenizer, model):
    input = tokenizer(input_raw, padding=True, truncation=True, return_tensors="tf")
    outputs = model(input)
    predictions = tf.math.softmax(outputs.logits, axis=-1)
    predictions_int = []
    for pred in predictions:
        index_max = max(range(len(pred)), key=pred.__getitem__)
        sentiment = -1 if index_max < 1 else 0 if index_max < 2 else 1
        predictions_int.append(sentiment)
    return predictions_int

%time
df["distilbert_untrained_title"]  = distilbert_en(input_titles_raw, t_d, m_d)
chunk_size = 5
split_bodies = [input_bodies_raw[i:i + 5] for i in range(0, len(input_bodies_raw), 5)]
%time
df["distilbert_untrained_body"]  =[item for sublist in split_bodies for item in distilbert_en(sublist, t_d, m_d)]

In [None]:
# Read the training data (an ISO-8859-1 encoded csv) into a DataFrame and
# preview the first five rows.
df_train = pd.read_csv(filepath_or_buffer='all-data.csv', encoding="ISO-8859-1")
df_train.head()

In [None]:
# Turn the string labels into integer class ids: negative -> 0, neutral -> 1, positive -> 2.
label_codes = df_train["label"]
for label_name, label_code in (("negative", "0"), ("neutral", "1"), ("positive", "2")):
    label_codes = label_codes.str.replace(label_name, label_code)
df_train["label"] = label_codes.astype(int)

In [None]:
# create arrow dataset from pandas
dataset = Dataset.from_pandas(df_train)
# split arrow dataset into training and validation; the fixed seed makes the
# 90/10 split (and every result derived from it) reproducible across kernel restarts
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_data = dataset_split['train']
validation_data = dataset_split['test']
# create dataset dict which holds both datasets and allows map() to run on both
dataset = DatasetDict({
    'train': train_data,
    'validation': validation_data
})
# use sklearn to calculate classweights ('balanced' weighting)
class_labels = np.unique(df_train['label'])
class_weights = sklearn.utils.class_weight.compute_class_weight('balanced', classes=class_labels, y=df_train['label'])
# Key every weight by its actual label instead of by its position, so the
# mapping stays right even if the labels are not exactly 0..k-1.
class_weights = dict(zip(class_labels.tolist(), class_weights))
print("class_weights:", class_weights)


In [None]:
# Tokenizer matching the distilbert checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(dat):
    """Tokenize the ``headline`` field of a batch with truncation enabled.

    Reads the module-level ``tokenizer``; no padding is applied here.
    """
    headlines = dat["headline"]
    return tokenizer(headlines, truncation=True)


# Tokenize every split of the dataset dict in batches; the collator is built
# from the same tokenizer and returns TensorFlow tensors.
dataset_t = dataset.map(function=tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# Training batches: shuffled, 16 examples each, padded by the data collator.
tf_train_dataset = dataset_t["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    label_cols=["label"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

# Validation batches must come from the held-out "validation" split; building
# them from "train" (as before) only measures performance on seen data.
tf_validation_dataset = dataset_t["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    label_cols=["label"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
# Training hyper-parameters
batch_size = 16
num_epochs = 5
init_lr = 2e-5

# Number of optimizer steps for the schedule: whole batches per epoch times epochs
batches_per_epoch = len(dataset_t["train"]) // batch_size
total_train_steps = int(num_epochs * batches_per_epoch)
optimizer, schedule = create_optimizer(
    init_lr=init_lr,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)

# Three-class classifier on top of `checkpoint`.
# NOTE(review): no loss is passed to compile(); presumably the model's
# built-in loss is used - confirm.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
model.compile(optimizer=optimizer)

In [None]:
# Start a Weights & Biases run and record the hyper-parameters. The learning
# rate is taken from `init_lr` so the logged value cannot drift from the one
# the optimizer was built with.
wandb.init(project="news-sentiment", entity="fogx", config={"batch_size": batch_size, "lr": init_lr})
callbacks = [WandbCallback()]
name_model = 'Distilbert'

In [None]:
# Fine-tune the classifier. Validation uses the dedicated validation dataset;
# validating on tf_train_dataset (as before) reports training performance.
# NOTE(review): `callbacks` (the W&B callback list) is defined in the notebook
# but not passed here - decide whether to add `callbacks=callbacks`.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    class_weight=class_weights
)

In [None]:
# Save the fine-tuned model below the current working directory. The path is
# kept in `model_dir` so the builtin `dir()` is not shadowed for the rest of
# the session.
model_dir = os.path.join(os.getcwd(), 'checkpoints_distilbert', "model_best_lr_2e5")
model.save_pretrained(model_dir)

In [None]:
# Smoke test: class probabilities of the current `model` for the first three
# translated titles.
sample_titles = input_titles_en_raw[:3]
t = tokenizer(sample_titles, padding=True, truncation=True, return_tensors="tf")
o = model(t)
p = tf.nn.softmax(o.logits, axis=-1)
print(p)

In [None]:
# distilbert_trained
checkpoint = 'distilbert-base-uncased'


def distilbert_trained(input_raw, tokenizer, model):
    """Classify texts with a TensorFlow sequence classifier and return -1/0/1.

    Args:
        input_raw: Text or list of texts, passed unchanged to ``tokenizer``.
        tokenizer: Callable producing the model input (called with padding,
            truncation and ``return_tensors="tf"``).
        model: Callable taking the tokenizer output and returning an object
            with a ``logits`` attribute of one score row per input.

    Returns:
        list[int]: One entry per input row: -1 for class index 0, 0 for class
        index 1 and 1 for any higher index (the training labels in this
        notebook are encoded negative=0, neutral=1, positive=2).
    """
    encoded = tokenizer(input_raw, padding=True, truncation=True, return_tensors="tf")
    outputs = model(encoded)
    # One vectorised arg-max on the host replaces indexing every tensor element
    # from Python; softmax is monotonic, so the logits give the same class.
    class_ids = np.asarray(outputs.logits).argmax(axis=-1).tolist()
    return [-1 if idx < 1 else 0 if idx < 2 else 1 for idx in class_ids]


distilbert_trained(input_titles_raw[:3], tokenizer, model)

%time
df["distilbert_trained_en_lr2_title"]  = distilbert_trained(input_titles_en_raw, tokenizer, model)
chunk_size = 5
split_bodies = [input_bodies_raw[i:i + 5] for i in range(0, len(input_bodies_en_raw), 5)]
%time
df["distilbert_trained_en_lr2_body"]  =[item for sublist in split_bodies for item in distilbert_trained(sublist, tokenizer, model)]
print("with german texts")
%time
df["distilbert_trained_lr2_title"]  = distilbert_trained(input_titles_raw, tokenizer, model)
chunk_size = 5
split_bodies = [input_bodies_raw[i:i + 5] for i in range(0, len(input_bodies_raw), 5)]
%time
df["distilbert_trained_lr2_body"]  =[item for sublist in split_bodies for item in distilbert_trained(sublist, tokenizer, model)]

In [None]:
# evaluation: accuracy and confusion matrices of every classifier, separately
# for titles and bodies, against the reference labels in df.
columns = ["gsb", "gsb_f", "nlptown_bert", "deepset", "roberta_en", "distilbert_untrained_en", "distilbert_untrained",
           "distilbert_trained_en", "distilbert_trained", "distilbert_trained_en_lr2", "distilbert_trained_lr2"]
# Fixed class order so every matrix is 3x3 with rows/columns -1, 0, 1, even
# when a classifier never predicts one of the classes.
# NOTE(review): assumes label_title/label_body use the same -1/0/1 encoding as
# the prediction columns - confirm.
sentiment_labels = [-1, 0, 1]
cms = {}
df_len = len(df)
for col in columns:
    col_title = f"{col}_title"
    col_body = f"{col}_body"
    # Some listed classifiers have no prediction columns unless the matching
    # cells were run; report and skip them instead of failing with a KeyError.
    if col_title not in df.columns or col_body not in df.columns:
        print(f'skipping {col}: no prediction columns in df')
        continue
    title_correct = (df["label_title"] == df[col_title]).sum()
    body_correct = (df["label_body"] == df[col_body]).sum()
    # The percent format avoids float artefacts such as 56.99999999999999%.
    print(f'correct in {col_title}: {title_correct}/{df_len} -> {title_correct / df_len:.0%}')
    print(f'correct in {col_body}: {body_correct}/{df_len} -> {body_correct / df_len:.0%}')
    #confusion matrices
    cm_t = confusion_matrix(df["label_title"], df[col_title], labels=sentiment_labels)
    cm_b = confusion_matrix(df["label_body"], df[col_body], labels=sentiment_labels)
    cms[col] = [cm_t, cm_b]

In [None]:
# Confusion matrices stored for the "gsb" classifier: [title matrix, body matrix].
cms["gsb"]

In [None]:
# Heatmap of the title confusion matrix (index 0) of "distilbert_trained_en".
# fmt='d' prints the integer counts in full instead of seaborn's default
# short float format.
ax = sns.heatmap(cms["distilbert_trained_en"][0], annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion matrix')
ax.set_xlabel('Predicted values')
ax.set_ylabel('Actual Values')
# confusion_matrix orders classes ascending (-1, 0, 1), i.e. negative,
# neutral, positive; the previous tick labels were in the reverse order.
ax.xaxis.set_ticklabels(['negative', 'neutral', 'positive'])
ax.yaxis.set_ticklabels(['negative', 'neutral', 'positive'])