## **Prerequisites**

In [1]:
# install needed packages simpletransformers
!pip install simpletransformers scikit-learn jedi Counter lxml openpyxl


Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting jedi
  Downloading jedi-0.19.0-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Counter
  Downloading Counter-1.0.0.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.31.0 (from simpletransformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Coll

In [2]:
# import needed modules
import random as rn
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from collections import Counter
import openpyxl
import gc

ModuleNotFoundError: ignored

In [None]:
# load packages to make progress bar of simpletransformers in vs code work
#from tqdm import tqdm
#from ipywidgets import interact
#import ipywidgets as widgets

In [None]:
# mount GDrive to be able to import data
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# set global seed for reproducibility of results
seed = 1337
rn.seed(seed)         # Python's built-in `random` module (imported as `rn`)
np.random.seed(seed)  # NumPy's global random generator

## **Import training data**

In [None]:
# show every column when a DataFrame is printed
pd.set_option('display.max_columns', None)

# read the labelled articles stored in GDrive and keep only the article text and its label
# (optional: chain .dropna() after the column selection to remove empty excel rows)
columns_of_interest = ["Text", "opinion"]
df = pd.read_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierOpi/articles_opi_final_edited.xlsx")[columns_of_interest]
print(df.head())

                                                Text  opinion
0  Ein neues Jahr beginnt. Es liegt vor uns wie N...        1
1  Eine große Kraftanstrengung wurde uns versproc...        1
2  Eine Leserin schrieb zu einem Beitrag über die...        1
3  Kein Zweifel: Jeder hat das Recht, seine Anspr...        1
4  Daten sind ein ganz besonderer Stoff. Flüchtig...        1


In [None]:
# get length of imported dataset
len(df)

54158

In [None]:
# check distribution of 1s = opinion piece / 0s = descriptive article
print(Counter(df['opinion'].values))

Counter({0: 27081, 1: 27077})


# **Prepare data for training of final model**

In [None]:
# construct test (10%) and train dataset (90%); stratify on the label so both splits keep the class balance
split = train_test_split(df[["Text", "opinion"]], test_size = 0.1, stratify= df['opinion'], random_state = seed)

In [None]:
# split holds the training part at index 0 and the test part at index 1
train_data, test_data = split[0], split[1]
train_data

Unnamed: 0,Text,opinion
13580,Die langjährige Vertraute des in Haft zu Tode ...,0
47838,Deal or no Deal: Im Endspiel um den Brexit ble...,1
49301,Die Hoffnung auf eine schnelle Erholung der de...,0
50263,Wer im Dezember hierzulande die letzte Chance ...,0
18525,Berlin. Der Patentstreit zwischen dem Lautspre...,0
...,...,...
49766,Die Frage klingt harmlos. „Willst du eine klei...,0
24860,"Es ist praktisch für Politiker, wenn es im eig...",1
41152,"Es ist dunkel, es ist laut und – windig. Wer d...",1
45431,Es ist nicht mehr lange bis zum ersten Jahrest...,1


In [None]:
test_data

Unnamed: 0,Text,opinion
9508,Mit Blick auf das milliardenschwere Beihilfepa...,0
47811,Die Corona-Pandemie hat in den Nationen Europa...,1
13591,"Wer hat die Oberhand? Das Militär, das seit de...",0
6363,Ob die griechische Außenpolitik immer diplomat...,1
25318,mRNA-Impfstofftechnologie ist – von diversen p...,1
...,...,...
5180,Wenn es in der Politik - wie Zyniker sagen - i...,1
30942,Die gute Fee ist ein Taxifahrer. Plötzlich ste...,0
28030,Ähnlich wie vor vier Jahren als Finanzminister...,1
39643,Die Reise des deutschen Bundeskanzlers Olaf Sc...,1


In [None]:
#check balance of constructed datasets
print(Counter(train_data['opinion'].values))

Counter({0: 24373, 1: 24369})


In [None]:
#check balance of constructed datasets
print(Counter(test_data['opinion'].values))

Counter({0: 2708, 1: 2708})


# **Define settings for the training process/model**


In [None]:
# define hyperparameters for model (https://simpletransformers.ai/docs/usage/)

# example for understanding batch size and epochs:
# Assume you have a dataset with 200 samples (rows of data) and you choose a batch size of 5 and 1,000 epochs.
# This means that the dataset will be divided into 40 batches, each with five samples. The model weights will be updated after each batch of five samples.
# This also means that one epoch will involve 40 batches or 40 updates to the model.
# With 1,000 epochs, the model will be exposed to or pass through the whole dataset 1,000 times. That is a total of 40,000 batches during the entire training process.


train_args ={"reprocess_input_data": True, # If True, the input data will be reprocessed even if a cached file of the input data exists in the cache_dir.
             "overwrite_output_dir": True, # If True, the trained model will be saved to the output_dir and will overwrite existing saved models in the same directory.
             "use_cached_eval_features": True, # True is ok here since every evaluation uses the same test dataset; the validation set is not tokenized again each time a validation is conducted
             "output_dir": "outputs", # The directory where all outputs will be stored. This includes model checkpoints and evaluation results.
             "fp16": True, # fp16 = True when a graphics card is available, otherwise fp16 = False
             "max_seq_length": 512, # maximum number of tokens that a sequence can contain. Any tokens that appear after the max_seq_length will be truncated (max value: 512)
             "sliding_window": False, # Whether to use sliding window technique to prevent truncating sequences longer than 512 tokens
             "num_train_epochs": 15, # defines the number of times that the learning algorithm will work through the entire training dataset. Take a high number since early stopping will stop the training automatically when it is enough
             "train_batch_size": 16, # defines the number of samples to work through before updating the internal model parameters (smaller = better / 32 common for use / see: https://wandb.ai/ayush-thakur/dl-question-bank/reports/What-s-the-Optimal-Batch-Size-to-Train-a-Neural-Network---VmlldzoyMDkyNDU)
            # use the following if the machine has not enough gpu ram for bigger batch sizes:
            # "gradient_accumulation_steps": 2, # e.g. 16 batch size * 2 gradient accumulation = 32 batch size (uses batch size 16 but updates the internal model parameters only after 2 batches are worked through)
            # when using gradient accumulation use for evaluate_during_training_steps -> len(data) / batch size / gradient_accumulation_steps = steps per epoch
             "use_early_stopping": True, # prevent model from overfitting
             "early_stopping_metric": 'eval_loss', # choose evaluation metric for early stopping (other metric e.g. mcc) -> eval_loss: how well the model extrapolates to unseen data
             "early_stopping_delta": 0.01, # an evaluation only counts as an improvement if eval_loss gets better by more than 0.01
             "early_stopping_metric_minimize": True, # eval_loss should be minimized (note: if mcc is used, it should be maximized!); key spelling fixed - the misspelled key was silently ignored
             "evaluate_during_training": True, # evaluation will be performed during training to monitor the training process closely in order to find best model
             "evaluate_during_training_steps": 609, # Perform evaluation at every specified number of steps. In this case evaluate 5 times every epoch (steps_per_epoch/5)
             "early_stopping_patience": 15, # Terminate training after this many evaluations without an improvement in the evaluation metric greater than early_stopping_delta
             "evaluate_during_training_verbose": True, # Print results from evaluation during training.
             "manual_seed": seed, # for reproducible results
             "save_model_every_epoch": False, # not needed since model is evaluated 5 times per epoch
             "save_steps": -1} # don't save a checkpoint every 2000 steps (the default)


# Create a BERT ClassificationModel using the pretrained german BERT model (cased -> takes into account lowercase and uppercase letters)
# models are imported from huggingface (see for a list: https://huggingface.co/transformers/v3.3.1/pretrained_models.html)
model = ClassificationModel(
    "bert", "bert-base-german-cased",
    num_labels=2,
    args=train_args,
    use_cuda = True
)


Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [None]:
model.args

ClassificationArgs(adafactor_beta1=None, adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_eps=(1e-30, 0.001), adafactor_relative_step=True, adafactor_scale_parameter=True, adafactor_warmup_init=True, adam_betas=(0.9, 0.999), adam_epsilon=1e-08, best_model_dir='outputs/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=0, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0.01, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=15, encoding=None, eval_batch_size=8, evaluate_during_training=True, evaluate_during_training_silent=True, evaluate_during_training_steps=609, evaluate_during_training_verbose=True, evaluate_each_epoch=True, fp16=True, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, loss_type=None, loss_args={}, manual_seed

In [None]:
# check how many steps per epoch will be conducted using a batch size of 16 when 90% of the data is used for training (the remaining 10% is the held-out test/validation data)
steps_per_epoch = (len(df)*0.9)/float(train_args['train_batch_size'])
steps_per_epoch

3046.3875000000003

In [None]:
# evaluate 5 times per epoch
steps_per_epoch/5

609.2775

# **Train final model**

In [None]:
# fine-tune the pretrained bert-base-german-cased model with our final training dataset
# to use early stopping, eval_df has to be defined here
model.train_model(train_data, eval_df = test_data)



  0%|          | 0/48742 [00:00<?, ?it/s]

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Running Epoch 0 of 15:   0%|          | 0/3047 [00:00<?, ?it/s]



  0%|          | 0/5416 [00:00<?, ?it/s]



Running Epoch 1 of 15:   0%|          | 0/3047 [00:00<?, ?it/s]



Running Epoch 2 of 15:   0%|          | 0/3047 [00:00<?, ?it/s]



Running Epoch 3 of 15:   0%|          | 0/3047 [00:00<?, ?it/s]



Running Epoch 4 of 15:   0%|          | 0/3047 [00:00<?, ?it/s]



(14007,
 defaultdict(list,
             {'global_step': [609,
               1218,
               1827,
               2436,
               3045,
               3047,
               3654,
               4263,
               4872,
               5481,
               6090,
               6094,
               6699,
               7308,
               7917,
               8526,
               9135,
               9141,
               9744,
               10353,
               10962,
               11571,
               12180,
               12188,
               12789,
               13398,
               14007],
              'train_loss': [0.03534746170043945,
               0.04830598831176758,
               0.617581844329834,
               0.30817630887031555,
               0.8521475791931152,
               0.0893961563706398,
               0.08204632997512817,
               0.07488429546356201,
               0.04532872140407562,
               0.036057353019714355,
            

# **Test model and evaluate performance**


In [None]:
# load best model
# the best model found during training is written to <output_dir>/best_model, i.e. outputs/best_model
model = ClassificationModel(
    "bert", "outputs/best_model",
    num_labels=2,
    args=train_args,
    use_cuda = True
    )

In [None]:
# test if fine-tuned model works (article text is from a descriptive article from faz.de)

test = "Dem polnischen Geheimdienst ist nach Angaben der Regierung in Warschau ein Schlag gegen russische Spionage gelungen. 'Das ganze Spionagenetzwerk wurde auseinandergenommen', sagte Verteidigungsminister Mariusz Blaszczak in einem Interview des öffentlich-rechtlichen Senders Polskie Radio. Er sprach von einer realen Bedrohung für Polen.'Das war ganz klar ein Spionagering, der Informationen gesammelt hat zugunsten derer, die die Ukraine angegriffen haben und dort Kriegsverbrechen begehen', sagte er weiter. Details nannte er nicht, das Innenministerium kündigte jedoch eine Pressekonferenz an. Nach unbestätigten Informationen des Radiosenders Rmf.fm soll das Spionagenetzwerk vor allem Bahnstrecken ausgespäht haben. Der Geheimdienst habe an wichtigen Strecken und Knotenpunkten versteckte Kameras entdeckt, die den Verkehr auf den Gleisen aufnahmen und die Bilder per Internet übermittelten. Sechs Personen, alle Ausländer, seien festgenommen worden. Polen ist das wichtigste Drehkreuz für die militärische Hilfe für die Ukraine, so dass viele Rüstungstransporte per Bahn das Land durchqueren. Seit dem russischen Angriff auf die Ukraine sind in vielen europäischen Ländern russische Spione enttarnt worden, etwa in Schweden. Auch wurden vielerorts Mitarbeiter der russischen Botschaften ausgewiesen, bei denen die Gastländer von Verbindungen zu den Geheimdiensten Moskaus ausgingen. Ende vergangenen Jahres war in Deutschland ein mutmaßlicher Doppelagent enttarnt worden. Der Mitarbeiter des Bundesnachrichtendienstes soll für Russland spioniert haben."

predictions, raw_outputs = model.predict([test])

# 0 = descriptive article / 1 = opinion_piece
print(predictions[0])

In [None]:
# evaluate model performance using test_data dataframe

# evaluation metrics:
# mcc - Matthews correlation coefficient (range -1:1): close to 1 good / close to 0 no better than chance / under 0 bad performance -> doing the opposite of what is expected
# tp: true positive
# tn: true negative
# fp: false positive
# fn: false negative
# for the following: https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc?hl=de
# auroc (range 0:1): is the area under the curve where x is false positive rate (FPR) and y is true positive rate (TPR). Tells whether model has good discriminatory ability: 0.70 – 0.80 is good performance, greater than 0.8 is excellent performance
# auprc: is the area under the curve where x is recall and y is precision.
# eval_loss: loss of the model on the evaluation data (lower is better) - NOTE(review): confirm the exact loss function in the simpletransformers docs

results, model_outputs, wrong_predictions = model.eval_model(test_data)

In [None]:
results

# **Save model**

In [None]:
# save files of model written on colab to /content/output

import os
import tarfile

def save_model(model_path='',file_name=''):
  """Pack the top-level files of a model directory into a gzip-compressed tar archive.

  Only the files directly inside ``model_path`` are archived; files in
  sub-directories are not included. Archive members are stored under the
  name ``<model_path>/<file>``.

  Args:
    model_path: directory that contains the model files.
    file_name: archive path without extension; ``.tar.gz`` is appended.

  Raises:
    FileNotFoundError: if ``model_path`` does not exist or is not a directory.
  """
  # os.walk yields nothing for a missing path or a non-directory; check before
  # the archive is opened so that no empty archive is left behind
  top_level = next(os.walk(model_path), None)
  if top_level is None:
    raise FileNotFoundError(f'model directory not found: {model_path!r}')
  _, _, file_names = top_level
  with tarfile.open(file_name+ '.tar.gz', 'w:gz') as archive:
    for name in file_names:
      archive.add(f'{model_path}/{name}')

In [None]:
save_model('outputs/best_model','classifier_opinion_decriptive_no_slid_batch_16')

In [None]:
!tar -zxvf ./classifier_opinion_decriptive_no_slid_batch_16.tar.gz

outputs/best_model/tokenizer.json
outputs/best_model/scheduler.pt
outputs/best_model/eval_results.txt
outputs/best_model/config.json
outputs/best_model/training_args.bin
outputs/best_model/special_tokens_map.json
outputs/best_model/tokenizer_config.json
outputs/best_model/optimizer.pt
outputs/best_model/pytorch_model.bin
outputs/best_model/model_args.json
outputs/best_model/vocab.txt


In [None]:
import os
import tarfile

def unpack_model(model_name=''):
  """Extract ``<model_name>.tar.gz`` into the current working directory.

  Args:
    model_name: archive path without the ``.tar.gz`` extension.
  """
  # NOTE(review): extractall() trusts the member paths stored in the archive;
  # only unpack archives that you created yourself.
  # The context manager closes the archive even if extraction fails.
  with tarfile.open(f"{model_name}.tar.gz", "r:gz") as tar:
    tar.extractall()

unpack_model('classifier_opinion_decriptive-4792')

In [None]:
# download files or move folder outputs to /content/drive/MyDrive to be able to download files
from google.colab import files
files.download("/content/outputs/config.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Load model again**

In [None]:
# strategy for dealing with texts containing > 512 tokens: https://simpletransformers.ai/docs/classification-specifics/#dealing-with-long-text

# hyperparameters for the reloaded model (https://simpletransformers.ai/docs/usage/)
train_args = dict(
    reprocess_input_data=True,  # reprocess the input data even if a cached file of it exists in the cache_dir
    overwrite_output_dir=True,  # allow saving into output_dir even if it already contains a saved model
    output_dir="/content/drive/MyDrive/news_classification",  # directory for all outputs (model checkpoints, evaluation results)
    fp16=True,  # True when a graphics card is available, otherwise False
)

# assuming that our files are saved under /content/outputs
# (alternative location: "/content/drive/MyDrive/news_classification/outputs")
checkpoint_dir = "/content/outputs/checkpoint-3594"
model = ClassificationModel("bert", checkpoint_dir, num_labels=2, args=train_args)

In [None]:
test = "Nach dem Absturz einer unbemannten Militärdrohne über dem Schwarzen Meer hat das amerikanische Militär Bildmaterial von dem Vorfall veröffentlicht. Darauf ist zu sehen, wie ein russisches Kampfflugzeug beim Anflug auf die Drohne Treibstoff ablässt und dann extrem nah heranfliegt. Insgesamt hat der Vorfall mit dem russischen Kampfflugzeug nach Angaben des Verteidigungsministeriums in Washington 30 bis 40 Minuten gedauert, das veröffentlichte Video zeigt demnach die entscheidenden Momente. Die Kamera der MQ-9 Reaper ist auf den veröffentlichten Aufnahmen nach hinten gerichtet, zum Teil ist der Propeller der Drohne zu sehen. Bei einem zweiten Anflug ließ der russische Su-27 abermals Benzin ab und stieß dann mit der Drohne zusammen, wie die amerikanische Kommandozentrale für Europa am Donnerstag mitteilte. Der Zusammenstoß ist in dem Video jedoch nicht zu sehen. Nach US-Angaben fiel die Kamera rund eine Minute aus. Im Anschluss ist in dem Video der teils beschädigte Propeller der Drohne zu sehen. Nach amerikanischen Angaben war die Drohne am Dienstag im internationalen Luftraum über dem Schwarzen Meer mit einem russischen Kampfjet kollidiert. Das US-Militär schilderte, zwei russische Kampfjets hätten ein Abfangmanöver begonnen. Dabei habe einer der Kampfjets den Propeller der US-Drohne getroffen. Die USA beklagten ein „unprofessionelles“, „unsicheres“ und „rücksichtsloses“ Handeln der russischen Piloten.„Wir haben Videobeweise für all das“, sagte Generalstabschef Mark Milley. Auf die Frage, ob die russischen Piloten mit Absicht gehandelt hätten, sagte er, das Abfangmanöver und das aggressive Handeln habe die russische Seite mit Absicht betrieben. Ob das Kampfflugzeug auch absichtlich die Drohne getroffen und damit zu Boden gezwungen habe, müsse sich noch zeigen."

predictions, raw_outputs = model.predict([test])
predictions

# **Export wrongly classified articles**

In [None]:
test_data = test_data.reset_index(drop=True)

In [None]:
# Show false predictions
# run the predictions on the test dataset; pass a plain list of strings so the call
# does not depend on the index labels of the pandas Series
predictions, _ = model.predict(test_data['Text'].tolist())

# build a DataFrame with the texts, the true labels and the predicted labels
results = pd.DataFrame({'Text': test_data['Text'], 'true_label': test_data['opinion'], 'predicted_label': predictions})

# keep only the rows where the model predicted the wrong label
wrong_predictions = results[results['true_label'] != results['predicted_label']]

In [None]:
file_path = '/content/drive/MyDrive/news_classification/falsch_vorhergesagte_beispiele.xlsx'
wrong_predictions.to_excel(file_path, index=False)

# **Apply the model to another dataset and store the predictions in a DataFrame so they can be merged with that dataset (if the model does not predict -> use a runtime without extended RAM or set multiprocessing = False)**

In [None]:
train_args ={"reprocess_input_data": True, # If True, the input data will be reprocessed even if a cached file of the input data exists in the cache_dir.
             "overwrite_output_dir": True, # If True, the trained model will be saved to the output_dir and will overwrite existing saved models in the same directory.
             "use_cached_eval_features": True, # True is ok here since every evaluation uses the same test dataset; the validation set is not tokenized again each time a validation is conducted
             "output_dir": "outputs", # The directory where all outputs will be stored. This includes model checkpoints and evaluation results.
             "fp16": True, # fp16 = True when a graphics card is available, otherwise fp16 = False
             "max_seq_length": 512, # maximum number of tokens that a sequence can contain. Any tokens that appear after the max_seq_length will be truncated (max value: 512)
             "sliding_window": False, # Whether to use sliding window technique to prevent truncating sequences longer than 512 tokens
             "num_train_epochs": 15, # defines the number of times that the learning algorithm will work through the entire training dataset. Take a high number since early stopping will stop the training automatically when it is enough
             "train_batch_size": 16, # defines the number of samples to work through before updating the internal model parameters (smaller = better / 32 common for use / see: https://wandb.ai/ayush-thakur/dl-question-bank/reports/What-s-the-Optimal-Batch-Size-to-Train-a-Neural-Network---VmlldzoyMDkyNDU)
            # use the following if the machine has not enough gpu ram for bigger batch sizes:
            # "gradient_accumulation_steps": 2, # e.g. 16 batch size * 2 gradient accumulation = 32 batch size (uses batch size 16 but updates the internal model parameters only after 2 batches are worked through)
            # when using gradient accumulation use for evaluate_during_training_steps -> len(data) / batch size / gradient_accumulation_steps = steps per epoch
             "use_early_stopping": True, # prevent model from overfitting
             "early_stopping_metric": 'eval_loss', # choose evaluation metric for early stopping (other metric e.g. mcc) -> eval_loss: how well the model extrapolates to unseen data
             "early_stopping_delta": 0.01, # an evaluation only counts as an improvement if eval_loss gets better by more than 0.01
             "early_stopping_metric_minimize": True, # eval_loss should be minimized (note: if mcc is used, it should be maximized!); key spelling fixed - the misspelled key was silently ignored
             "evaluate_during_training": True, # evaluation will be performed during training to monitor the training process closely in order to find best model
             "evaluate_during_training_steps": 609, # Perform evaluation at every specified number of steps. In this case evaluate 5 times every epoch (steps_per_epoch/5)
             "early_stopping_patience": 15, # Terminate training after this many evaluations without an improvement in the evaluation metric greater than early_stopping_delta
             "evaluate_during_training_verbose": True, # Print results from evaluation during training.
             "manual_seed": seed, # for reproducible results
             "save_model_every_epoch": False, # not needed since model is evaluated 5 times per epoch
             "save_steps": -1} # don't save a checkpoint every 2000 steps (the default)


# load the fine-tuned checkpoint that was saved to GDrive
model = ClassificationModel(
    "bert", "/content/drive/MyDrive/Masterarbeit/BertClassifierOpi/outputs/BERT_classifier_opinion_training_NOSW/checkpoint-7308",
    num_labels=2,
    args=train_args
)

In [None]:
# use dropna() to remove empty excel rows
news = pd.read_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierEnv/News/news_categorised_final_justEnv.xlsx")

news = news.dropna(subset=["text"])

In [None]:
len(news)

9690

In [None]:
#predict Dataset
predictions, _ = model.predict(news["text"].tolist())

#for SW
#predict Dataset
#predictions, _ = model16SW.predict(news["text"].tolist())

#Store results
results = pd.DataFrame({"text": news["text"], "opinion": predictions})

# match data with predictions
news_categorised = pd.concat([news, results["opinion"]], axis=1)

# print
print(news_categorised.head())

  0%|          | 0/9690 [00:00<?, ?it/s]

  0%|          | 0/1212 [00:00<?, ?it/s]

                                                link  \
0  https://abendblatt.de/hamburg/article232904205...   
1  https://abendblatt.de/hamburg/article232999471...   
2  https://abendblatt.de/hamburg/article233207877...   
3  https://abendblatt.de/hamburg/article233397401...   
4  https://abendblatt.de/hamburg/article233402137...   

                                                text  \
0  Hamburg. Den August verbinden viele Menschen i...   
1  Hamburg. Der Hochsommer ist zurück in Hamburg ...   
2  Hamburg. Die "Wow-Prognose des Sommers 2021" –...   
3  Hamburg. Es ist eine teilweise drastische Vers...   
4  Hamburg. Der Deutsche Wetterdienst (DWD) hat e...   

                                               title  \
0  Wetter in Hamburg – überraschende Prognose für...   
1  Hurra, wieder Sommer! Saharaluft bringt Wärme ...   
2  Verrückt! Längste Schönwetterphase des Sommers...   
3  Droht Hamburg ein flächendeckendes Dieselfahrv...   
4  Neuer Report: Klimawandel in Hamburg immer 

In [None]:
news_categorised.to_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierEnv/News/news_categorised_final_all.xlsx", index=False)