In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from typing import List
import torch
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
testtweets = json.load(open("../../../../res/tweet_example.json", encoding="utf8"))

In [4]:
model_name = "oliverguhr/german-sentiment-bert"
model = AutoModelForSequenceClassification.from_pretrained(model_name, torchscript=True).to("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
text_example = ["Ich hasse mein Leben so sehr.", "Ich liebe mein Leben"]
encoded = tokenizer.batch_encode_plus(text_example, padding=True, add_special_tokens=True,truncation=True, return_tensors="pt")
model.eval()

dummy_input = [encoded["input_ids"], encoded["attention_mask"]]

traced_model = torch.jit.trace(model, [encoded["input_ids"], encoded["attention_mask"]])
torch.jit.save(traced_model, "traced_bert.pt")

In [6]:
loaded_model = torch.jit.load("traced_bert.pt")
loaded_model.eval()

RecursiveScriptModule(
  original_name=BertForSequenceClassification
  (bert): RecursiveScriptModule(
    original_name=BertModel
    (embeddings): RecursiveScriptModule(
      original_name=BertEmbeddings
      (word_embeddings): RecursiveScriptModule(original_name=Embedding)
      (position_embeddings): RecursiveScriptModule(original_name=Embedding)
      (token_type_embeddings): RecursiveScriptModule(original_name=Embedding)
      (LayerNorm): RecursiveScriptModule(original_name=LayerNorm)
      (dropout): RecursiveScriptModule(original_name=Dropout)
    )
    (encoder): RecursiveScriptModule(
      original_name=BertEncoder
      (layer): RecursiveScriptModule(
        original_name=ModuleList
        (0): RecursiveScriptModule(
          original_name=BertLayer
          (attention): RecursiveScriptModule(
            original_name=BertAttention
            (self): RecursiveScriptModule(
              original_name=BertSelfAttention
              (query): RecursiveScriptModule(ori

In [10]:
listofTweets = [tweet["Text"] for tweet in testtweets]

In [11]:
# Daten müssen gesplittet werden, um in den Speicher zu passen
def split_dataframe(df, chunk_size = 50): 
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

In [27]:
clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
clean_http_urls = re.compile(r'https*\\S+', re.MULTILINE)
clean_at_mentions = re.compile(r'@\\S+', re.MULTILINE)

In [28]:
def clean_text(text):
    text = text.replace("\n", " ")        
    text = clean_http_urls.sub('',text)
    text = clean_at_mentions.sub('',text)                     
    text = clean_chars.sub('', text) # use only text chars                          
    text = ' '.join(text.split()) # substitute multiple whitespace with single whitespace   
    text = text.strip().lower()
    return text

In [29]:
texts = [clean_text(text) for text in listofTweets]

In [38]:
from IPython.display import clear_output

# Durchlaufen des Modells
ergebnisliste = []
laenge = len(texts)
chunksize = 1
for i, x in enumerate(split_dataframe(texts, chunksize)):
    if(x == []):
        break
    clear_output(wait=True)
    print("Prozent: ", (i+1)*chunksize / laenge)
    print("Genau: ", (i+1)*chunksize, " / ", laenge)

    encoded = tokenizer.batch_encode_plus(x, padding=True, add_special_tokens=True,truncation=True, return_tensors="pt")
    dummy_input = [encoded["input_ids"], encoded["attention_mask"]]

    output_example = loaded_model(*dummy_input)[0].argmax(1)

    ergebnisliste.append(output_example)

Prozent:  1.0
Genau:  1000  /  1000


In [39]:
import itertools
newListTMP = list(itertools.chain(*ergebnisliste))

In [40]:
newListTMP

[tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(0),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(0),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(0),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(0),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(1),
 ten

In [41]:
from textblob_de import TextBlobDE
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hahnb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:
sentimentliste = []
for tweet in texts:
    blob = TextBlobDE(tweet)
    for sentence in blob.sentences:
        sentimentliste.append(sentence.sentiment.polarity)


In [52]:
sentimentliste

[0.0,
 0.0,
 0.0,
 0.6666666666666666,
 0.0,
 0.5,
 -0.35,
 0.0,
 -1.0,
 0.0,
 0.0,
 -0.5,
 0.0,
 0.0,
 -0.15000000000000002,
 -0.5,
 0.0,
 0.6666666666666666,
 0.0,
 0.0,
 -0.7,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.9,
 0.0,
 0.0,
 -0.6666666666666666,
 0.0,
 0.0,
 -0.2333333333333333,
 0.0,
 0.0,
 0.7,
 0.0,
 0.15000000000000002,
 -0.7,
 0.0,
 0.0,
 0.0,
 0.35,
 0.0,
 0.0,
 -0.25,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.07999999999999999,
 -0.3333333333333333,
 0.0,
 0.5,
 0.7,
 0.0,
 -0.7,
 1.0,
 0.7,
 0.0,
 0.0,
 0.5666666666666667,
 -0.3333333333333333,
 0.0,
 0.0,
 0.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 -0.7,
 0.0,
 0.0,
 0.0,
 -0.7,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -1.0,
 1.0,
 -0.7,
 0.16666666666666666,
 -0.26666666666666666,
 0.0,
 0.0,
 1.0,
 0.0,
 0.35,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.7,
 0.0,
 0.0,
 0.0,
 0.0,
 0.675,
 0.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 -0.3333333333333333,
 0.0,
 1.0,
 -0.15000000000000002,
 0.0,
 0.16666666666666666,
 -1.0,
 -0.175,
 0.0,
 0.0,
 0.85,
 -

In [54]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
data = ["Das Theather war gestern nicht so gut.", "Die Fernsehsendung war toll."]
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


[{'label': 'NEGATIVE', 'score': 0.9904754757881165},
 {'label': 'NEGATIVE', 'score': 0.9879122376441956}]