In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from typing import List
import torch
import re

In [4]:
import json
testtweets = json.load(open("../../../../res/tweet_example.json", encoding="utf8"))

In [5]:
# model_name = "oliverguhr/german-sentiment-bert"
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name, torchscript=True).to("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading: 100%|██████████| 953/953 [00:00<00:00, 916kB/s]
Downloading: 100%|██████████| 638M/638M [00:54<00:00, 12.3MB/s] 
Downloading: 100%|██████████| 39.0/39.0 [00:00<00:00, 39.0kB/s]
Downloading: 100%|██████████| 851k/851k [00:00<00:00, 1.37MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 112kB/s]


In [6]:
text_example = ["Ich hasse mein Leben so sehr.", "Ich liebe mein Leben"]
encoded = tokenizer.batch_encode_plus(text_example, padding=True, add_special_tokens=True,truncation=True, return_tensors="pt")
model.eval()

dummy_input = [encoded["input_ids"], encoded["attention_mask"]]

traced_model = torch.jit.trace(model, [encoded["input_ids"], encoded["attention_mask"]])
torch.jit.save(traced_model, "traced_bert.pt")

In [9]:
loaded_model = torch.jit.load("traced_bert.pt")
loaded_model.eval()

RecursiveScriptModule(
  original_name=BertForSequenceClassification
  (bert): RecursiveScriptModule(
    original_name=BertModel
    (embeddings): RecursiveScriptModule(
      original_name=BertEmbeddings
      (word_embeddings): RecursiveScriptModule(original_name=Embedding)
      (position_embeddings): RecursiveScriptModule(original_name=Embedding)
      (token_type_embeddings): RecursiveScriptModule(original_name=Embedding)
      (LayerNorm): RecursiveScriptModule(original_name=LayerNorm)
      (dropout): RecursiveScriptModule(original_name=Dropout)
    )
    (encoder): RecursiveScriptModule(
      original_name=BertEncoder
      (layer): RecursiveScriptModule(
        original_name=ModuleList
        (0): RecursiveScriptModule(
          original_name=BertLayer
          (attention): RecursiveScriptModule(
            original_name=BertAttention
            (self): RecursiveScriptModule(
              original_name=BertSelfAttention
              (query): RecursiveScriptModule(ori

In [10]:
listofTweets = [tweet["Text"] for tweet in testtweets]

In [11]:
# Daten müssen gesplittet werden, um in den Speicher zu passen
def split_dataframe(df, chunk_size = 50): 
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

In [12]:
clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
clean_http_urls = re.compile(r'https*\\S+', re.MULTILINE)
clean_at_mentions = re.compile(r'@\\S+', re.MULTILINE)

In [13]:
def clean_text(text):
    text = text.replace("\n", " ")        
    text = clean_http_urls.sub('',text)
    text = clean_at_mentions.sub('',text)                     
    text = clean_chars.sub('', text) # use only text chars                          
    text = ' '.join(text.split()) # substitute multiple whitespace with single whitespace   
    text = text.strip().lower()
    return text

In [14]:
texts = [clean_text(text) for text in listofTweets]

In [15]:
from IPython.display import clear_output

# Durchlaufen des Modells
ergebnisliste = []
laenge = len(texts)
chunksize = 1
for i, x in enumerate(split_dataframe(texts, chunksize)):
    if(x == []):
        break
    clear_output(wait=True)
    print("Prozent: ", (i+1)*chunksize / laenge)
    print("Genau: ", (i+1)*chunksize, " / ", laenge)

    encoded = tokenizer.batch_encode_plus(x, padding=True, add_special_tokens=True,truncation=True, return_tensors="pt")
    dummy_input = [encoded["input_ids"], encoded["attention_mask"]]

    output_example = loaded_model(*dummy_input)[0].argmax(1)

    ergebnisliste.append(output_example)

Prozent:  1.0
Genau:  1000  /  1000


In [16]:
import itertools
newListTMP = list(itertools.chain(*ergebnisliste))

In [17]:
newListTMP

[tensor(2),
 tensor(4),
 tensor(4),
 tensor(4),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(4),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(4),
 tensor(4),
 tensor(4),
 tensor(4),
 tensor(4),
 tensor(4),
 tensor(2),
 tensor(0),
 tensor(0),
 tensor(3),
 tensor(2),
 tensor(2),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(4),
 tensor(2),
 tensor(4),
 tensor(0),
 tensor(4),
 tensor(2),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(2),
 tensor(2),
 tensor(4),
 tensor(0),
 tensor(0),
 tensor(2),
 tensor(1),
 tensor(3),
 tensor(4),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(4),
 tensor(4),
 tensor(0),
 tensor(4),
 tensor(1),
 tensor(0),
 tensor(2),
 tensor(0),
 tensor(1),
 tensor(4),
 tensor(0),
 tensor(3),
 tensor(0),
 tensor(0),
 tensor(4),
 tensor(4),
 tensor(0),
 tensor(0),
 tensor(4),
 tensor(0),
 tensor(0),
 tensor(2),
 tensor(3),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(4),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(2),
 ten

In [18]:
from textblob_de import TextBlobDE
# python3 -m textblob.download_corpora
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hahnb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
sentimentliste = []
sentiment_text = []
for tweet in texts:
    blob = TextBlobDE(tweet)
    for sentence in blob.sentences:
        sentimentliste.append(sentence.sentiment.polarity)
         # set sentiment
        if sentence.sentiment.polarity > 0:
            sentiment_text.append("positive")
        elif sentence.sentiment.polarity == 0:
            sentiment_text.append("neutral")
        else:
            sentiment_text.append("negative")


In [54]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
data = ["Das Theather war gestern nicht so gut.", "Die Fernsehsendung war toll."]
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


[{'label': 'NEGATIVE', 'score': 0.9904754757881165},
 {'label': 'NEGATIVE', 'score': 0.9879122376441956}]