In [25]:
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils import data
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import classification_report
import tqdm

In [26]:
#Uncomment for Aspect Category Experiments
# train = 'data/aspect_final_train.csv'
# test = 'data/aspect_final_test.csv'

#Uncomment for Polarity Experiment
train = 'data/polarity_final_train.csv'
test = 'data/polarity_final_test.csv'

with open(train) as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    train_data = [row for row in reader]
with open(test) as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    test_data = [row for row in reader]
    
print("TRAIN SAMPLES: {}".format(len(train_data)))
print("TEST SAMPLES: {}".format(len(test_data)))

TRAIN SAMPLES: 614
TEST SAMPLES: 154


In [27]:
def get_aspect(index_string, full_text):
    indices = index_string.replace('[','').replace(']','').split(',')
    aspect = ''
    for index in indices:
        aspect+= full_text.split()[int(index)] + ' '
    return aspect.strip()

def get_pos(sent_list,aspect_list):
    first_pos = sent_list.index(aspect_list[0])
    final_pos = []
    for i in range(0,len(aspect_list)):
        final_pos.append(first_pos+i)
    return final_pos    

class TalkLitDataset(data.Dataset):
    def __init__(self, tagged_sents):
        sents, aspects, tags = [], [], [] # list of lists
        for sent in tagged_sents:
            sent_tokens = tokenizer.encode(sent[0])
            try:
                aspect_tokens = tokenizer.encode(get_aspect(sent[1],sent[0]))[1:-1]
            except:
                print(sent[1])
                print(sent[0])
                continue
            pos_aspects = get_pos(sent_tokens, aspect_tokens)
            tag = sent[2]
            sents.append(sent_tokens)
            aspects.append(pos_aspects)
            tags.append(tag)
            
        self.sents, self.aspects, self.tags = sents, aspects, tags

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, aspects, tags = self.sents[idx], self.aspects[idx], tag2idx[self.tags[idx]] # words, tags: string list
        return words, aspects, tags

In [28]:
tags = []
for dat in train_data:
    tags.append(dat[2])
    
tags = list(set(tags))
",".join(tags)
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}
print(tag2idx)

{'[(106, 108)]': 0, 'Positive': 1, 'Negative': 2}


In [29]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [30]:
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")
model = BertModel.from_pretrained("bert-base-german-cased")

In [31]:
class Net(nn.Module):
    def __init__(self, vocab_size=None):
        super().__init__()
        self.model = BertModel.from_pretrained("bert-base-german-cased", output_hidden_states=True)
        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, sent, aspects, y):
        sent = torch.LongTensor(sent).to(device)
        y = torch.LongTensor(y).to(device)
        input_ids = sent.unsqueeze(0)  # Batch size 1
        with torch.no_grad():
            outputs = self.model(input_ids)
            last_hidden_states = outputs.last_hidden_state[0]  # The last hidden-state is the first element of the output tuple
            start = 0
            end = len(last_hidden_states)-1
            context_window = 5
            
            if aspects[0]-context_window>0:
                start = aspects[0]-context_window
            if aspects[-1]+context_window<len(last_hidden_states)-1:
                end = aspects[-1]+context_window
                
            all_aspects = []
            for i in range(start,end):
                all_aspects.append(i)
            bert_embeds = torch.zeros(len(all_aspects),768)
            for i, aspect in enumerate(all_aspects):
                bert_embeds[i] = last_hidden_states[aspect]
                
            bert_embeds = torch.mean(bert_embeds, axis=0)
        logits = self.fc(bert_embeds)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat, bert_embeds

In [35]:
train_dataset = TalkLitDataset(train_data)
train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=1,
                             shuffle=True,
                             num_workers=0)

test_dataset = TalkLitDataset(test_data)
test_iter = data.DataLoader(dataset=test_dataset,
                             batch_size=1,
                             shuffle=False,
                             num_workers=0)



In [36]:
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

X_train = []
y_train = []
X_test = []
y_test = []

for i, batch in tqdm.tqdm(enumerate(train_iter)):
        words, aspects, y = batch
        _y = y
        logits, y, _, bert_embeds = model(words, aspects, y)
        logits = logits.view(-1, logits.shape[-1])
        y = y.view(-1)
        bert_embeds = bert_embeds.cpu().numpy()
        y = int(y.cpu().numpy()[0])
        X_train.append(bert_embeds)
        y_train.append(y)
        
for i, batch in tqdm.tqdm(enumerate(test_iter)):
        words, aspects, y = batch
        _y = y
        logits, y, _, bert_embeds = model(words, aspects, y)
        logits = logits.view(-1, logits.shape[-1])
        y = y.view(-1)
        bert_embeds = bert_embeds.cpu().numpy()
        y = int(y.cpu().numpy()[0])
        X_test.append(bert_embeds)
        y_test.append(y)

614it [01:17,  7.97it/s]
154it [00:19,  8.06it/s]


In [46]:
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import random

names = ["Linear SVM", "Neural Net"]


classifiers = [
    SVC(kernel="linear", C=0.025),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier()
]

for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    print("CLASSIFIER: {}".format(name))
    print(classification_report(y_test, predicted))

CLASSIFIER: Linear SVM
              precision    recall  f1-score   support

           1       0.78      0.82      0.80        97
           2       0.67      0.61      0.64        57

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.74      0.75      0.74       154

CLASSIFIER: Neural Net
              precision    recall  f1-score   support

           1       0.76      0.80      0.78        97
           2       0.63      0.58      0.61        57

    accuracy                           0.72       154
   macro avg       0.70      0.69      0.69       154
weighted avg       0.72      0.72      0.72       154



In [47]:
test_set_outputs = []
for i, row in enumerate(test_data):
    row.extend([predicted[i]])
    test_set_outputs.append(row)
    
for i in range(0,5):
    example = random.choice(test_set_outputs)
    print("_______________________________________________________")
    print("TEXT: {}".format(example[0]))
    print("ASPECT: {}".format(get_aspect(example[1],example[0])))
    print("PREDICTION: {}".format(idx2tag[example[-1]]))
    print("ACTUAL: {}".format(example[2]))

_______________________________________________________
TEXT: Übrigens heißt meine Favoritin Othmann. Weil. #tddl
ASPECT: Othmann.
PREDICTION: Positive
ACTUAL: Positive
_______________________________________________________
TEXT: Ein ungewohnt gegenständlicher und „einfach“ zu erfassender Text von Tanja Maljartschuk, schöne Geschichte. Größter und längster Applaus am Schluss. #tddl #tddl18 https://t.co/vxz8AQQ2kN
ASPECT: Text von Tanja Maljartschuk,
PREDICTION: Positive
ACTUAL: Positive
_______________________________________________________
TEXT: #BachmannPreis 
Lesung von Corinna T.Sievers

Klaus Kastberger - Hätte ein Mann diesen Text geschrieben, wie wäre er angekommen? Und auch sonst gute Meinungen. Note: 3+

Insa Wilke - Die Frau, die es will. Literarische Radikalität fehlt trotzdem; gut zusammen gefasst. Note 2+
ASPECT: Klaus Kastberger
PREDICTION: Positive
ACTUAL: Positive
_______________________________________________________
TEXT: Schön gelesen, auch #tddl #bovbjerg
ASPECT: