In [2]:
#import matplotlib
#import copy
import logging
#import torch

#from tqdm.autonotebook import tqdm

from data.data_loader import Dataset
from data.germeval2017 import germeval2017_dataset

from misc.preferences import PREFERENCES
#from misc.visualizer import *
from misc.run_configuration import get_default_params, randomize_params
from misc import utils

#from optimizer import get_default_optimizer
#from criterion import NllLoss, LossCombiner

#from models.transformer.encoder import TransformerEncoder
#from models.jointAspectTagger import JointAspectTagger
#from models.transformer.train import Trainer
import pprint

import torch
import torchtext
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

In [3]:
# Declare the notebook's default preferences: where the GermEval 2017 files
# live and which early-stopping setting to use. The data_* values are read
# back from PREFERENCES by `load` when it constructs the Dataset.
# NOTE(review): data_root is relative, so it resolves against the kernel's
# current working directory — confirm the notebook is started from the repo root.
PREFERENCES.defaults(
    data_root='./data/germeval2017',
    data_train='train_v1.4.tsv',    
    data_validation='dev_v1.4.tsv',
    data_test='test_TIMESTAMP1.tsv',
    early_stopping='highest_5_F1'
)
def load(hp, logger):
    """Construct the 'germeval' Dataset from the PREFERENCES paths and load it.

    Args:
        hp: Hyperparameter object handed straight to ``Dataset``.
        logger: Logger handed straight to ``Dataset``.

    Returns:
        The ``Dataset`` instance after ``load_data`` has been called on it
        with ``germeval2017_dataset``.
    """
    # Keyword configuration for the Dataset constructor. File locations are
    # taken from PREFERENCES; the index values and the disabled init/eos
    # tokens are passed through unchanged.
    # NOTE(review): the meaning of source_index / target_vocab_index is
    # defined by Dataset and is not visible from this cell.
    dataset_options = {
        'source_index': 0,
        'target_vocab_index': 2,
        'data_path': PREFERENCES.data_root,
        'train_file': PREFERENCES.data_train,
        'valid_file': PREFERENCES.data_validation,
        'test_file': PREFERENCES.data_test,
        'file_format': '.tsv',
        'init_token': None,
        'eos_token': None,
    }

    germeval = Dataset('germeval', logger, hp, **dataset_options)
    germeval.load_data(germeval2017_dataset, verbose=False)
    return germeval

# Set up logging for this run under the requested name 'testing'.
# NOTE(review): the return value is kept as experiment_name — presumably the
# resolved experiment name/path; confirm against misc.utils.create_loggers.
experiment_name = utils.create_loggers(experiment_name='testing')
logger = logging.getLogger(__name__)

# NOTE(review): the meaning of the positional False is not visible here — check get_default_params.
default_hp = get_default_params(False)

# Record the hyperparameters in the log and also show them in the cell output.
logger.info(default_hp)
print(default_hp)

Log path is  C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\logs\testing\12
+----------------------------------------------------------+
|                     Hyperparameters                      |
+-------------------------+--------------------------------+
|        Parameter        |             Value              |
+-------------------------+--------------------------------+
|        batch_size       |               12               |
|        model_size       |              300               |
|    learning_rate_type   | LearningSchedulerType.Adadelta |
|      learning_rate      |               1                |
|   learning_rate_warmup  |              4800              |
|   learning_rate_factor  |               2                |
|     optim_adam_beta1    |              0.9               |
|     optim_adam_beta2    |              0.98              |
|      early_stopping     |               5                |
|         use_cuda        |          

In [4]:
# Build the Dataset with the default hyperparameters and load its data (see `load`).
dataset = load(default_hp, logger)


                                           

In [7]:
def produce_test_gold_labels(iterator: torchtext.data.Iterator, dataset: Dataset):
    """Print the labels carried by the batches of ``iterator`` as an XML document.

    Every example becomes a ``<Document>`` element (attribute ``id``) under a
    ``<Documents>`` root, with ``<relevance>``, ``<sentiment>``, ``<text>`` and
    ``<Opinions>`` children. One ``<Opinion>`` element is emitted for each
    aspect whose label is not ``'n/a'``.

    Args:
        iterator: Batch iterator. Each batch must expose ``id``, ``comments``,
            ``relevance``, ``aspect_sentiments`` and ``general_sentiments``.
        dataset: Supplies ``fields`` (whose ``reverse`` methods turn the batch
            values back into strings) and ``target_names`` (the aspect names,
            paired positionally with each example's aspect labels).

    Returns:
        None. The prettified XML is written to stdout.
    """
    fields = dataset.fields
    with torch.no_grad():
        iterator.init_epoch()

        root = ET.Element('Documents')

        for batch in iterator:
            # Convert each batch column back to plain strings. The id and the
            # general sentiment get an extra dimension via unsqueeze(1) before
            # being reversed.
            doc_ids = fields['id'].reverse(batch.id.unsqueeze(1))
            comments = fields['comments'].reverse(batch.comments)
            relevances = ['false' if r == 0 else 'true' for r in batch.relevance]
            general_sentiments = fields['general_sentiments'].reverse(
                batch.general_sentiments.unsqueeze(1))
            # detokenize=False keeps one label per aspect instead of a joined string
            # (presumably — confirm against the field's reverse implementation).
            aspect_sentiments = fields['aspect_sentiments'].reverse(
                batch.aspect_sentiments, detokenize=False)

            # The per-example lists are parallel, so walk them together.
            examples = zip(doc_ids, relevances, general_sentiments, comments, aspect_sentiments)
            for doc_id, relevance, sentiment, comment, aspect_labels in examples:
                document_elem = ET.SubElement(root, 'Document', {'id': doc_id})

                ET.SubElement(document_elem, 'relevance').text = relevance
                ET.SubElement(document_elem, 'sentiment').text = sentiment
                ET.SubElement(document_elem, 'text').text = comment

                opinions_elem = ET.SubElement(document_elem, 'Opinions')

                # Add one Opinion per aspect that actually carries a label.
                for aspect_label, aspect_name in zip(aspect_labels, dataset.target_names):
                    if aspect_label == 'n/a':
                        continue

                    # NOTE(review): the sentiment label is written to the
                    # 'target' attribute. The GermEval 2017 XML format appears
                    # to use 'polarity' for the label and 'target' for the
                    # opinion target phrase — confirm before feeding this
                    # output to an external evaluator.
                    ET.SubElement(opinions_elem, 'Opinion', {
                        'category': aspect_name,
                        'target': aspect_label
                    })

        print(BeautifulSoup(ET.tostring(root), "xml").prettify())

In [8]:
# Print the XML built from the dataset's test iterator.
produce_test_gold_labels(dataset.test_iter, dataset)

<?xml version="1.0" encoding="utf-8"?>
<Documents>
 <Document id="https://plus.google.com/108587609118807152537/posts/JLayH39aY5J">
  <relevance>
   true
  </relevance>
  <sentiment>
   negative
  </sentiment>
  <text>
   gute nachricht für alle pendler gute nachricht für alle pendler﻿ article seit jahren versucht die deutsche bahn ihre ständigen verspätungen in den griff zu bekommen nun scheint die lösung ein für alle mal gefunden zu sein ab dem 1 august sollen keine züge mehr rollen bahnchef grube verspricht sich damit einen verspätungsrückgang von 100 prozent httpswwweinezeitungnet20160510niewiederverspaetungendeutschebahnstelltbetrieb
  </text>
  <Opinions>
   <Opinion category="Zugfahrt" target="negative"/>
  </Opinions>
 </Document>
 <Document id="http://twitter.com/JottKaesebrecht/statuses/686983328461910017">
  <relevance>
   true
  </relevance>
  <sentiment>
   negative
  </sentiment>
  <text>
   rt strandamhund ist euch mal aufgefallen dass selbst die verspätungsalarmmails de

</Documents>
