In [None]:
import pandas as pd
import csv
import torch
from transformers import BertTokenizer
from tqdm.notebook import tqdm

# Both TSV files are parsed the same way: tab-separated, every column as str,
# only the empty string treated as missing, and quote characters taken literally.
tsv_options = dict(sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)

# listing titles; the record number is the only column converted to int
path = "Listing_Titles.tsv"
df = pd.read_csv(path, **tsv_options)
df["Record Number"] = df["Record Number"].astype(int)

# tagged training titles
train_path = "Train_Tagged_Titles.tsv"
train_df = pd.read_csv(train_path, **tsv_options)

# label <-> id lookup tables; ids start at 1 and follow the order returned by unique()
unique_tags = train_df['Tag'].unique()
labels2id = {tag: idx for idx, tag in enumerate(unique_tags, start=1)}
id2labels = {idx: tag for tag, idx in labels2id.items()}

# model (left as a placeholder here) and the German cased BERT tokenizer
model = ...
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

In [None]:
# function to retrieve results from each listing title
def retrieve_results(tokenizer, text):
    """Tag one listing title and return the predicted label for every token.

    The notebook-level ``model`` is used for the forward pass.

    Args:
        tokenizer: Tokenizer that is called as ``tokenizer(text, return_tensors='pt')``
            and that provides ``decode`` for turning a single id back into text.
        text: The listing title to tag.

    Returns:
        ``(predicted_labels, tokens)``: two lists with one entry per input id.
        Every id in ``input_ids`` is decoded individually, so the token list
        contains whatever the tokenizer emitted for the title (including any
        special or sub-word tokens).
    """
    encoding = tokenizer(text, return_tensors='pt')

    # Pure inference: disable autograd so no graph is recorded for each title.
    with torch.no_grad():
        outputs = model(**encoding)

    # Index the leading (batch) axis explicitly instead of squeeze(), which would
    # also collapse the sequence axis for a length-1 sequence.
    # assumes one title per call, i.e. a leading batch axis of size 1 -- TODO confirm
    predicted_ids = outputs.logits.argmax(-1)[0].tolist()
    token_ids = encoding.input_ids[0].tolist()

    # Prefer the attribute this notebook has always read; fall back to the
    # standard Hugging Face config name when it is absent.
    id_to_label = getattr(model.config, 'id2labels', None)
    if id_to_label is None:
        id_to_label = model.config.id2label

    predicted_labels = [id_to_label[label_id] for label_id in predicted_ids]
    tokens = [tokenizer.decode([token_id]) for token_id in token_ids]
    return predicted_labels, tokens

In [None]:
# submission dataframe: one row per (record number, predicted label, token)

# Zero-based row positions in df of the titles to tag (end is exclusive).
FIRST_ROW = 5000
LAST_ROW = 30000

record_numbers = []
aspect_names = []
aspect_values = []

for i in range(FIRST_ROW, LAST_ROW):
    # the title is read positionally from the second column of df
    title = df.iat[i, 1]
    labels, values = retrieve_results(tokenizer, title)
    # Use the parsed "Record Number" column rather than assuming that the
    # record number always equals the row position + 1.
    number = int(df["Record Number"].iat[i])
    record_numbers.extend([number] * len(labels))
    aspect_names.extend(labels)
    aspect_values.extend(values)

# save results to pandas df (built once, in submission column order)
df_submission = pd.DataFrame({
    'Record Number': record_numbers,
    'Aspect Name': aspect_names,
    'Aspect Value': aspect_values,
})

In [None]:
# combine duplicate adjacent labels
# Consecutive rows that share both the record number and the aspect name are
# collapsed into a single row whose value is the space-joined tokens of the run.
# Runs are accumulated directly, so the final run is kept in full and no
# sentinel value is needed to mark rows for removal (a real token such as
# "drop" can therefore never be discarded by mistake).
merged_numbers = []
merged_names = []
merged_tokens = []  # one list of token strings per output row
prev_key = None
for num, name, val in zip(
    df_submission['Record Number'],
    df_submission['Aspect Name'],
    df_submission['Aspect Value'],
):
    key = (int(num), str(name))
    if key == prev_key:
        # same record and same label as the current run: extend it
        merged_tokens[-1].append(str(val))
    else:
        # a new run starts here
        merged_numbers.append(key[0])
        merged_names.append(key[1])
        merged_tokens.append([str(val)])
        prev_key = key

df_submission = pd.DataFrame({
    'Record Number': merged_numbers,
    'Aspect Name': merged_names,
    'Aspect Value': [" ".join(parts) for parts in merged_tokens],
})
# df_submission

In [None]:
# keep only the rows whose aspect name is neither 'Obscure' nor 'No Tag'
is_excluded = df_submission['Aspect Name'].isin(['Obscure', 'No Tag'])
df_submission = df_submission[~is_excluded]

In [None]:
# write the submission as a tab-separated UTF-8 file without header row or index column
df_submission.to_csv(
    "results.tsv",
    index=False,
    header=False,
    encoding='utf-8',
    sep="\t",
)