Source:
https://git.gesis.org/papenmaa/chiir21_naturallanguagequeries/-/tree/master/VACOS_NLQ_v2?ref_type=heads

In [7]:
import json
import pandas as pd
import numpy as np

In [36]:
# Load the raw annotated dataset from JSON into a DataFrame
df = pd.read_json('Input_Data/Product_NER_Dataset.json')
# Print column names, non-null counts and dtypes
df.info()
# Last expression of the cell: renders the first five rows
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3481 entries, 0 to 3480
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           3481 non-null   int64 
 1   domain       3481 non-null   object
 2   text         3481 non-null   object
 3   user         3481 non-null   object
 4   key facts    3481 non-null   object
 5   vague words  3481 non-null   object
 6   text token   3481 non-null   object
 7   negations    3481 non-null   object
dtypes: int64(1), object(7)
memory usage: 217.7+ KB


Unnamed: 0,ID,domain,text,user,key facts,vague words,text token,negations
0,718,laptop,"I want a smaller laptop, nothing too unwieldy....","{'age': 34, 'gender': 'male', 'domain knowledg...","{'IAA key facts': 'n/a', 'annotations': [{'wor...","{'IAA vague words': 'n/a', 'annotations': [{'w...","[[0, I], [2, want], [7, a], [9, smaller], [17,...","[{'word': 'nothing', 'start index': 25, 'keyfa..."
1,719,laptop,"I would like it to look sleek, and probably co...","{'age': 40, 'gender': 'female', 'domain knowle...","{'IAA key facts': 'n/a', 'annotations': [{'wor...","{'IAA vague words': 'n/a', 'annotations': [{'w...","[[0, I], [2, would], [8, like], [13, it], [16,...",[]
2,720,laptop,I think l would look at the same make of .lapt...,"{'age': 64, 'gender': 'female', 'domain knowle...","{'IAA key facts': 'n/a', 'annotations': [{'wor...","{'IAA vague words': 'n/a', 'annotations': [{'w...","[[0, I], [2, think], [8, l], [10, would], [16,...","[{'word': 'not', 'start index': 372, 'keyfact ..."
3,721,laptop,i would buy a resonable priced laptop with all...,"{'age': 43, 'gender': 'female', 'domain knowle...","{'IAA key facts': 'n/a', 'annotations': [{'wor...","{'IAA vague words': 'n/a', 'annotations': [{'w...","[[0, i], [2, would], [8, buy], [12, a], [14, r...",[]
4,723,laptop,It would be a faster version with more memory ...,"{'age': 41, 'gender': 'female', 'domain knowle...","{'IAA key facts': 'n/a', 'annotations': [{'wor...","{'IAA vague words': 'n/a', 'annotations': [{'w...","[[0, It], [3, would], [9, be], [12, a], [14, f...",[]


In [None]:
# Drop all rows without 'annotations' in their 'key facts' column
# First step: discard the rows whose 'key facts' value is missing altogether
key_facts_present = df['key facts'].notna()
df = df[key_facts_present]

def has_annotations(x):
    """Return True when the mapping ``x`` has an 'annotations' key, else False."""
    return 'annotations' in x.keys()
# Keep only the rows whose 'key facts' dict carries an 'annotations' entry
df = df[df['key facts'].apply(has_annotations)]

In [None]:
# We can create a new column for the annotations of each row
def get_annotations(x):
    """Return the value stored under the 'annotations' key of ``x``.

    Raises KeyError when ``x`` has no such key.
    """
    annotations = x['annotations']
    return annotations
# Pull each row's annotation list out of its 'key facts' dict into its own column
df['annotations'] = df['key facts'].apply(get_annotations)

In [75]:
# Each item in the annotations list is a dictionary with metadata about the named entities
# For the sake of our training, we only want entities with the attribute category of 'meta', 'meta_brand', or 'meta_model'
def has_meta(x):
    """Return True when the annotation dict ``x`` belongs to a meta category.

    The 'attribute category' value must be one of 'meta', 'meta_brand'
    or 'meta_model'; any other value yields False.
    """
    wanted_categories = ('meta', 'meta_brand', 'meta_model')
    return x['attribute category'] in wanted_categories

# Since each annotation is a list of dicts for each entity, we need to filter those lists to remove words that don't have the meta categories
def filter_annotations(x):
    """Return a new list holding only the items of ``x`` that pass ``has_meta``."""
    return list(filter(has_meta, x))
# Reduce every row's annotation list to its meta-category entities
df['annotations'] = df['annotations'].apply(filter_annotations)

In [None]:
# Each annotation dictionary also has information we won't need
# We can extract the word, start index, and BIO scheme

1     [{'word': 'mac', 'start index': 233, 'attribut...
6     [{'word': 'mac', 'start index': 26, 'attribute...
7     [{'word': 'reliable', 'start index': 2, 'attri...
9     [{'word': 'HP', 'start index': 94, 'attribute ...
10    [{'word': 'Apple', 'start index': 16, 'attribu...
Name: annotations, dtype: object


In [76]:
# Training only needs the raw text and its annotations, so keep just those two columns
training_columns = ['text', 'annotations']
df = df[training_columns]

In [77]:
# Inspect the first remaining row: its filtered annotations, then its raw text.
# (A bare `df.head()` used to open this cell, but only a cell's last expression
# is displayed, so its result was silently discarded; it has been removed.)
attr = df.iloc[0]['annotations']
print(attr)
print(df.iloc[0]['text'])

[{'word': 'mac', 'start index': 233, 'attribute category': 'meta_model', 'keyfact index': 4, 'attribute or value': 'value', 'BIO scheme': 'B', 'vogue': False}, {'word': 'macbook', 'start index': 267, 'attribute category': 'meta_model', 'keyfact index': 5, 'attribute or value': 'value', 'BIO scheme': 'B', 'vogue': False}]
I would like it to look sleek, and probably cost more than the one that just broke.  I would like it to have a good screen and screen.  I would like a good brand.  I used microsoft computers all my life and just recently switched to mac.  I think I would buy another macbook.


In [95]:
# Within the annotations, we only need the word, start index and BIO scheme
def filter_annotations(annotations):
    """Strip each annotation dict down to 'word', 'start index' and 'BIO scheme'.

    Returns a new list of new dicts; the input dicts are not modified.
    Raises KeyError if an annotation lacks one of the three keys.
    Note: this reuses the name of the meta-category filter defined in an
    earlier cell and replaces it from here on.
    """
    kept_keys = ('word', 'start index', 'BIO scheme')
    slimmed = []
    for ann in annotations:
        slimmed.append({key: ann[key] for key in kept_keys})
    return slimmed

# Apply filter_annotations to every row's annotation list
df['annotations'] = df['annotations'].apply(filter_annotations)

# Write the cleaned frame to CSV without the index column.
# NOTE(review): the 'annotations' cells are lists of dicts, which CSV stores as
# their string representation; they must be parsed again if this file is reloaded.
df.to_csv('Cleaned_Data/Product_NER_Dataset.csv', index=False)

In [None]:
import torch
from transformers import BertTokenizerFast

# Load the pretrained 'bert-base-cased' fast tokenizer; the function below asks it
# for offset mappings (return_offsets_mapping=True)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def _annotation_span(text, word, start_index):
    """Return the (start, end) character span of ``word`` in ``text``, or None.

    The annotated ``start_index`` is trusted when the text found there equals
    the word (case-insensitively); otherwise the first occurrence of the word
    is used, and None is returned when the word cannot be found at all.
    """
    if not word:
        return None
    end_index = start_index + len(word)
    if text[start_index:end_index].lower() == word.lower():
        return start_index, end_index
    found = text.lower().find(word.lower())
    if found == -1:
        return None
    return found, found + len(word)


def _label_span(bio_labels, offsets, span, bio_scheme):
    """Write BIO tags onto every token whose character offsets overlap ``span``.

    The first overlapping token receives ``bio_scheme``; the remaining
    word pieces of the same word receive 'I'.
    """
    span_start, span_end = span
    first_piece = True
    for i, (tok_start, tok_end) in enumerate(offsets):
        # Special and padding tokens carry an empty (0, 0) offset: never label them
        if tok_start == tok_end:
            continue
        if tok_start < span_end and tok_end > span_start:
            bio_labels[i] = bio_scheme if first_piece else 'I'
            first_piece = False


def tokenize_and_align_labels(df, max_length=128, encode_fn=None):
    """Tokenize each row's text and build token-level BIO label ids.

    Labels are aligned through character offsets: an annotation covers the
    characters ``[start index, start index + len(word))`` and every token
    overlapping that span is tagged. This labels the annotated occurrence of a
    word (not merely the first token with the same text) and handles words
    that the tokenizer splits into several pieces.

    Parameters
    ----------
    df : DataFrame with a 'text' column (str) and an 'annotations' column
        (list of dicts with 'word', 'start index' and 'BIO scheme').
    max_length : int, default 128
        Length every sequence is truncated/padded to.
    encode_fn : callable, optional
        Called as ``encode_fn(text, truncation=True, padding='max_length',
        max_length=..., return_offsets_mapping=True)`` and must return a
        mapping with 'input_ids', 'attention_mask' and 'offset_mapping'.
        Defaults to the module-level ``tokenizer``.

    Returns
    -------
    (input_ids, attention_masks, labels) : three torch tensors with one row
        per input row. Label ids are 0 = 'O', 1 = 'B', 2 = 'I'.
        Padding and special tokens are labelled 'O' (0).
    """
    encode = tokenizer if encode_fn is None else encode_fn
    bio_label_to_id = {'O': 0, 'B': 1, 'I': 2}

    input_ids = []
    attention_masks = []
    labels = []

    for text, annotations in zip(df['text'], df['annotations']):
        encoding = encode(text, truncation=True, padding='max_length',
                          max_length=max_length, return_offsets_mapping=True)
        offsets = encoding['offset_mapping']

        # Every token starts as 'O' (outside any entity)
        bio_labels = ['O'] * len(encoding['input_ids'])

        for annotation in annotations:
            bio_scheme = annotation['BIO scheme']
            # Only 'B' and 'I' annotations produce labels
            if bio_scheme not in ('B', 'I'):
                continue
            span = _annotation_span(text, annotation['word'], annotation['start index'])
            if span is None:
                continue
            _label_span(bio_labels, offsets, span, bio_scheme)

        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
        labels.append([bio_label_to_id[label] for label in bio_labels])

    # Convert lists to tensors for PyTorch
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

# Tokenize the whole frame and build the aligned label ids
input_ids, attention_masks, labels = tokenize_and_align_labels(df)

# Check the shapes of the tensors (all three should agree)
print(input_ids.shape, attention_masks.shape, labels.shape)

torch.Size([749, 128]) torch.Size([749, 128]) torch.Size([749, 128])


In [97]:
# Save the three tensors as separate .pt files under Cleaned_Data/NER/
# NOTE(review): presumably this directory must already exist before running - confirm
torch.save(input_ids, 'Cleaned_Data/NER/input_ids.pt')
torch.save(attention_masks, 'Cleaned_Data/NER/attention_masks.pt')
torch.save(labels, 'Cleaned_Data/NER/labels.pt')