# ABOUT:
- The Multi-Aspect Multi-Sentiment (MAMS) dataset is a food-review ABSA dataset in which each sample contains multiple aspects with multiple sentiments
- MAMS comes in 2 variants: ATSA and ACSA. This research focuses on ATSA
- This notebook:
    1. parses the MAMS XML file into a dataframe
    2. converts it into a training dataset
    3. saves the result as a pickle

In [10]:
from bs4 import BeautifulSoup
import tqdm
import pandas as pd
from bs4.element import Tag
import jsonlines
import os

#### import data

In [22]:
# Location of the MAMS ATSA training split.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\NLPCC 2020 Shared Task 2 Guideline  Multi-Aspect-based Multi Sentiment Analysis (MAMS)\Dataset_MAMS\ATSA\train.xml"
# Hand the parser raw bytes so it decodes them with the encoding the XML document
# itself declares. Text mode without an explicit encoding uses the platform default,
# and a wrong decode of non-ASCII characters would shift the character offsets
# stored in the aspect terms' from/to attributes.
with open(path, 'rb') as f:
    data = BeautifulSoup(f, "xml")

### 1. Parse ATSA XML dataset

In [23]:
def parse_ATSA_sentence(sentence, remove_newlines = True):
    """Extract the text and the aspect-term annotations from one ``<sentence>`` tag.

    Parameters
    ----------
    sentence : bs4.element.Tag
        A parsed ``<sentence>`` element.
    remove_newlines : bool, default True
        When true, every newline character is removed from the extracted text.

    Returns
    -------
    dict
        ``{"text": str, "aspectTerms": tuple}``. Each aspect term is a
        ``(from, polarity, term, to)`` tuple holding the attribute values as read
        from the tag. ``aspectTerms`` is an empty tuple when the sentence has no
        ``<aspectTerms>`` child.
    """
    text = sentence.text
    if remove_newlines:
        text = text.replace('\n', "")

    container = sentence.aspectTerms
    if container is None:
        # Attribute-style lookup yields None for a missing child; iterating None
        # would raise TypeError, so report "no aspect terms" instead.
        aspectTerms = ()
    else:
        # Skip children that are not elements (e.g. text nodes between the tags).
        aspectTerms = tuple(
            (a['from'], a['polarity'], a['term'], a['to'])
            for a in container
            if isinstance(a, Tag)
        )
    return {"text": text, "aspectTerms": aspectTerms}

In [24]:
# from datasets import ClassLabel, Sequence
import pandas as pd
from IPython.display import display, HTML
def show_elements(dataset, randomize = True, num_samples = 10):
    """Render up to ``num_samples`` rows of ``dataset`` as an HTML table.

    Parameters
    ----------
    dataset : pandas.DataFrame or dataset-like
        Either a DataFrame, or an object offering ``shuffle()``,
        ``select(indices)`` and ``len()`` (presumably a Hugging Face
        ``datasets.Dataset`` - confirm against callers).
    randomize : bool, default True
        Shuffle the rows before taking the preview.
    num_samples : int, default 10
        Maximum number of rows to show.
    """
    if isinstance(dataset, pd.DataFrame):
        if randomize:
            dataset = dataset.sample(frac=1)
        # Slicing truncates silently when fewer than num_samples rows exist.
        preview = dataset.iloc[:num_samples]
    else:
        if randomize:
            dataset = dataset.shuffle()
        # Clamp the request so a short dataset is shown in full instead of asking
        # select() for indices past the end; mirrors the DataFrame branch.
        num_rows = min(num_samples, len(dataset))
        preview = pd.DataFrame(dataset.select(range(num_rows)))

    display(HTML(preview.to_html()))

In [25]:
# Parse every <sentence> element into a record and build the DataFrame from the
# full list of records in one step; tqdm reports progress over the sentences.
sentence_tags = data.find_all("sentence")
dataset = pd.DataFrame(
    [parse_ATSA_sentence(tag) for tag in tqdm.tqdm(sentence_tags)]
)

100%|███████████████████████████████████████████████████████████████████████████| 4297/4297 [00:00<00:00, 36898.80it/s]


In [26]:
# Preview the first rows of the parsed frame in their original order (no shuffling).
show_elements(dataset,randomize = False)

Unnamed: 0,text,aspectTerms
0,The decor is not special at all but their food and amazing prices make up for it.,"((4, negative, decor, 9), (42, positive, food, 46), (59, positive, prices, 65))"
1,"when tables opened up, the manager sat another party before us.","((5, neutral, tables, 11), (27, negative, manager, 34))"
2,"Though the menu includes some unorthodox offerings (a peanut butter roll, for instance), the classics are pure and great--we've never had better sushi anywhere, including Japan.","((11, neutral, menu, 15), (54, negative, peanut butter roll, 72), (93, positive, classics, 101), (145, positive, sushi, 150))"
3,"service is good although a bit in your face, we were asked every five mins if food was ok, but better that than being ignored.","((0, positive, service, 7), (78, neutral, food, 82))"
4,PS- I just went for brunch on Saturday and the eggs served with onions and rosemary were amazing.,"((20, neutral, brunch, 26), (47, positive, eggs served with onions, 70))"
5,they didn't have to change anything about the menu except add a leg of chicken seperatley and the guy mumbled very rudely that I had already ordered and I should've decided earlier.,"((46, neutral, menu, 50), (98, negative, guy, 101))"
6,"The server came to us and was sooo hot, he went over the menu and specials with us.","((4, positive, server, 10), (57, neutral, menu, 61))"
7,The Food The best surprises on the El Salvadorean menu are the appetizers.,"((4, neutral, Food, 8), (63, positive, appetizers, 73))"
8,"12/24/03 Dinner was ok, service was so- so,the worst part was the hostess - we made reservations a month before Christmas Eve for three people, you would think the table would be large enough for all three of us.","((10, positive, Dinner, 16), (25, neutral, service, 32), (67, negative, hostess, 74))"
9,"Still, after all the fuss, the food makes you forget about the wait.","((31, positive, food, 35), (63, neutral, wait, 67))"


In [27]:
# Number of parsed rows - one per <sentence> element found in the XML.
len(dataset)

4297

In [31]:
# Average number of distinct sentiment polarities per sentence in the parsed MAMS ATSA frame.
def get_num_unique_sentiments(label):
    """Count the distinct polarities among one sentence's aspect-term tuples.

    The polarity is read from position 1 of every tuple in ``label``.
    """
    polarities = {aspect[1] for aspect in label}
    return len(polarities)
# Count the distinct polarities of every row, then average over all rows.
num_unique_sentiments = dataset["aspectTerms"].apply(get_num_unique_sentiments)
sum(num_unique_sentiments) / len(num_unique_sentiments)

2.028857342331859

### 2. convert into training dataset

In [8]:
def process_mams_dataset(dataset):
    """Flatten the parsed frame into one training sample per aspect term.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``text`` column and an ``aspectTerms`` column; each
        ``aspectTerms`` value is an iterable of ``(from, polarity, term, to)`` tuples.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``span`` (``(int(from), int(to))``) and ``label``
        (0 = negative, 1 = neutral, 2 = positive). The sentence text is repeated
        for each of its aspect terms. The three columns are present even when the
        input yields no samples.

    Raises
    ------
    KeyError
        If a polarity is not one of ``negative``, ``neutral`` or ``positive``.
    """
    label_mapping = {"negative":0,"neutral":1,"positive":2}

    def _generate_sample(text, aspectTerms):
        """Build one ``{"text", "span", "label"}`` record per aspect term of a sentence."""
        return [
            {"text": text,
             "span": (int(AT[0]), int(AT[-1])),
             "label": label_mapping[AT[1]]}
            for AT in aspectTerms
        ]

    samples = []
    # Walk the two columns in row order instead of looking rows up with .loc[i]:
    # a positional counter is only a valid label on a default 0..n-1 index, so a
    # filtered, shuffled or re-indexed frame used to fail with KeyError.
    for text, aspectTerms in zip(dataset["text"], dataset["aspectTerms"]):
        samples.extend(_generate_sample(text, aspectTerms))
    return pd.DataFrame(samples, columns=["text", "span", "label"])


In [9]:
# Flatten to one row per (sentence, aspect term).
# NOTE(review): this rebinds `dataset`, so the cell is not idempotent - running it a
# second time passes the already-flattened frame (which has no "aspectTerms" column)
# back into process_mams_dataset.
dataset = process_mams_dataset(dataset)

### 3. save as pickle

In [9]:
# Build the output location from separate components so the path separator comes
# from the OS instead of being hard-coded as a backslash (on Windows this resolves
# to exactly the same location as before).
path = os.path.join(os.getcwd(), "data", "MAMS", "train", "mams_atsa_train.pkl")
# to_pickle does not create missing parent folders, so make sure they exist first.
os.makedirs(os.path.dirname(path), exist_ok=True)
dataset.to_pickle(path)

### Input for SATSA 
- text: a sentence, e.g. "The decor is not special at all but their.."
- span: the character span of the aspect within the text, e.g. (4, 9)
- label: the sentiment of the aspect, e.g. 0

note: label_mappings = {"negative":0,"neutral":1,"positive":2}

In [10]:
# Preview the first rows of the flattened training frame (text, span, label).
dataset.head()

Unnamed: 0,text,span,label
0,The decor is not special at all but their food...,"(4, 9)",0
1,The decor is not special at all but their food...,"(42, 46)",2
2,The decor is not special at all but their food...,"(59, 65)",2
3,"when tables opened up, the manager sat another...","(5, 11)",1
4,"when tables opened up, the manager sat another...","(27, 34)",0


In [13]:
# Show the first sample as a plain dict to illustrate the record schema.
dataset.to_dict("records")[:1]

[{'text': 'The decor is not special at all but their food and amazing prices make up for it.',
  'span': (4, 9),
  'label': 0}]

In [10]:
# Number of training samples - one per (sentence, aspect term) pair.
len(dataset)

11186

In [1]:
import pandas as pd
import os
from collections import Counter


def summarize_absa_dataset(title, path):
    """Load a pickled training frame and print its summary statistics.

    Parameters
    ----------
    title : str
        Heading printed above the statistics.
    path : str
        Location of a pickled DataFrame with ``text`` and ``span`` columns, where
        ``span`` holds the start/end offsets of the aspect inside ``text``.

    Returns
    -------
    tuple
        ``(df, aspect_set, counts)``: the loaded frame, the set of distinct aspect
        strings, and the number of rows sharing each distinct text.
    """
    # NOTE(review): unpickling executes code stored in the file - only load pickles
    # produced by this project.
    df = pd.read_pickle(path)
    # Slice every aspect out of its sentence. Zipping the columns walks the rows in
    # order and does not depend on the frame having a default 0..n-1 index.
    aspect_set = {text[span[0]:span[1]] for text, span in zip(df["text"], df["span"])}
    # Rows per distinct text, i.e. annotated aspects per sentence.
    counts = Counter(df.text).values()
    print(title)
    print("Total number of unique aspects:", len(aspect_set))
    print("Size of training set:", len(counts))
    print("Average number of aspects per sample:", sum(counts)/len(counts),end = "\n\n")
    return df, aspect_set, counts


# NOTE(review): the last two entries are absolute, machine-specific paths.
dataset_paths = [
    ("MAMS Dataset",
     os.path.join(os.getcwd(), "data", "MAMS", "train", "mams_atsa_train.pkl")),
    ("Fi_ATSA Dataset (my own)",
     r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\URECA--Financial-Aspect-Based-Sentiment-Analysis\data\Fi_ATSA\train\Fi_ATSA_train.pkl"),
    ("FiQA Dataset",
     r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\URECA--Financial-Aspect-Based-Sentiment-Analysis\data\FiQA\train\FiQA_train.pkl"),
]
# Rebinding path/df/aspect_set/counts on each pass leaves them holding the values of
# the last dataset once the loop finishes.
for title, path in dataset_paths:
    df, aspect_set, counts = summarize_absa_dataset(title, path)

MAMS Dataset
Total number of unique aspects: 2586
Size of training set: 4297
Average number of aspects per sample: 2.6032115429369327

Fi_ATSA Dataset (my own)
Total number of unique aspects: 383
Size of training set: 413
Average number of aspects per sample: 3.49636803874092

FiQA Dataset
Total number of unique aspects: 526
Size of training set: 1078
Average number of aspects per sample: 1.0556586270871986



In [32]:
import pandas as pd
import os
from collections import Counter

# NOTE(review): this cell recomputes the FiQA statistics; the cells below it read the
# `df` and `counts` variables it defines. The path is absolute and machine-specific.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\URECA--Financial-Aspect-Based-Sentiment-Analysis\data\FiQA\train\FiQA_train.pkl"
df = pd.read_pickle(path)
# Slice every aspect out of its sentence. Zipping the columns walks the rows in
# order, so this no longer relies on the frame having a default 0..n-1 index the
# way `df.loc[i, ...]` with a positional counter did.
aspect_set = {text[span[0]:span[1]] for text, span in zip(df["text"], df["span"])}
# Rows per distinct text, i.e. annotated aspects per sentence.
counts = Counter(df.text).values()
print("FiQA Dataset")
print("Total number of unique aspects:", len(aspect_set))
print("Size of training set:", len(counts))
print("Average number of aspects per sample:", sum(counts)/len(counts),end = "\n\n")

FiQA Dataset
Total number of unique aspects: 526
Size of training set: 1078
Average number of aspects per sample: 1.0556586270871986



In [49]:
# NOTE(review): this cell was saved unfinished - the loop had no body, which is a
# SyntaxError on a fresh run. The body below is a guess at the intent (tallying how
# many rows share each text); confirm it or delete the cell.
c = {}
for text in df.text:
    c[text] = c.get(text, 0) + 1

Unnamed: 0,text,span,label
0,Royal Mail chairman Donald Brydon set to step ...,"(0, 10)",0
1,Stakes High for AstraZeneca Heart Drug Facing ...,"(16, 27)",0
2,UPDATE 1Dairy Crest loses a third of Morrisons...,"(37, 46)",1
3,Insight hires Avivas David Hillier for multias...,"(0, 7)",1
4,Primark racks up a happy Christmas after stron...,"(0, 7)",2
...,...,...,...
1133,Facebook FB received a Buy rating from Wells F...,"(9, 11)",2
1134,TSLA Wish had my puts back but see if we can f...,"(0, 4)",0
1135,Citrix Systems Inc CTXS Position Increased by ...,"(19, 23)",2
1136,Notable gainers among liquid option names this...,"(65, 66)",2


In [48]:
# NOTE(review): without an aggregation this only displays the DataFrameGroupBy object
# itself; chain one (e.g. `.size()`) to see how many rows share each (text, label) pair.
df.groupby(["text","label"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001A1D660A310>

In [33]:
# Rows per distinct text, as computed with Counter(df.text) in the statistics cell.
# NOTE(review): this echoes every value; consider summarising it (e.g. Counter(counts))
# to keep the saved output short.
counts

dict_values([1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 3, 1, 1, 

### perform same steps for dev dataset

In [None]:
# read
# NOTE(review): absolute, machine-specific paths in this cell.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\NLPCC 2020 Shared Task 2 Guideline  Multi-Aspect-based Multi Sentiment Analysis (MAMS)\Dataset_MAMS\ATSA\dev.xml"
# Hand the parser raw bytes so it decodes them with the encoding the XML document
# declares; text mode without an explicit encoding uses the platform default, and a
# wrong decode of non-ASCII characters would shift the aspect-term offsets.
with open(path, 'rb') as f:
    data = BeautifulSoup(f, "xml")

# parse: one record per <sentence> element
dataset = pd.DataFrame(
    [parse_ATSA_sentence(s) for s in tqdm.tqdm(data.find_all("sentence"))]
)

# process: one row per (sentence, aspect term)
dataset = process_mams_dataset(dataset)

# save
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\mams_atsa_dev.pkl"
dataset.to_pickle(path)

# ACSA below is not the focus

#### import data

In [None]:
# Location of the MAMS ACSA training split.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\NLPCC 2020 Shared Task 2 Guideline  Multi-Aspect-based Multi Sentiment Analysis (MAMS)\Dataset_MAMS\ACSA\train.xml"
# Hand the parser raw bytes so it decodes them with the encoding the XML document
# declares, rather than the platform-default text encoding.
with open(path, 'rb') as f:
    data = BeautifulSoup(f, "xml")

In [None]:
def parse_ACSA_sentence(sentence, remove_newlines = True):
    """Extract the text and the aspect-category annotations from one ``<sentence>`` tag.

    Parameters
    ----------
    sentence : bs4.element.Tag
        A parsed ``<sentence>`` element.
    remove_newlines : bool, default True
        When true, every newline character is removed from the extracted text.

    Returns
    -------
    dict
        ``{"text": str, "aspectCategories": tuple}``. Each entry of
        ``aspectCategories`` is a ``(category, polarity)`` tuple; the tuple is
        empty when the sentence has no ``<aspectCategories>`` child.
    """
    text = sentence.text
    if remove_newlines:
        text = text.replace('\n', "")

    container = sentence.aspectCategories
    if container is None:
        # Attribute-style lookup yields None for a missing child; iterating None
        # would raise TypeError, so report "no categories" instead.
        aspectCategories = ()
    else:
        # Skip children that are not elements (e.g. text nodes between the tags).
        aspectCategories = tuple(
            (c['category'], c['polarity'])
            for c in container
            if isinstance(c, Tag)
        )

    return {"text": text, "aspectCategories": aspectCategories}

# Parse ACSA XML dataset

In [None]:
# Turn each <sentence> element into a record, collect the records into a
# DataFrame, and display it as the cell's result.
acsa_records = [
    parse_ACSA_sentence(tag) for tag in tqdm.tqdm(data.find_all("sentence"))
]
dataset = pd.DataFrame(acsa_records)
dataset

### save dataset
- as pandas dataframe

In [None]:
# Destination CSV for the parsed ACSA training split.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\NLPCC 2020 Shared Task 2 Guideline  Multi-Aspect-based Multi Sentiment Analysis (MAMS)\Dataset_MAMS\ACSA\train.csv"
# index = False keeps the row index out of the written file.
dataset.to_csv(path,index = False)

## preprocess and save dev datasets too

In [None]:
# NOTE(review): absolute, machine-specific paths in this cell.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\NLPCC 2020 Shared Task 2 Guideline  Multi-Aspect-based Multi Sentiment Analysis (MAMS)\Dataset_MAMS\ACSA\dev.xml"

# Hand the parser raw bytes so it decodes them with the encoding the XML document
# declares, rather than the platform-default text encoding.
with open(path, 'rb') as f:
    data = BeautifulSoup(f, "xml")

# One record per <sentence> element.
dataset = pd.DataFrame(
    [parse_ACSA_sentence(s) for s in tqdm.tqdm(data.find_all("sentence"))]
)

# index = False keeps the row index out of the written file.
path = r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\NLPCC 2020 Shared Task 2 Guideline  Multi-Aspect-based Multi Sentiment Analysis (MAMS)\Dataset_MAMS\ACSA\dev.csv"
dataset.to_csv(path,index = False)