## SemEval16: Dataset Preprocessing 

- Author: Nana
- Created: 2022.07.21
---
- Goal: preprocess the data to match the format provided by SemEval 2022.
- Source data: SemEval 2016 Task 5 Restaurant datasets
- Conversion: XML to JSON


In [111]:
import os, sys

# Directory that holds the SemEval 2016 XML files to convert.
datadir = '../../semEval16/data'

# Gather the names (not full paths) of every XML file directly inside `datadir`,
# in whatever order os.listdir reports them.
queue = []
for file in os.listdir(datadir):
    if not file.endswith('.xml'):
        continue
    queue.append(file)
queue

['EN_REST_SB1_TEST-gold.xml',
 'EN_REST_SB1_TEST-B.xml',
 'ABSA16_Restaurants_Train_SB1_v2.xml',
 'EN_REST_SB1_TEST-A.xml']

### XML structure 

In [47]:
from xml.etree import ElementTree as ET

# Peek at the first review of the first queued file to see how the XML is laid out:
# root tag, review attributes, then per sentence its attributes, text and first opinion.
filename = f'{datadir}/{queue[0]}'
tree = ET.parse(filename)
root = tree.getroot()
print(root.tag)
for review in root:
    print(review.tag, review.attrib)
    sentences = review.find('sentences')
    for sentence in sentences.findall('sentence'):
        print(sentence.attrib)
        senttext = sentence.find('text')
        print(senttext.text)
        # find() returns None when the child is absent, so guard both levels:
        # a sentence without <Opinions>, or an <Opinions> with no <Opinion>,
        # simply prints no opinion line instead of raising.
        opinions = sentence.find('Opinions')
        first_opinion = opinions.find('Opinion') if opinions is not None else None
        if first_opinion is not None:
            print(first_opinion.attrib)
    break  # only the first review is needed for a structure preview

Reviews
Review {'rid': 'en_BlueRibbonSushi_478218171'}
{'id': 'en_BlueRibbonSushi_478218171:0'}
Yum!
{'target': 'NULL', 'category': 'FOOD#QUALITY', 'polarity': 'positive', 'from': '0', 'to': '0'}
{'id': 'en_BlueRibbonSushi_478218171:1'}
Serves really good sushi.
{'target': 'sushi', 'category': 'FOOD#QUALITY', 'polarity': 'positive', 'from': '19', 'to': '24'}
{'id': 'en_BlueRibbonSushi_478218171:2'}
Not the biggest portions but adequate.
{'target': 'portions', 'category': 'FOOD#STYLE_OPTIONS', 'polarity': 'neutral', 'from': '16', 'to': '24'}
{'id': 'en_BlueRibbonSushi_478218171:3'}
Green Tea creme brulee is a must!
{'target': 'Green Tea creme brulee', 'category': 'FOOD#QUALITY', 'polarity': 'positive', 'from': '0', 'to': '22'}
{'id': 'en_BlueRibbonSushi_478218171:4'}
Don't leave the restaurant without it.
{'target': 'NULL', 'category': 'FOOD#QUALITY', 'polarity': 'positive', 'from': '0', 'to': '0'}


### SemEval 2022 Data Structure

In [87]:
class structs:
    """Namespace of factory functions that build blank output records.

    Each factory returns a brand-new dict (with brand-new nested lists) on every
    call, so records never share mutable state.
    """

    @staticmethod
    def ReviewDict():
        """Return an empty per-sentence record.

        Returns:
            dict: keys 'review_id', 'sent_id', 'text' (empty strings) and
            'opinions' (empty list).
        """
        return {
            'review_id': '',  # extra, to keep the review-sentences structure in SemEval 2016
            'sent_id': '',
            'text': '',
            'opinions': [],
        }

    @staticmethod
    def OpnDict():
        """Return an empty opinion record.

        'Source', 'Target' and 'Polar_expression' each start as a pair of empty
        lists; 'Polarity', 'Intensity' and 'Category' start as empty strings.

        Returns:
            dict: the blank opinion template.
        """
        return {
            'Source': [[], []],
            'Target': [[], []],
            'Polar_expression': [[], []],
            'Polarity': '',
            'Intensity': '',
            'Category': '',  # extra, to keep the gold label of category in SemEval 2016
        }

### Conversion Function

In [109]:
from xml.etree import ElementTree as ET
import json 

In [124]:
def xml2JSON(filename):
    """Convert one review XML file into a JSON file of per-sentence records.

    The XML is expected to be laid out as root -> review (attribute 'rid') ->
    'sentences' -> 'sentence' (attribute 'id') with a 'text' child and an
    optional 'Opinions' child holding 'Opinion' elements. One record
    (see structs.ReviewDict) is produced per sentence; each 'Opinion' becomes
    one entry (see structs.OpnDict) in that record's 'opinions' list. Only
    'Target', 'Polarity' and 'Category' are filled in; the other opinion
    fields keep their blank defaults.

    The result is written next to the input, with a trailing '.xml' replaced
    by '.json'. The root tag and a summary line are printed.

    Args:
        filename: path of the XML file to convert.

    Returns:
        None. The output is the JSON file written to disk.

    Raises:
        KeyError: if a review, sentence or opinion lacks an attribute read here.
    """
    tree = ET.parse(filename)
    root = tree.getroot()
    print(root.tag)
    REVIEWS = []
    for review in root:
        sentences = review.find('sentences')

        for sentence in sentences.findall('sentence'):
            RD = structs.ReviewDict()
            RD['review_id'] = review.attrib['rid']
            RD['sent_id'] = sentence.attrib['id']
            RD['text'] = sentence.find('text').text

            opinions = sentence.find('Opinions')
            # Compare against None explicitly: the truth value of an Element
            # reflects its child count and testing it is deprecated.
            if opinions is not None:
                for op in opinions.findall('Opinion'):
                    op = op.attrib
                    OPD = structs.OpnDict()

                    if op['target'] == 'NULL':
                        # No explicit target: keep the blank [[], []] pair.
                        target = [[], []]
                    else:
                        # Wrap the text in a list so both halves are lists,
                        # matching the [[texts], [offsets]] template shape.
                        target = [[op['target']], [f"{op['from']}:{op['to']}"]]
                    # 'Source' is intentionally left at its blank default.
                    OPD['Target'] = target
                    OPD['Polarity'] = op['polarity']
                    OPD['Category'] = op['category']
                    RD['opinions'].append(OPD)
            REVIEWS.append(RD)

    # Drop only a literal '.xml' suffix; str.rstrip('.xml') would strip any
    # trailing run of the characters '.', 'x', 'm', 'l' and mangle some names.
    suffix = '.xml'
    stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename
    jsonfilename = stem + '.json'
    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
    # instead of relying on the platform default.
    with open(jsonfilename, 'w', encoding='utf-8') as fout:
        json.dump(REVIEWS, fout, ensure_ascii=False, indent=4)
    print(f'{filename} converting to {jsonfilename}.')

In [125]:
# Convert every queued XML file; each call writes a JSON file and prints a summary.
for file in queue:
    filename = '/'.join((datadir, file))
    xml2JSON(filename)

Reviews
../../semEval16/data/EN_REST_SB1_TEST-gold.xml converting to ../../semEval16/data/EN_REST_SB1_TEST-gold.json.
Reviews
../../semEval16/data/EN_REST_SB1_TEST-B.xml converting to ../../semEval16/data/EN_REST_SB1_TEST-B.json.
Reviews
../../semEval16/data/ABSA16_Restaurants_Train_SB1_v2.xml converting to ../../semEval16/data/ABSA16_Restaurants_Train_SB1_v2.json.
Reviews
../../semEval16/data/EN_REST_SB1_TEST-A.xml converting to ../../semEval16/data/EN_REST_SB1_TEST-A.json.
