In [None]:
'''This script reads the train, validation and test data and prepares them for use in modelling.'''

# Import libraries

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from itertools import chain

# Define Functions

In [5]:
# Define a function to load data and transform to data frame
def load_data(data_path, categories=None):
    '''Load an XML review file and return it as a one-hot encoded data frame.

    Each child element of the XML root is treated as one record: the text of
    its first child element becomes the review, and the ``category`` attribute
    of every ``aspectCategory`` element beneath it becomes a label.

    Parameters
    ----------
    data_path : str or file object
        Source passed straight to ``xml.etree.ElementTree.parse``.
    categories : iterable of str, optional
        Label columns to create, in the given order. Pass the same value for
        every split to guarantee identical columns across files; labels found
        in the file but not listed here are ignored. When omitted, the sorted
        set of categories found in this file is used.

    Returns
    -------
    pandas.DataFrame
        A ``review`` column followed by one integer (0/1) column per category.
        A file without records yields an empty frame instead of raising.

    Raises
    ------
    IndexError
        If a record has no child elements.
    KeyError
        If an ``aspectCategory`` element has no ``category`` attribute.
    '''
    root = ET.parse(data_path).getroot()

    reviews = []
    aspects = []
    for sentence in root:
        # The review text is taken positionally from the record's first child.
        reviews.append(sentence[0].text)
        aspects.append(
            [node.attrib['category'] for node in sentence.iter('aspectCategory')]
        )

    if categories is None:
        categories = sorted(set(chain.from_iterable(aspects)))
    else:
        categories = list(categories)

    # Build from named columns so that an empty file still produces a valid
    # frame (assigning column names to a column-less frame raises ValueError).
    data = pd.DataFrame({'review': reviews})

    # One-hot encode aspects
    for category in categories:
        data[category] = pd.Series(
            [int(category in labels) for labels in aspects],
            index=data.index,
            dtype=int,
        )

    return data

In [6]:
# Locations of the raw XML splits.
TRAIN_PATH = '../data/train.xml'
VALID_PATH = '../data/val.xml'
TEST_PATH = '../data/test.xml'

train = load_data(TRAIN_PATH)
validation = load_data(VALID_PATH)
test = load_data(TEST_PATH)

# By default load_data derives the label columns from each file separately, so
# a category missing from one split would silently give that split a different
# schema. Fail fast here rather than at modelling time.
assert list(train.columns) == list(validation.columns) == list(test.columns), \
    'train/validation/test have different columns'

print(train.shape, validation.shape, test.shape)
train.head()

(3149, 9) (400, 9) (400, 9)


Unnamed: 0,review,ambience,food,menu,miscellaneous,place,price,service,staff
0,It might be the best sit down food I've had in...,0,1,0,0,1,0,0,0
1,Hostess was extremely accommodating when we ar...,0,0,0,1,0,0,0,1
2,We were a couple of minutes late for our reser...,0,0,0,1,0,0,0,1
3,"Though the service might be a little slow, the...",0,0,0,0,0,0,1,1
4,Although we arrived at the restaurant 10 min l...,0,0,0,1,0,0,0,1


In [8]:
test.head()

Unnamed: 0,review,ambience,food,menu,miscellaneous,place,price,service,staff
0,"We went again and sat at the bar this time, I ...",0,1,0,0,1,0,0,0
1,"The food was good, but it's not worth the wait...",0,1,0,0,0,0,1,0
2,Waiter took our drink order and then we didn't...,0,0,0,1,0,0,0,1
3,It does get crowded with plenty of blue shirte...,0,1,0,1,0,0,0,0
4,"After hearing all of the specials, you would t...",0,1,0,0,0,1,0,0


In [12]:
# Save as csv (disabled: uncomment the lines below to write the prepared splits to ../data/)
# train.to_csv('../data/train.csv', index=False)
# validation.to_csv('../data/validation.csv', index=False)
# test.to_csv('../data/test.csv', index=False)