In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
tree = ET.parse('Datasets/Restaurants_Train.xml')
sentences = tree.getroot()

## Category (one hot encoding)

In [3]:
dict = {}
se_count = 0
counter = 0
for count, sentence in enumerate(sentences):
    texts = []
    aspect_terms = []
    aspect_categories  = []
    term_polarities = []
    category_polarities = []

    text = sentence.find('text').text
    texts.append(text)

    aspect_term_elements = sentence.find('aspectTerms')
    if aspect_term_elements is not None:
        for aspect_term in aspect_term_elements.findall('aspectTerm'):
            
            term = aspect_term.get('term')
            term_polarity = aspect_term.get('polarity')
            aspect_terms.append(term)
            term_polarities.append(term_polarity)
    else:
        aspect_terms.append(None)
        term_polarities.append(None)
    
    aspect_category_elements = sentence.find('aspectCategories')
    if aspect_category_elements is not None:
        for aspect_category in aspect_category_elements.findall('aspectCategory'):
            category = aspect_category.get('category')
            category_polarity = aspect_category.get('polarity')
            aspect_categories.append(category)
            category_polarities.append(category_polarity)
    else:
        aspect_categories.append(None)
        category_polarities.append(None)

    

    for i in range(len(aspect_terms)):
        for j in range(len(aspect_categories)):
            dict[counter] = {
                'text':texts[0],
                'aspect_term':aspect_terms[i],
                'term_polarity':term_polarities[i],
                'aspect_category':aspect_categories[j],
                'category_polarity':category_polarities[j]
            }
            counter += 1
        counter += 1

In [4]:
df = pd.DataFrame.from_dict(dict).T.reset_index().drop(columns=['index'])
df.head(10)

Unnamed: 0,text,aspect_term,term_polarity,aspect_category,category_polarity
0,But the staff was so horrible to us.,staff,negative,service,negative
1,"To be completely fair, the only redeeming fact...",food,positive,food,positive
2,"To be completely fair, the only redeeming fact...",food,positive,anecdotes/miscellaneous,negative
3,"The food is uniformly exceptional, with a very...",food,positive,food,positive
4,"The food is uniformly exceptional, with a very...",kitchen,positive,food,positive
5,"The food is uniformly exceptional, with a very...",menu,neutral,food,positive
6,Where Gabriela personaly greets you and recomm...,,,service,positive
7,"For those that go once and don't enjoy it, all...",,,anecdotes/miscellaneous,positive
8,"Not only was the food outstanding, but the lit...",food,positive,food,positive
9,"Not only was the food outstanding, but the lit...",food,positive,service,positive


In [5]:
df_one_hot = pd.get_dummies(df['aspect_category'], drop_first=False, dtype=int)
df_one_hot = pd.concat([df, df_one_hot], axis=1)
df_one_hot = df_one_hot.groupby(df_one_hot['text']).aggregate({'aspect_term':'first', 'term_polarity':'first', 'category_polarity':'first', 'ambience':'max', 'anecdotes/miscellaneous':'max', 'food':'max', 'price':'max', 'service':'max'})
df_one_hot = df_one_hot.reset_index()

In [6]:
df_category = df_one_hot.copy()
df_category.head()

Unnamed: 0,text,aspect_term,term_polarity,category_polarity,ambience,anecdotes/miscellaneous,food,price,service
0,"$160 for 2 filets, 2 sides, an appetizer and d...",filets,neutral,neutral,0,0,1,1,0
1,$20 for all you can eat sushi cannot be beaten.,sushi,neutral,positive,0,0,0,1,0
2,$20 gets you unlimited sushi of a very high qu...,sushi,positive,positive,0,0,1,1,0
3,"$6 and there is much tasty food, all of it fre...",food,positive,positive,0,0,1,1,0
4,"($200 for 2 glasses of champagne, not too expe...",glasses of champagne,negative,negative,0,0,0,1,0


## Category + sentiment

In [7]:
df_catsent_hot = pd.get_dummies(df['aspect_category'], drop_first=False, dtype=int)
df_catsent_hot = pd.concat([df, df_catsent_hot], axis=1)
df_catsent_hot.head()

Unnamed: 0,text,aspect_term,term_polarity,aspect_category,category_polarity,ambience,anecdotes/miscellaneous,food,price,service
0,But the staff was so horrible to us.,staff,negative,service,negative,0,0,0,0,1
1,"To be completely fair, the only redeeming fact...",food,positive,food,positive,0,0,1,0,0
2,"To be completely fair, the only redeeming fact...",food,positive,anecdotes/miscellaneous,negative,0,1,0,0,0
3,"The food is uniformly exceptional, with a very...",food,positive,food,positive,0,0,1,0,0
4,"The food is uniformly exceptional, with a very...",kitchen,positive,food,positive,0,0,1,0,0


In [8]:
for i, row in df_catsent_hot.iterrows():
    if row['category_polarity'] == 'negative':
        df_catsent_hot.loc[i, row['aspect_category']] = 1
    elif row['category_polarity'] == 'positive':
        df_catsent_hot.loc[i, row['aspect_category']] = 2
    else:
        df_catsent_hot.loc[i, row['aspect_category']] = 3

df_catsent_hot.head()

Unnamed: 0,text,aspect_term,term_polarity,aspect_category,category_polarity,ambience,anecdotes/miscellaneous,food,price,service
0,But the staff was so horrible to us.,staff,negative,service,negative,0,0,0,0,1
1,"To be completely fair, the only redeeming fact...",food,positive,food,positive,0,0,2,0,0
2,"To be completely fair, the only redeeming fact...",food,positive,anecdotes/miscellaneous,negative,0,1,0,0,0
3,"The food is uniformly exceptional, with a very...",food,positive,food,positive,0,0,2,0,0
4,"The food is uniformly exceptional, with a very...",kitchen,positive,food,positive,0,0,2,0,0


In [9]:
df_catsent_hot = df_catsent_hot.groupby(df_catsent_hot['text']).aggregate({'aspect_term':'first', 'term_polarity':'first', 'category_polarity':'first', 'ambience':'max', 'anecdotes/miscellaneous':'max', 'food':'max', 'price':'max', 'service':'max'})
df_catsent_hot.reset_index(inplace=True)
df_catsent_hot.head()

Unnamed: 0,text,aspect_term,term_polarity,category_polarity,ambience,anecdotes/miscellaneous,food,price,service
0,"$160 for 2 filets, 2 sides, an appetizer and d...",filets,neutral,neutral,0,0,3,3,0
1,$20 for all you can eat sushi cannot be beaten.,sushi,neutral,positive,0,0,0,2,0
2,$20 gets you unlimited sushi of a very high qu...,sushi,positive,positive,0,0,2,2,0
3,"$6 and there is much tasty food, all of it fre...",food,positive,positive,0,0,2,2,0
4,"($200 for 2 glasses of champagne, not too expe...",glasses of champagne,negative,negative,0,0,0,1,0


## Dataset

In [13]:
text, aspect_term, term_polarity, category_polarity, ambience, anec_misc, food, price, service = df_category.iloc[0].values
print(text)

$160 for 2 filets, 2 sides, an appetizer and drinks.


In [14]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize(text)
tokens = ['[cls]'] + tokens + ['[sep]']

['$',
 '160',
 'for',
 '2',
 'file',
 '##ts',
 ',',
 '2',
 'sides',
 ',',
 'an',
 'app',
 '##eti',
 '##zer',
 'and',
 'drinks',
 '.']

In [11]:
from torch.utils.data import Dataset
import torch

class ACDDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
    
    def __getitem__(self, index):
        
        text, aspect_term, term_polarity, category_polarity, ambience, anec_misc, food, price, service = self.df.iloc[index].values

        tokens = self.tokenizer.tokenize(text)

        for i in range

        return 
    
    def __len__(self):
        return len(self.df)


True

In [191]:
df_one_hot = pd.get_dummies(df['aspect_category'], drop_first=False, dtype=int)

In [192]:
df_one_hot = pd.concat([df, df_one_hot], axis=1)

In [194]:
df_one_hot = df_one_hot.groupby(df_one_hot['text']).aggregate({'aspect_term':'first', 'term_polarity':'first', 'category_polarity':'first', 'ambience':'max', 'anecdotes/miscellaneous':'max', 'food':'max', 'price':'max', 'service':'max'})

In [195]:
df_one_hot.head()

Unnamed: 0_level_0,aspect_term,term_polarity,category_polarity,ambience,anecdotes/miscellaneous,food,price,service
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"$160 for 2 filets, 2 sides, an appetizer and drinks.",filets,neutral,neutral,0,0,1,1,0
$20 for all you can eat sushi cannot be beaten.,sushi,neutral,positive,0,0,0,1,0
$20 gets you unlimited sushi of a very high quality- I even took a friend here from Japan who said it was one of the best sushi places in the US that he has been to.,sushi,positive,positive,0,0,1,1,0
"$6 and there is much tasty food, all of it fresh and continually refilled.",food,positive,positive,0,0,1,1,0
"($200 for 2 glasses of champagne, not too expensive bottle of wine and 2 after dinner drinks).",glasses of champagne,negative,negative,0,0,0,1,0


In [196]:
df_one_hot = df_one_hot.reset_index()
df_one_hot.head()

Unnamed: 0,text,aspect_term,term_polarity,category_polarity,ambience,anecdotes/miscellaneous,food,price,service
0,"$160 for 2 filets, 2 sides, an appetizer and d...",filets,neutral,neutral,0,0,1,1,0
1,$20 for all you can eat sushi cannot be beaten.,sushi,neutral,positive,0,0,0,1,0
2,$20 gets you unlimited sushi of a very high qu...,sushi,positive,positive,0,0,1,1,0
3,"$6 and there is much tasty food, all of it fre...",food,positive,positive,0,0,1,1,0
4,"($200 for 2 glasses of champagne, not too expe...",glasses of champagne,negative,negative,0,0,0,1,0


In [164]:
df_one_hot[df_one_hot['text'] == '$160 for 2 filets, 2 sides, an appetizer and drinks.']

Unnamed: 0,text,aspect_term,term_polarity,aspect_category,category_polarity,ambience,anecdotes/miscellaneous,food,price,service
3557,"$160 for 2 filets, 2 sides, an appetizer and d...",filets,neutral,food,neutral,0,0,1,0,0
3558,"$160 for 2 filets, 2 sides, an appetizer and d...",filets,neutral,price,neutral,0,0,0,1,0
3559,"$160 for 2 filets, 2 sides, an appetizer and d...",sides,neutral,food,neutral,0,0,1,0,0
3560,"$160 for 2 filets, 2 sides, an appetizer and d...",sides,neutral,price,neutral,0,0,0,1,0
3561,"$160 for 2 filets, 2 sides, an appetizer and d...",appetizer,neutral,food,neutral,0,0,1,0,0
3562,"$160 for 2 filets, 2 sides, an appetizer and d...",appetizer,neutral,price,neutral,0,0,0,1,0
3563,"$160 for 2 filets, 2 sides, an appetizer and d...",drinks,neutral,food,neutral,0,0,1,0,0
3564,"$160 for 2 filets, 2 sides, an appetizer and d...",drinks,neutral,price,neutral,0,0,0,1,0


In [127]:
df_one_hot = df_one_hot.groupby(df_one_hot['text']).aggregate({'ambience':'min', 'anecdotes/miscellaneous':'min', 'food':'min', 'price':'min', 'service':'min'})

In [117]:
df_one_hot

Unnamed: 0_level_0,ambience,anecdotes/miscellaneous,food,price,service
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"$160 for 2 filets, 2 sides, an appetizer and drinks.",0,0,4,4,0
$20 for all you can eat sushi cannot be beaten.,0,0,0,1,0
$20 gets you unlimited sushi of a very high quality- I even took a friend here from Japan who said it was one of the best sushi places in the US that he has been to.,0,0,3,3,0
"$6 and there is much tasty food, all of it fresh and continually refilled.",0,0,1,1,0
"($200 for 2 glasses of champagne, not too expensive bottle of wine and 2 after dinner drinks).",0,0,0,3,0
...,...,...,...,...,...
would have rather tried terrace in the sky or water club for that price,0,1,0,1,0
"wow! how have i missed this one ,tried Long Tan last week for the first time and now know what the NY TIMES and many more in the neighborhood already knew.",0,1,0,0,0
you can actually get 2 salads worth if u take it home and add it to some lettuce!,0,0,2,0,0
you guys rock.,0,1,0,0,0


In [87]:
import pandas as pd
from collections import defaultdict

# Parse the XML data
import xml.etree.ElementTree as ET
tree = ET.parse('Datasets/Restaurants_Train.xml')
root = tree.getroot()

# Extract the aspect terms and categories
aspect_terms = []
aspect_categories = []
for sentence in root.findall('sentence'):
    text = sentence.find('text').text
    for aspectTerm in sentence.findall('aspectTerms/aspectTerm'):
        aspect_terms.append((text, aspectTerm.attrib['term'], aspectTerm.attrib['polarity']))
    for aspectCategory in sentence.findall('aspectCategories/aspectCategory'):
        aspect_categories.append((text, aspectCategory.attrib['category'], aspectCategory.attrib['polarity']))

# Create the dataframe
df = pd.DataFrame(aspect_terms, columns=['text', 'aspect_term', 'aspect_term_polarity'])
df = pd.concat([df, pd.DataFrame(aspect_categories, columns=['text', 'aspect_category', 'aspect_category_polarity'])], ignore_index=True)

# One-hot encode the aspect terms and categories
aspect_term_encoder = df['aspect_term'].unique()
aspect_category_encoder = df['aspect_category'].unique()

df_encoded = pd.DataFrame(index=df['text'].unique())

for term in aspect_term_encoder:
    df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)

for category in aspect_category_encoder:
    df_encoded[f'aspect_category_{category}'] = df['aspect_category'].apply(lambda x: 1 if x == category else 0)

# Fill NaN values with 0
df_encoded = df_encoded.fillna(0)

print(df_encoded)

  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_term'].apply(lambda x: 1 if x == term else 0)
  df_encoded[f'aspect_term_{term}'] = df['aspect_t

                                                    aspect_term_staff  \
But the staff was so horrible to us.                              0.0   
To be completely fair, the only redeeming facto...                0.0   
The food is uniformly exceptional, with a very ...                0.0   
Not only was the food outstanding, but the litt...                0.0   
Our agreed favorite is the orrechiete with saus...                0.0   
...                                                               ...   
Better than the bagel shop on the corner, but n...                0.0   
it helps if you know what to order.                               0.0   
But that is highly forgivable.                                    0.0   
When we arrived at 6:00 PM, the restaurant was ...                0.0   
I am going to the mid town location next.                         0.0   

                                                    aspect_term_food  \
But the staff was so horrible to us.               

In [88]:
df_encoded.head()

Unnamed: 0,aspect_term_staff,aspect_term_food,aspect_term_kitchen,aspect_term_menu,aspect_term_perks,aspect_term_orrechiete with sausage and chicken,aspect_term_waiters,aspect_term_meats,aspect_term_dish,aspect_term_Bagels,...,aspect_term_cheese sticks,aspect_term_pot of boiling water,aspect_term_glass noodles,aspect_term_nan,aspect_category_nan,aspect_category_service,aspect_category_food,aspect_category_anecdotes/miscellaneous,aspect_category_price,aspect_category_ambience
But the staff was so horrible to us.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Not only was the food outstanding, but the little 'perks' were great.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Our agreed favorite is the orrechiete with sausage and chicken (usually the waiters are kind enough to split the dish in half so you get to sample both meats).,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
