In [1]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
import nltk

datalatih=Path.cwd().parent.joinpath(r'C:\Users\USER\Documents\DataScience\Skripsiiuu\ATE-XGBoost\Dataset\train.xml')

extract=None
with datalatih.open(encoding="utf-8")as f:
    extract=BeautifulSoup(f.read().strip(),"lxml-xml")
if extract is None:
    raise Exception("Data XML not found")
sentence_nodes=extract.find_all("sentence")

In [2]:
sentence_nodes

[<sentence id="1004293:0">
 <text>Judging from previous posts this used to be a good place, but not any longer.</text>
 <Opinions>
 <Opinion category="RESTAURANT#GENERAL" from="51" polarity="negative" target="place" to="56"/>
 </Opinions>
 </sentence>,
 <sentence id="1004293:1">
 <text>We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.</text>
 <Opinions>
 <Opinion category="SERVICE#GENERAL" from="75" polarity="negative" target="staff" to="80"/>
 </Opinions>
 </sentence>,
 <sentence id="1004293:2">
 <text>They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.</text>
 <Opinions>
 <Opinion category="SERVICE#GENERAL" from="0" polarity="negative" target="NULL" to="0"/>
 </Opinions>
 </sentence>,
 <sentence id="1004293:3">
 <text>The food was lousy - too sweet or too salty and the portions tiny.</text>
 <Opinions>
 <Opinion category="FOOD#QU

In [3]:
def soup2dict(sentence_nodes):
    nkata=[]
    i=0
    for n in sentence_nodes:
        i+=1
        kata={}
        aspect_term=[]
        kata['id']=i
        kata['text']=n.find('text').string
        if n.find('Opinions'):
            for c in n.find('Opinions').contents:
                if c.name=='Opinion':
                    if c['target'] not in aspect_term:
                        aspect_term.append(c['target'])
        kata['aspect']=aspect_term
        nkata.append(kata)
    return(nkata)        

In [4]:
kata = soup2dict(sentence_nodes)
kata

[{'id': 1,
  'text': 'Judging from previous posts this used to be a good place, but not any longer.',
  'aspect': ['place']},
 {'id': 2,
  'text': 'We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.',
  'aspect': ['staff']},
 {'id': 3,
  'text': 'They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.',
  'aspect': ['NULL']},
 {'id': 4,
  'text': 'The food was lousy - too sweet or too salty and the portions tiny.',
  'aspect': ['food', 'portions']},
 {'id': 5,
  'text': 'After all that, they complained to me about the small tip.',
  'aspect': ['NULL']},
 {'id': 6, 'text': 'Avoid this place!', 'aspect': ['place']},
 {'id': 7,
  'text': 'I have eaten at Saul, many times, the food is always consistently, outrageously good.',
  'aspect': ['food']},
 {'id': 8,
  'text': 'Saul is the best restaurant on Smith Street and in Brooklyn.',
  'aspec

In [5]:
def sen_process(kata):
    kata = re.sub('([.,!?()])', r' \1 ', kata) # match the punctuation characters and surround them by spaces,
    kata = re.sub('\s{2,}', ' ', kata)         # collapse multiple spaces to one space
    kata = re.sub(r'[^\w\s]', '', kata)
    words = kata.lower().split()
    return words

In [6]:
# tagging using POS tagging

# def post_tag(review):
#     tagged_review=[]
#     for each_review in tqdm(review):
#         review=nltk.word_tokeniza(each_review)
#         tagged_review.append(nltk.post_tag(review))
#     return tagged_review

# def posttagTest(review):
#     for word in review:
#         review=nltk.word_tokeniza(review)
#         tagged_reviews=nltk.post_tag(review)
#     return tagged_reviews

In [7]:
def IOB_tag(s, aspects):
    tags = ['O'] * len(s)
    for aspect in aspects:
        pre_index = 0
        for word in s: 
            if word in aspect: # 'good' in 'a good place'
                cur_index = s.index(word) 
                if cur_index - pre_index == 1: # inside an aspect term
                    tags[cur_index] = 'I'
                else:                       # beginning of an aspect term
                    tags[cur_index] = 'B'
                pre_index = cur_index 
    return tags

In [8]:
def opinion_rule(kata_postag):
    results_tree=[]
    grammar="NP: {<DT|PP|CD|RB>?<JJ|JJR|JJS>*<NN|NNS|PRpP|NNP}"
    cp=nltk.RegexpParser(grammar)
    for tag in kata_postag:
        results_tree.append(cp.parse(tag))
    return results_tree



In [9]:
def dict2df(kata):
    data = pd.DataFrame()
    for s in kata:
        sentence = {}
        sentence['Sentence #'] = s['id']
        sentence['Word'] = sen_process(s['text'])  # split text to words
        s_length = len(sentence['Word']) # the length of sentence, used to generate tag
        if len(s['aspect'])==0 or s['aspect'][0] == 'NULL': # tagging: if no aspect term
            sentence['Tag'] = ['O'] * s_length
        else:                                               # IOB format tag if aspect exist
            aspect_terms = [x.lower() for x in s['aspect']]  
            sentence['Tag'] = IOB_tag(sentence['Word'], aspect_terms)

        # convert each setence to dataframe 
        sentence_df = pd.DataFrame.from_dict(sentence)
        data = data.append(sentence_df, ignore_index=True)
    
    return data

In [10]:
data=dict2df(kata)
data

  data = data.append(sentence_df, ignore_index=True)


Unnamed: 0,Sentence #,Word,Tag
0,1,judging,O
1,1,from,O
2,1,previous,O
3,1,posts,O
4,1,this,O
...,...,...,...
25031,2000,someone,O
25032,2000,would,O
25033,2000,retrain,O
25034,2000,the,O


In [11]:
datatag=data.groupby('Tag').count()
datatag

Unnamed: 0_level_0,Sentence #,Word
Tag,Unnamed: 1_level_1,Unnamed: 2_level_1
B,1727,1727
I,1135,1135
O,22174,22174


In [12]:
datatag['Word']

Tag
B     1727
I     1135
O    22174
Name: Word, dtype: int64

In [13]:
datauji=Path.cwd().parent.joinpath(r'C:\Users\USER\Documents\DataScience\Skripsiiuu\ATE-XGBoost\Dataset\test.xml')

extract=None
with datauji.open(encoding="utf-8")as f:
    extract=BeautifulSoup(f.read().strip(),"lxml-xml")
if extract is None:
    raise Exception("Data XML not found")
sentence_nodes=extract.find_all("sentence")


katauji = soup2dict(sentence_nodes)
katauji

ujidata=dict2df(katauji)
ujidata

  data = data.append(sentence_df, ignore_index=True)


Unnamed: 0,Sentence #,Word,Tag
0,1,yum,O
1,2,serves,O
2,2,really,O
3,2,good,O
4,2,sushi,B
...,...,...,...
8538,676,the,O
8539,676,view,B
8540,676,was,O
8541,676,good,O


In [14]:
def read_data(file_path):
    # 1 raw data to soup
    soup = None
    with file_path.open(encoding="utf-8") as f:
        soup = BeautifulSoup(f.read().strip(), "lxml-xml")
    if soup is None:
        raise Exception("Can't read xml file")
    sentence_nodes = soup.find_all("sentence")

    # 2  convert soup object to a list of dictionaries
    sentences = soup2dict(sentence_nodes)

    # 3 list to dataframe
    data = dict2df(sentences)
    
    return data

In [15]:
#saving to csv file
data.to_csv("Dataset/Train_Restaurant.csv")

In [16]:
#saving test data to csv file too
ujidata.to_csv("Dataset/Test_Restaurant.csv")

In [17]:
#Using XGBoost
