In [62]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Data Import

In [63]:
import pandas as pd
import xml.etree.ElementTree as ET

In [64]:
parsedXML = ET.parse("/content/drive/My Drive/Masterarbeit/Data/Original/SEMEVAL-14/Restaurants_Test_Gold.xml")

In [65]:
aspect_number = 14

In [66]:
dfcols = ['id','text']
for ii in range(1,aspect_number):
    dfcols.append("aspect_term_{}".format(ii))
    dfcols.append("aspect_polarity_{}".format(ii))
    dfcols.append("aspect_from_{}".format(ii))
    dfcols.append("aspect_to_{}".format(ii))          
df = pd.DataFrame(columns=dfcols)

for sentence in parsedXML.getroot():
    id = sentence.attrib.get('id')
    text = sentence.find('text').text
    line = [id,text]

    for asp in sentence.iter('aspectTerm'):
        term = asp.attrib.get("term")
        pol = asp.attrib.get("polarity")
        a_from = asp.attrib.get("from")
        a_to = asp.attrib.get("to")

        line += [term, pol, a_from, a_to]

    if len(line) < len(dfcols):
        pads = [None] * (len(dfcols)-len(line))
        line += pads
    
    df = df.append(pd.Series(line, index=dfcols), ignore_index=True)

In [67]:
df

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10,aspect_term_11,aspect_polarity_11,aspect_from_11,aspect_to_11,aspect_term_12,aspect_polarity_12,aspect_from_12,aspect_to_12,aspect_term_13,aspect_polarity_13,aspect_from_13,aspect_to_13
0,32897564#894393#2,The bread is top notch as well.,bread,positive,4,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,33070600#670328#0,I have to say they have one of the fastest del...,delivery times,positive,43,57,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,33070600#670328#2,Food is always fresh and hot- ready to eat!,Food,positive,0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,36244464#949326#5,Did I mention that the coffee is OUTSTANDING?,coffee,positive,23,29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,32894246#870052#0,"Certainly not the best sushi in New York, howe...",sushi,conflict,23,28,place,positive,79,84,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,11661949#1709112#8,"Anyway, the owner was fake.",owner,negative,12,17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
796,35698240#574381#2,Owner is pleasant and entertaining.,Owner,positive,0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
797,11350390#802808#3,"I have never in my life sent back food before,...",food,negative,34,38,waiter,negative,76,82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
798,11482070#503858#3,"Although the restaurant itself is nice, I pref...",food,negative,67,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Drop duplicates

In [68]:
df[df.duplicated("text",keep=False)]

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10,aspect_term_11,aspect_polarity_11,aspect_from_11,aspect_to_11,aspect_term_12,aspect_polarity_12,aspect_from_12,aspect_to_12,aspect_term_13,aspect_polarity_13,aspect_from_13,aspect_to_13


In [69]:
text_counts = df.text.value_counts()
text_counts[:10]

They have very quick service which is great when you don't have much time.                                                                        1
Its good to go there for drinks if you don't want to get drunk because you'll be lucky if you can get one drink an hour the service is so bad.    1
it is a hidden delight complete with a quaint bar and good food.                                                                                  1
This place has beautiful sushi, and it's delicious CHEAP.                                                                                         1
I would definitely go back -- if only for some of those exotic martinis on the blackboard.                                                        1
The Deco and ambiance was really romantic.                                                                                                        1
They are served on Focacchia bread and are to die for.                                                          

# First Descriptive Analysis


## Aspects per sentence

In [70]:
pol_cols = ["aspect_polarity_"+str(ii) for ii in range(1,aspect_number)]

In [71]:
prev = 0
total_asp = 0
for no, col in enumerate(pol_cols):
  
    if col != "aspect_polarity_1":
        print("sentences with exactly ", no, "aspects:", prev - sum(df[col].value_counts()))
        total_asp += no * (prev - sum(df[col].value_counts()))

    if col == "aspect_polarity_"+str(aspect_number-1):
        print("sentences with exactly ", no+1, "aspects:", sum(df[col].value_counts()))
        total_asp += (no+1) * sum(df[col].value_counts())


    prev = sum(df[col].value_counts())

print("total no of aspects: ", total_asp)

sentences with exactly  1 aspects: 290
sentences with exactly  2 aspects: 186
sentences with exactly  3 aspects: 80
sentences with exactly  4 aspects: 31
sentences with exactly  5 aspects: 14
sentences with exactly  6 aspects: 3
sentences with exactly  7 aspects: 1
sentences with exactly  8 aspects: 0
sentences with exactly  9 aspects: 0
sentences with exactly  10 aspects: 0
sentences with exactly  11 aspects: 0
sentences with exactly  12 aspects: 0
sentences with exactly  13 aspects: 1
total no of aspects:  1134


## Sentiment Frequency

In [72]:
df_pol = df.loc[:,pol_cols]
df_pol_counts = df_pol.apply(pd.Series.value_counts)
df_pol_counts.sum(axis=1)

conflict     14.0
negative    196.0
neutral     196.0
positive    728.0
dtype: float64

In [73]:
sum(df_pol_counts.sum(axis=1))

1134.0

## Sentences with more than one aspect

In [76]:
multi_counter = 0
for line in df.index:
    sentiment_list = []
    for col in pol_cols:
        if df.loc[line,col] != None:
            sentiment_list += [df.loc[line,col]]
    if len(set(sentiment_list)) > 1:
        multi_counter += 1
multi_counter

85

# Remove "conflict"

In [77]:
for line in range(len(df)):
    for col in pol_cols:
        if df.loc[line,col] == "conflict":
            df.loc[line,col] = None
            number = col[-1:]
            df.loc[line,"aspect_term_"+str(number)] = None
            df.loc[line,"aspect_to_"+str(number)] = None
            df.loc[line,"aspect_from_"+str(number)] = None

In [78]:
df

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10,aspect_term_11,aspect_polarity_11,aspect_from_11,aspect_to_11,aspect_term_12,aspect_polarity_12,aspect_from_12,aspect_to_12,aspect_term_13,aspect_polarity_13,aspect_from_13,aspect_to_13
0,32897564#894393#2,The bread is top notch as well.,bread,positive,4,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,33070600#670328#0,I have to say they have one of the fastest del...,delivery times,positive,43,57,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,33070600#670328#2,Food is always fresh and hot- ready to eat!,Food,positive,0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,36244464#949326#5,Did I mention that the coffee is OUTSTANDING?,coffee,positive,23,29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,32894246#870052#0,"Certainly not the best sushi in New York, howe...",,,,,place,positive,79,84,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,11661949#1709112#8,"Anyway, the owner was fake.",owner,negative,12,17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
796,35698240#574381#2,Owner is pleasant and entertaining.,Owner,positive,0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
797,11350390#802808#3,"I have never in my life sent back food before,...",food,negative,34,38,waiter,negative,76,82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
798,11482070#503858#3,"Although the restaurant itself is nice, I pref...",food,negative,67,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Check for wrong positions

In [79]:
mistakes = []
for ii in range(len(df)):
    for xx in range(1,aspect_number):
        asp_col = "aspect_term_"+str(xx)
        from_col = "aspect_from_"+str(xx)
        to_col = "aspect_to_"+str(xx)
        actual_term = df.loc[ii,asp_col]
        if actual_term != None:
            pos_term = df.text[ii][int(df.loc[ii,from_col]):int(df.loc[ii,to_col])]
            if actual_term != pos_term:
                mistakes += [ii]
                print(actual_term, pos_term)
mistakes

[]

# Check for wrong aspect terms

In [80]:
import nltk

In [81]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [82]:
for ii in range(len(df)):
    tokens = nltk.word_tokenize(df.text[ii])
    for xx in range(1,aspect_number):
        actual_term = df.loc[ii,"aspect_term_"+str(xx)]
        if actual_term != None:
            for asp_part in nltk.word_tokenize(actual_term):
                if asp_part not in tokens and asp_part+"-" not in tokens:
                    print(ii,"-",xx,":",tokens)
                    print(actual_term, asp_part)

453 - 3 : ['You', 'must', 'try', 'Odessa', 'stew', 'or', 'Rabbit', 'stew', ';', 'salads-all', 'good', ';', 'and', 'kompot', 'is', 'soo', 'refreshing', 'during', 'the', 'hot', 'summer', 'day', '(', 'they', 'make', 'it', 'the', 'way', 'my', 'mom', 'does', ',', 'reminds', 'me', 'of', 'home', 'a', 'lot', ')', '.']
salads salads


# Drop lines without aspects

In [83]:
def pol_to_no(sentiment):
  
    if sentiment == "positive":
        pol = 1
    elif sentiment == "negative":
        pol = -1
    elif sentiment == "neutral":
        pol = 0

    return pol

In [84]:
all_pols = []
no_pol = []
for line in df.index:
    pols = []
    for col in pol_cols:
        if df.loc[line,col] != None:
            pols += [pol_to_no(df.loc[line,col])]
    if len(pols) > 0:
        all_pols += [pols]
    else:
        no_pol += [line]

df_pol = pd.DataFrame(data=all_pols)

In [85]:
df.drop(no_pol, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10,aspect_term_11,aspect_polarity_11,aspect_from_11,aspect_to_11,aspect_term_12,aspect_polarity_12,aspect_from_12,aspect_to_12,aspect_term_13,aspect_polarity_13,aspect_from_13,aspect_to_13
0,32897564#894393#2,The bread is top notch as well.,bread,positive,4,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,33070600#670328#0,I have to say they have one of the fastest del...,delivery times,positive,43,57,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,33070600#670328#2,Food is always fresh and hot- ready to eat!,Food,positive,0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,36244464#949326#5,Did I mention that the coffee is OUTSTANDING?,coffee,positive,23,29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,32894246#870052#0,"Certainly not the best sushi in New York, howe...",,,,,place,positive,79,84,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,11661949#1709112#8,"Anyway, the owner was fake.",owner,negative,12,17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
596,35698240#574381#2,Owner is pleasant and entertaining.,Owner,positive,0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
597,11350390#802808#3,"I have never in my life sent back food before,...",food,negative,34,38,waiter,negative,76,82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
598,11482070#503858#3,"Although the restaurant itself is nice, I pref...",food,negative,67,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Final Descriptive Analysis


## Aspects per sentence

In [86]:
pol_cols = ["aspect_polarity_"+str(ii) for ii in range(1,aspect_number)]

In [87]:
prev = 0
total_asp = 0
for no, col in enumerate(pol_cols):
  
    if col != "aspect_polarity_1":
        print("sentences with exactly ", no, "aspects:", prev - sum(df[col].value_counts()))
        total_asp += no * (prev - sum(df[col].value_counts()))

    if col == "aspect_polarity_"+str(aspect_number-1):
        print("sentences with exactly ", no+1, "aspects:", sum(df[col].value_counts()))
        total_asp += (no+1) * sum(df[col].value_counts())


    prev = sum(df[col].value_counts())

print("total no of aspects: ", total_asp)

sentences with exactly  1 aspects: 282
sentences with exactly  2 aspects: 186
sentences with exactly  3 aspects: 78
sentences with exactly  4 aspects: 31
sentences with exactly  5 aspects: 14
sentences with exactly  6 aspects: 3
sentences with exactly  7 aspects: 1
sentences with exactly  8 aspects: 0
sentences with exactly  9 aspects: 0
sentences with exactly  10 aspects: 0
sentences with exactly  11 aspects: 0
sentences with exactly  12 aspects: 0
sentences with exactly  13 aspects: 1
total no of aspects:  1120


## Sentiment Frequency

In [88]:
df_pol = df.loc[:,pol_cols]
df_pol_counts = df_pol.apply(pd.Series.value_counts)
df_pol_counts.sum(axis=1)

negative    196.0
neutral     196.0
positive    728.0
dtype: float64

In [89]:
sum(df_pol_counts.sum(axis=1))

1120.0

## Sentences with more than one aspect

In [90]:
multi_counter = 0
for line in df.index:
    sentiment_list = []
    for col in pol_cols:
        if df.loc[line,col] != None:
            sentiment_list += [df.loc[line,col]]
    if len(set(sentiment_list)) > 1:
        multi_counter += 1
multi_counter

80

# Save as xml

In [None]:
root = ET.Element('sentences')

for line in df.index:
    name = "sentence"
    entry = ET.SubElement(root, name)
    entry.set("id", str(df["id"][line]))

    text_child = ET.SubElement(entry, "text")
    text_child.text = str(df["text"][line])

    asp_child = ET.SubElement(entry, "aspectTerms")
    for xx in range(1,aspect_number):
        if df.loc[line,"aspect_term_"+str(xx)] != None:
            asp_subchild = ET.SubElement(asp_child, "aspectTerm")
            asp_subchild.set("from",str(df["aspect_from_"+str(xx)][line]))
            asp_subchild.set("polarity",str(df["aspect_polarity_"+str(xx)][line]))
            asp_subchild.set("term",str(df["aspect_term_"+str(xx)][line]))
            asp_subchild.set("to",str(df["aspect_to_"+str(xx)][line]))

xml_data = ET.tostring(root)

In [None]:
with open('/content/drive/My Drive/Masterarbeit/Data/Final/SEMEVAL-14-REST/test.xml', 'w') as f:  # Write in file as utf-8
    f.write(xml_data.decode('utf-8'))

# Create xml.seg

In [None]:
def xml_seg_maker(df):

    df_wo_id = df.drop(columns="id", axis=1, inplace=False)
    df_wo_id.reset_index(drop=True, inplace=True)
    data_lines = []

    for ii in range(len(df_wo_id)):

        line = list(df_wo_id.loc[ii])
        o_text = line[0]
        aspects = [line[xx] for xx in range(1,len(df_wo_id.loc[0]),4) if line[xx] != None]
        pols = [line[xx] for xx in range(2,len(df_wo_id.loc[0]),4) if line[xx] != None]

        for asp in range(len(aspects)):
            text = o_text.replace(aspects[asp],'$T$')
            pol = str(pol_to_no(pols[asp]))

            data_lines += [text,aspects[asp],pol]

    return data_lines

In [None]:
xml_seg_data = xml_seg_maker(df)

with open('/content/drive/My Drive/Masterarbeit/Data/Final/SEMEVAL-14-REST/test.xml.seg', 'w') as f:
    f.write('\n'.join(xml_seg_data))

# Create BERT+txt

In [None]:
asp_cols = ["aspect_term_"+str(ii) for ii in range(1,aspect_number)]

In [None]:
def sent_conv(sentiment):

    if sentiment == "positive":
        return "POS"
    elif sentiment == "negative":
        return "NEG"
    elif sentiment == "neutral":
        return "NEU"

In [None]:
def txt_maker(df):

    data_lines = []

    for line in df.index:

        text = df.loc[line,"text"]        
        tokens = nltk.word_tokenize(text)

        # correct tokens
        for no,tok in enumerate(tokens):
            if tok[-1:] == "-" and len(tok)>2:
                tokens[no] = tok[:-1]
            if tok[:1] == "'" and len(tok)>3:
                tokens[no] = tok[1:]
            if tok in ["'","(",")"]:
                tokens.remove(tok)

        # create aspect-polarity dict
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = sent_conv(df.loc[line,pol_cols[col]])
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())


        label = ""
        # check for one-word-aspects
        for tok in tokens:
            if tok in asp_sent_dict.keys():
                label += tok + "=T-" + asp_sent_dict[tok] + " "
            else:
                label += tok + "=O "
        label = label[:-1]

        # check for multi-word-aspects
        for ii in range(2,max_asp_len+1):
            for no,tok in enumerate(tokens):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok in asp_sent_dict.keys():
                    new_pol = asp_sent_dict[new_tok]
                    old_label = " ".join([tokens[no+xx]+"=O" for xx in range(ii) if no+xx < len(tokens)])
                    new_label = " ".join([tokens[no+xx]+"=T-"+new_pol for xx in range(ii) if no+xx < len(tokens)])
                    label = label.replace(old_label, new_label)
  
        data_lines += [text+"####"+label]

    return data_lines

In [None]:
txt_data = txt_maker(df)

with open("/content/drive/My Drive/Masterarbeit/Data/Final/SEMEVAL-14-REST/test.txt","w") as f:
    f.write('\n'.join(txt_data))

# Create RGATjson

In [None]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/50/ae/a70a58ce6b4e2daad538688806ee0f238dbe601954582a74ea57cde6c532/stanza-1.2-py3-none-any.whl (282kB)
[K     |████████████████████████████████| 286kB 4.0MB/s 
Installing collected packages: stanza
Successfully installed stanza-1.2


In [None]:
import stanza
from more_itertools import locate
import json

In [None]:
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 26.4MB/s]                    
2021-05-11 08:04:39 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [02:38<00:00, 2.60MB/s]
2021-05-11 08:07:28 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-05-11 08:07:28 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-05-11 08:07:28 INFO: Use device: cpu
2021-05-11 08:07:28 INFO: Loading: tokenize
2021-05-11 08:07:28 INFO: Loading: pos
2021-05-11 08:07:29 INFO: Loading: lemma
2021-05-11 08:07:29 INFO: Loading: depparse
2021-05-11 08:07:29 INFO: Done loading processors!


## in case of creating for the first time

In [None]:
def json_make_pos(df):
   
    manual_pos = {}

    for ii in df.index:
        
        new_dict = {}
        text = df.loc[ii,"text"]
        tokens = [token.text for sentence in nlp(text).sentences for token in sentence.tokens]
        
        new_dict["aspects"] = []
        for xx in range(1,aspect_number):         
            if df.loc[ii,"aspect_term_"+str(xx)] != None:
                asp_dict = {}   
                term = df.loc[ii,"aspect_term_"+str(xx)]

                # construct aspect position on token level
                asp_toks = [token.text for sentence in nlp(term).sentences for token in sentence.tokens]
                asp_ind = [list(locate(tokens, lambda a: a == tok)) for tok in asp_toks]

                # for aspects appearing only once in text, take the correct position
                # otherwise set to None
                if len(asp_ind[0]) == 1:
                    from_index = asp_ind[0][0]
                else:
                    from_index = None
                if len(asp_ind[-1]) == 1:
                    to_index = asp_ind[-1][0] 
                else: 
                    to_index = None

                # if both start and end pos are unknown, 
                # e.g. for single-word aspects, 
                # take character positions for help
                if from_index == None and to_index == None and len(asp_ind[0]) != 0:
                    print(ii, ": ", text)
                    print("original term: ",term)
                    all_char_from = [i for i in range(len(text)) if text.startswith(asp_toks[0], i)]
                    print("all start chars: ",all_char_from)
                    corr_char_from = int(df.loc[ii, "aspect_from_"+str(xx)])
                    print("correct start char: ",corr_char_from)
                    print("text beginning at correct start char: ", text[corr_char_from:])
                    print("tokens: ",tokens)
                    print("original asp tokens: ", asp_toks)
                    print("original asp indices: ",asp_ind)
                    if corr_char_from == max(all_char_from):
                        from_index = asp_ind[0][-1]
                    elif corr_char_from == min(all_char_from):
                        from_index = asp_ind[0][0]

                # in case of missing start/end positions,
                # try to find "to"/"from" using aspect token number as distance
                if from_index == None and to_index != None:
                    from_index = to_index - len(asp_toks) +1
                if to_index == None and from_index != None:
                    to_index = from_index + len(asp_toks) -1

                # correct tokenization errors in aspect term tokenization
                if from_index == None or to_index == None or asp_toks != tokens[from_index:to_index+1]:

                    print("Tokenization Error in line ",ii,"!")
                    for pos, tok in enumerate(tokens):
                        print(pos, tok)
                    print("aspect term: ", term)
                    print("original asp tokens: ", asp_toks)

                    from_index = int(input("start position?"))
                    to_index = int(input("end position?"))

                    # add manually stated positions to dict for reproducibility
                    manual_pos[text] = {}
                    manual_pos[text][term] = {}
                    manual_pos[text][term]["from"] = from_index
                    manual_pos[text][term]["to"] = to_index
                    print(manual_pos)

                    print("Final aspect tokens: ", tokens[from_index:to_index+1])

                asp_dict["from"] = from_index
                asp_dict["to"] = to_index + 1

    with open("/content/drive/My Drive/Masterarbeit/Data/preprocessing/aspect_positions/semeval_rest_test.json","w") as f:
        json.dump(manual_pos, f)

json_make_pos(df)

Tokenization Error in line  39 !
0 Had
1 a
2 great
3 experience
4 at
5 Trio
6 ...
7 staff
8 was
9 pleasant
10 ;
11 food
12 was
13 tasty
14 and
15 large
16 in
17 portion
18 size
19 -
20 I
21 would
22 highly
23 recommend
24 the
25 portobello
26 /
27 gorgonzola
28 /
29 sausage
30 appetizer
31 and
32 the
33 lobster
34 risotto
35 .
aspect term:  portobello/gorgonzola/sausage appetizer
original asp tokens:  ['portobello/gorgonzola', '/', 'sausage', 'appetizer']
start position?25
end position?30
{'Had a great experience at Trio ... staff was pleasant; food was tasty and large in portion size - I would highly recommend the portobello/gorgonzola/sausage appetizer and the lobster risotto.': {'portobello/gorgonzola/sausage appetizer': {'from': 25, 'to': 30}}}
Final aspect tokens:  ['portobello', '/', 'gorgonzola', '/', 'sausage', 'appetizer']
109 :  We been there and we really enjoy the food, was areally great food, and the service was really good.
original term:  food
all start chars:  [38, 62]


## in case of reproducing the dataset

In [None]:
def json_maker(df, manual_pos):
   
    new_data = []

    for ii in df.index:
        
        new_dict = {}
        text = df.loc[ii,"text"]

        tokens = [token.text for sentence in nlp(text).sentences for token in sentence.tokens]
        new_dict["token"] = tokens

        new_dict["pos"] = [word.xpos for sentence in nlp(text).sentences for word in sentence.words]
        new_dict["head"] = [str(word.head) for sentence in nlp(text).sentences for word in sentence.words]
        new_dict["deprel"] = [word.deprel for sentence in nlp(text).sentences for word in sentence.words]
        
        new_dict["aspects"] = []
        for xx in range(1,aspect_number):         
            if df.loc[ii,"aspect_term_"+str(xx)] != None:
                asp_dict = {}   
                term = df.loc[ii,"aspect_term_"+str(xx)]
                asp_dict["term"] = term
                asp_dict["polarity"] = df.loc[ii,"aspect_polarity_"+str(xx)]

                # construct aspect position on token level
                asp_toks = [token.text for sentence in nlp(term).sentences for token in sentence.tokens]
                asp_ind = [list(locate(tokens, lambda a: a == term)) for term in asp_toks]

                # for aspects appearing only once in text, take the correct position
                # otherwise set to None
                if len(asp_ind[0]) == 1:
                    from_index = asp_ind[0][0]
                else:
                    from_index = None
                if len(asp_ind[-1]) == 1:
                    to_index = asp_ind[-1][0] 
                else: 
                    to_index = None

                # if both start and end pos are unknown, 
                # e.g. for single-word aspects, 
                # take character positions for help
                if from_index == None and to_index == None and len(asp_ind[0]) != 0:

                    all_char_from = [i for i in range(len(text)) if text.startswith(asp_toks[0], i)]
                    corr_char_from = int(df.loc[ii, "aspect_from_"+str(xx)])

                    if corr_char_from == max(all_char_from):
                        from_index = asp_ind[0][-1]
                    elif corr_char_from == min(all_char_from):
                        from_index = asp_ind[0][0]

                # in case of missing start/end positions,
                # try to find "to"/"from" using aspect token number as distance
                if from_index == None and to_index != None:
                    from_index = to_index - len(asp_toks) +1
                if to_index == None and from_index != None:
                    to_index = from_index + len(asp_toks) -1

                # correct tokenization errors in aspect term tokenization
                if from_index == None or to_index == None or asp_toks != tokens[from_index:to_index+1]:
                    if text in manual_pos.keys() and term in manual_pos[text].keys():
                        from_index = manual_pos[text][term]["from"]
                        to_index = manual_pos[text][term]["to"]

                asp_dict["from"] = from_index
                asp_dict["to"] = to_index + 1

                new_dict["aspects"] += [asp_dict]
        
        new_data += [new_dict]

    return new_data

In [None]:
pos_obj = open("/content/drive/My Drive/Masterarbeit/Data/preprocessing/aspect_positions/semeval_rest_test.json")
loaded_pos = json.load(pos_obj)

json_data = json_maker(df, loaded_pos)

with open("/content/drive/My Drive/Masterarbeit/Data/Final/SEMEVAL-14-REST/test.json","w") as f:
    json.dump(json_data, f)

# Create LCF-ATEPCdat

In [None]:
def pol_to_no_shifted(sentiment):
  
    if sentiment == "positive":
        pol = 2
    elif sentiment == "negative":
        pol = 0
    elif sentiment == "neutral":
        pol = 1

    return str(pol)

In [None]:
def dat_maker(df):
    
    data_lines = []

    for line in df.index:

        text = df.loc[line,"text"]
        tokens = nltk.word_tokenize(text)

        # correct tokens
        for no,tok in enumerate(tokens):
            if tok[-1:] == "-" and len(tok)>2:
                tokens[no] = tok[:-1]
            if tok[:1] == "'" and len(tok)>3:
                tokens[no] = tok[1:]
            if tok in ["'","(",")"]:
                tokens.remove(tok)

        # create aspect-polarity dict
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = pol_to_no_shifted(df.loc[line,pol_cols[col]])
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())

        label = ""
        # check for one-word-aspects
        for tok in tokens:
            if tok in asp_sent_dict.keys():
                label += tok + " B-ASP -1\n"
            else:
                label += tok + " O -1\n"

        # check for multi-word-aspects
        for ii in range(2,max_asp_len+1):
            for no,tok in enumerate(tokens):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok not in tokens and new_tok in asp_sent_dict.keys():
                    label = label.replace(tokens[no]+" O -1",tokens[no]+" B-ASP -1")
                    for xx in range(1,ii):
                        label = label.replace(tokens[no+xx]+" O -1",tokens[no+xx]+" I-ASP -1")

        # create duplicates of review in case of more than one aspect
        for key, val in asp_sent_dict.items():
            if key in tokens:
                new_label = label.replace(key+" B-ASP -1", key+" B-ASP "+val)
                data_lines += [new_label]
                data_lines += ["\n"]
            else:
                for ii in range(2,max_asp_len+1):
                    for no,tok in enumerate(tokens):
                        new_tok = " ".join(tokens[no:no+ii])
                        if new_tok == key:
                            new_label = label.replace(tokens[no]+" B-ASP -1",tokens[no]+" B-ASP "+val)
                            for xx in range(1,ii):
                                new_label = new_label.replace(tokens[no+xx]+" I-ASP -1",tokens[no+xx]+" I-ASP "+val)

                            data_lines += [new_label]
                            data_lines += ["\n"]

    return data_lines

In [None]:
dat_data = dat_maker(df)

with open("/content/drive/My Drive/Masterarbeit/Data/Final/SEMEVAL-14-REST/test.dat","w") as f:
    f.write(''.join(dat_data))

# Create GRACEtxt

In [None]:
asp_cols = ["aspect_term_"+str(ii) for ii in range(1,aspect_number)]

In [None]:
import nltk

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('conll2000')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Create chunks and pos tags. Source: https://towardsdatascience.com/chunking-in-nlp-decoded-b4a71b2b4e24


In [None]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.corpus import conll2000

In [None]:
def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
    
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

class NGramTagChunker(ChunkParserI):

    def __init__(self,train_sentences,tagger_classes=[UnigramTagger,BigramTagger]):
        train_sent_tags=conll_tag_chunks(train_sentences)
        self.chunk_tagger=combined_tagger(train_sent_tags,tagger_classes)
    
    def parse(self,tagged_sentence):
        if not tagged_sentence:
            return None
        pos_tags=[tag for word, tag in tagged_sentence]
        chunk_pos_tags=self.chunk_tagger.tag(pos_tags)
        chunk_tags=[chunk_tag for (pos_tag,chunk_tag) in chunk_pos_tags]
        wpc_tags=[(word,pos_tag,chunk_tag) for ((word,pos_tag),chunk_tag) in zip(tagged_sentence,chunk_tags)]
        return conlltags2tree(wpc_tags)

In [None]:
data = conll2000.chunked_sents()
ntc = NGramTagChunker(data)

Convert tags into BIOES scheme. Source: https://gist.github.com/allanj/5ad206f7f4645c0269b68fb2065712f4

In [None]:
def iob_iobes(tags):
    """
    IOB2 (BIO) -> IOBES
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'B':
            if i + 1 != len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.split('-')[0] == 'I':
            if i + 1 < len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise Exception('Invalid IOB format!')
    return new_tags

In [None]:
def grace_txt_maker(df):
    
    data_lines = []

    for line in df.index:

        text = df.text[line]
        tokens = nltk.word_tokenize(text)

        # correct tokens
        for no,tok in enumerate(tokens):
            if tok[-1:] == "-" and len(tok)>2:
                tokens[no] = tok[:-1]
            if tok[:1] == "'" and len(tok)>3:
                tokens[no] = tok[1:]
            if tok in ["'","(",")"]:
                tokens.remove(tok)

        # create pos tags
        pos_tags = nltk.pos_tag(tokens)

        # create chunk/phrase tags
        full_tags = tree2conlltags(ntc.parse(pos_tags))
        chunks_list = [full_tags[ii][2] for ii in range(len(full_tags))]
        new_chunks = iob_iobes(chunks_list)

        # create aspect-polarity dict
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = df.loc[line,pol_cols[col]].upper()
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())

        label = ""
        # check for one-word-aspects
        for pos,tok in enumerate(tokens):
            label += tok + " " + pos_tags[pos][1] + " " + new_chunks[pos]
            if tok in asp_sent_dict.keys():
                label +=  " B_AP " + asp_sent_dict[tok] + " B_AP+" + asp_sent_dict[tok] + "\n"
            else:
                label += " O O O \n"

        # check for multi-word-aspects
        for ii in range(2,max_asp_len+1):
            for no,tok in enumerate(tokens):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok not in tokens and new_tok in asp_sent_dict.keys():
                    new_pol = asp_sent_dict[new_tok]
                    label = label.replace(tokens[no]+ " " + pos_tags[no][1] + " " + new_chunks[no] + " O O O \n",
                                          tokens[no]+ " " + pos_tags[no][1] + " " + new_chunks[no] + " B_AP " + \
                                          new_pol + " B_AP+" + new_pol + "\n")
                    for xx in range(1,ii):
                        label = label.replace(tokens[no+xx] + " " + pos_tags[no+xx][1] + " " + new_chunks[no+xx] + " O O O \n",
                                          tokens[no+xx]+ " " + pos_tags[no+xx][1] + " " + new_chunks[no+xx] + " I_AP " + \
                                          new_pol + " I_AP+" + new_pol + "\n")

        data_lines += [label]

    return data_lines

In [None]:
grace_txt_data = grace_txt_maker(df)

with open("/content/drive/My Drive/Masterarbeit/Data/Final/SEMEVAL-14-REST/grace_test.txt","w") as f:
    f.write('\n'.join(grace_txt_data))