In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Data Import

In [2]:
import pandas as pd
import xml.etree.ElementTree as ET

In [3]:
parsedXML = ET.parse("/content/drive/My Drive/Masterarbeit/Data/Original/MAMS/mams_val.xml")

In [4]:
# Exclusive upper bound for the aspect-slot loops in this notebook:
# range(1, aspect_number) yields slots 1..10, so at most 10 aspect terms
# are stored per sentence.
aspect_number = 11

In [5]:
# Wide table layout: id, text, then one (term, polarity, from, to) column
# group per aspect slot 1 .. aspect_number-1.
dfcols = ['id', 'text']
for ii in range(1, aspect_number):
    dfcols.extend([
        "aspect_term_{}".format(ii),
        "aspect_polarity_{}".format(ii),
        "aspect_from_{}".format(ii),
        "aspect_to_{}".format(ii),
    ])

# Collect plain Python rows and build the frame once. DataFrame.append copied
# the whole frame on every call and no longer exists in pandas >= 2.0.
rows = []
for sentence in parsedXML.getroot():
    line = [sentence.attrib.get('id'), sentence.find('text').text]

    for asp in sentence.iter('aspectTerm'):
        line += [
            asp.attrib.get("term"),
            asp.attrib.get("polarity"),
            asp.attrib.get("from"),
            asp.attrib.get("to"),
        ]

    if len(line) > len(dfcols):
        # More aspects than slots: fail loudly instead of silently truncating.
        raise ValueError(
            "sentence has more than {} aspect terms; increase aspect_number".format(aspect_number - 1)
        )

    # Pad unused aspect slots with None so every row has the same width.
    line += [None] * (len(dfcols) - len(line))
    rows.append(line)

# dtype=object keeps the padding as None (later cells test `!= None`).
df = pd.DataFrame(rows, columns=dfcols, dtype=object)

In [6]:
df

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10
0,,"After a couple of drinks, the apps--I like the...",drinks,neutral,18,24,roll,positive,68,72,cripsy squid,positive,81,93,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,The basil pepper mojito was a little daunting ...,basil pepper mojito,negative,4,23,flavor,positive,85,91,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,Had to constantly ask the waiter to top up wat...,waiter,negative,26,32,water glasses,neutral,43,56,service,positive,72,79,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,The portions were so small that we still wante...,portions,negative,4,12,dinner,neutral,61,67,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,"The staff is very kind and well trained, they'...",staff,positive,4,9,bar,neutral,97,100,drinks,neutral,109,115,menu,neutral,156,160,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,,Spinache rolls have lots of garlic.,rolls,neutral,9,14,garlic,positive,28,34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
496,,"I use to always get the turkey club, until rec...",turkey club,neutral,24,35,bread,positive,77,82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
497,,"The decor is worth a mention, with plush seati...",decor,positive,4,9,bar,neutral,71,74,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
498,,The Food The menu is better suited to the snac...,Food,positive,4,8,snacking,positive,42,50,bar,neutral,63,66,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Drop duplicates

In [7]:
# Rows count as duplicates when they agree on every column except "id".
cols_wo_id = df.columns.drop("id")
duplicate_mask = df.duplicated(subset=cols_wo_id, keep=False)
df[duplicate_mask].sort_values(by="text")

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10


In [8]:
# Drop rows that repeat an earlier row in every column except "id". This
# modifies df in place; ignore_index=True renumbers the index 0..n-1, which
# the later `for ... in range(len(df))` loops with df.loc rely on.
df.drop_duplicates(cols_wo_id, ignore_index=True,inplace=True)

In [9]:
text_counts = df.text.value_counts()
text_counts[:10]

The waitress at the bar was very nasty to me because she mistakenly took an order for thai ice tea from me when I asked for thai lemonade in a to-go cup.                               1
I wouldn't go for dinner though cause it gets too crowded and the takeout at night is very slow.                                                                                        1
Marisol at the front desk is serviceable; our sommelier, though not French, knew his Bordeaux, and our server was delightful.                                                           1
Too bad the food and service aren't nearly as nice as the decor.                                                                                                                        1
The wait staff didn't bother to refill my water until I had finish mine and was almost done with my husband's glass too.                                                                1
This pizza shop is one o fhte best places ever and is a hiden gem in a

# First Descriptive Analysis


## Aspects per sentence

In [10]:
# Names of the polarity column of every aspect slot, in slot order.
pol_cols = ["aspect_polarity_{}".format(ii) for ii in range(1, aspect_number)]

In [11]:
# A sentence has exactly k aspects when slot k is filled and slot k+1 is not,
# so the count is the difference between the filled cells of adjacent slots.
last_pol_col = "aspect_polarity_" + str(aspect_number - 1)
prev = 0
total_asp = 0
for no, col in enumerate(pol_cols):
    filled = sum(df[col].value_counts())

    if col != "aspect_polarity_1":
        exactly_no = prev - filled
        print("sentences with exactly ", no, "aspects:", exactly_no)
        total_asp += no * exactly_no

    if col == last_pol_col:
        # The last slot has no successor: every filled cell is a full sentence.
        print("sentences with exactly ", no+1, "aspects:", filled)
        total_asp += (no+1) * filled

    prev = filled

print("total no of aspects: ", total_asp)

sentences with exactly  1 aspects: 0
sentences with exactly  2 aspects: 285
sentences with exactly  3 aspects: 136
sentences with exactly  4 aspects: 55
sentences with exactly  5 aspects: 16
sentences with exactly  6 aspects: 5
sentences with exactly  7 aspects: 2
sentences with exactly  8 aspects: 0
sentences with exactly  9 aspects: 0
sentences with exactly  10 aspects: 1
total no of aspects:  1332


## Sentiment Frequency

In [12]:
# Keep only the polarity columns, count each label per aspect slot
# (one value_counts per column), then add the slots up per label.
df_pol = df.loc[:,pol_cols]
df_pol_counts = df_pol.apply(pd.Series.value_counts)
df_pol_counts.sum(axis=1)

negative    325.0
neutral     604.0
positive    403.0
dtype: float64

In [13]:
sum(df_pol_counts.sum(axis=1))

1332.0

## Sentences with more than one distinct sentiment

In [14]:
# Count the rows whose filled polarity cells hold more than one distinct label.
multi_counter = sum(
    len({df.loc[line, col] for col in pol_cols if df.loc[line, col] != None}) > 1
    for line in df.index
)
multi_counter

500

# Remove "conflict"

In [15]:
# Blank out every aspect slot whose polarity is "conflict".
for line in df.index:
    for col in pol_cols:
        if df.loc[line,col] == "conflict":
            # Take the whole slot number after the last underscore. Slicing
            # only the last character turned "aspect_polarity_10" into slot
            # "0", which created new *_0 columns and left slot 10 half-cleared.
            number = col.rsplit("_", 1)[-1]
            for prefix in ("aspect_polarity_", "aspect_term_", "aspect_to_", "aspect_from_"):
                df.loc[line, prefix + number] = None

In [16]:
df

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10
0,,"After a couple of drinks, the apps--I like the...",drinks,neutral,18,24,roll,positive,68,72,cripsy squid,positive,81,93,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,The basil pepper mojito was a little daunting ...,basil pepper mojito,negative,4,23,flavor,positive,85,91,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,Had to constantly ask the waiter to top up wat...,waiter,negative,26,32,water glasses,neutral,43,56,service,positive,72,79,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,The portions were so small that we still wante...,portions,negative,4,12,dinner,neutral,61,67,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,"The staff is very kind and well trained, they'...",staff,positive,4,9,bar,neutral,97,100,drinks,neutral,109,115,menu,neutral,156,160,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,,Spinache rolls have lots of garlic.,rolls,neutral,9,14,garlic,positive,28,34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
496,,"I use to always get the turkey club, until rec...",turkey club,neutral,24,35,bread,positive,77,82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
497,,"The decor is worth a mention, with plush seati...",decor,positive,4,9,bar,neutral,71,74,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
498,,The Food The menu is better suited to the snac...,Food,positive,4,8,snacking,positive,42,50,bar,neutral,63,66,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Check for wrong positions

In [17]:
# Verify that text[from:to] reproduces each annotated aspect term; collect
# the row numbers where it does not.
mistakes = []
for ii in range(len(df)):
    sentence_text = df.loc[ii, "text"]
    for xx in range(1, aspect_number):
        slot = str(xx)
        actual_term = df.loc[ii, "aspect_term_" + slot]
        if actual_term is None:
            continue
        start = int(df.loc[ii, "aspect_from_" + slot])
        end = int(df.loc[ii, "aspect_to_" + slot])
        pos_term = sentence_text[start:end]
        if actual_term != pos_term:
            mistakes.append(ii)
            print(actual_term, pos_term)
mistakes

[]

# Check for wrong aspect terms

In [18]:
import nltk

In [19]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
# Report aspect terms containing a word that is not a token of the sentence
# (a token with a trailing hyphen is accepted as well).
for ii in range(len(df)):
    tokens = nltk.word_tokenize(df.loc[ii, "text"])
    for xx in range(1, aspect_number):
        actual_term = df.loc[ii, "aspect_term_{}".format(xx)]
        if actual_term is None:
            continue
        for asp_part in nltk.word_tokenize(actual_term):
            found = asp_part in tokens or asp_part + "-" in tokens
            if not found:
                print(ii,"-",xx,":",tokens)
                print(actual_term, asp_part)

# Final Descriptive Analysis


## Aspects per sentence

In [21]:
# Names of the polarity column of every aspect slot, in slot order.
pol_cols = ["aspect_polarity_{}".format(ii) for ii in range(1, aspect_number)]

In [22]:
# A sentence has exactly k aspects when slot k is filled and slot k+1 is not,
# so the count is the difference between the filled cells of adjacent slots.
last_pol_col = "aspect_polarity_" + str(aspect_number - 1)
prev = 0
total_asp = 0
for no, col in enumerate(pol_cols):
    filled = sum(df[col].value_counts())

    if col != "aspect_polarity_1":
        exactly_no = prev - filled
        print("sentences with exactly ", no, "aspects:", exactly_no)
        total_asp += no * exactly_no

    if col == last_pol_col:
        # The last slot has no successor: every filled cell is a full sentence.
        print("sentences with exactly ", no+1, "aspects:", filled)
        total_asp += (no+1) * filled

    prev = filled

print("total no of aspects: ", total_asp)

sentences with exactly  1 aspects: 0
sentences with exactly  2 aspects: 285
sentences with exactly  3 aspects: 136
sentences with exactly  4 aspects: 55
sentences with exactly  5 aspects: 16
sentences with exactly  6 aspects: 5
sentences with exactly  7 aspects: 2
sentences with exactly  8 aspects: 0
sentences with exactly  9 aspects: 0
sentences with exactly  10 aspects: 1
total no of aspects:  1332


## Sentiment Frequency

In [23]:
# Keep only the polarity columns, count each label per aspect slot
# (one value_counts per column), then add the slots up per label.
df_pol = df.loc[:,pol_cols]
df_pol_counts = df_pol.apply(pd.Series.value_counts)
df_pol_counts.sum(axis=1)

negative    325.0
neutral     604.0
positive    403.0
dtype: float64

In [24]:
sum(df_pol_counts.sum(axis=1))

1332.0

## Sentences with more than one distinct sentiment

In [25]:
# Count the rows whose filled polarity cells hold more than one distinct label.
multi_counter = sum(
    len({df.loc[line, col] for col in pol_cols if df.loc[line, col] != None}) > 1
    for line in df.index
)
multi_counter

500

# Save as xml

In [None]:
def xml_maker(df,aspect_number):
    """Serialise the wide aspect table into a ``<sentences>`` XML document.

    Each row becomes a ``<sentence id=...>`` element holding a ``<text>`` child
    and an ``<aspectTerms>`` child; every aspect slot 1 .. aspect_number-1 whose
    term is not None adds one ``<aspectTerm from polarity term to>`` element.
    All values are written through ``str()``.

    Returns:
        The document as bytes, as produced by ``ET.tostring``.
    """
    root = ET.Element('sentences')
    # (XML attribute, column prefix) pairs, in the order they are set.
    attr_sources = (
        ("from", "aspect_from_"),
        ("polarity", "aspect_polarity_"),
        ("term", "aspect_term_"),
        ("to", "aspect_to_"),
    )

    for row_label in df.index:
        sentence_el = ET.SubElement(root, "sentence")
        sentence_el.set("id", str(df.loc[row_label, "id"]))

        ET.SubElement(sentence_el, "text").text = str(df.loc[row_label, "text"])

        terms_el = ET.SubElement(sentence_el, "aspectTerms")
        for slot in map(str, range(1, aspect_number)):
            if df.loc[row_label, "aspect_term_" + slot] is None:
                continue
            term_el = ET.SubElement(terms_el, "aspectTerm")
            for attr_name, col_prefix in attr_sources:
                term_el.set(attr_name, str(df.loc[row_label, col_prefix + slot]))

    return ET.tostring(root)

In [None]:
# xml_maker returns the output of ET.tostring (bytes), so the data is decoded
# to str before being written through the text-mode file handle.
xml_data = xml_maker(df,aspect_number)

with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/val.xml","w") as f:
    f.write(xml_data.decode('utf-8'))

# Create xml.seg

In [None]:
def pol_to_no(sentiment):
    """Map a polarity label to its numeric class, returned as a string.

    "positive" -> "1", "negative" -> "-1", "neutral" -> "0".

    Raises:
        ValueError: for any other label. Previously such a label fell through
            every branch and crashed with an UnboundLocalError on ``pol``.
    """
    classes = {"positive": 1, "negative": -1, "neutral": 0}
    try:
        return str(classes[sentiment])
    except (KeyError, TypeError):
        raise ValueError("unknown sentiment label: {!r}".format(sentiment))

In [None]:
def xml_seg_maker(df):
    """Build the lines of the ``.xml.seg`` file: three lines per aspect.

    For every filled aspect slot of every row the result gains
    ``[masked_text, aspect_term, polarity_number]``, where ``masked_text`` is the
    sentence with the aspect replaced by ``$T$`` and ``polarity_number`` comes
    from ``pol_to_no``.

    The frame is read positionally after dropping "id": column 0 is the text,
    followed by groups of four columns (term, polarity, from, to).

    The aspect is masked through its stored character offsets, so only the
    annotated occurrence is replaced. A plain ``str.replace`` would also mask
    repeated mentions and matches inside longer words. If the offsets are
    missing or do not reproduce the term, the old replace-all behaviour is used.

    Returns:
        Flat list of strings (text, aspect, polarity, text, aspect, ...).
    """
    df_wo_id = df.drop(columns="id").reset_index(drop=True)
    n_cols = len(df_wo_id.columns)
    data_lines = []

    for row in df_wo_id.itertuples(index=False, name=None):
        o_text = row[0]

        for first in range(1, n_cols, 4):
            term = row[first]
            polarity = row[first + 1] if first + 1 < n_cols else None
            if term is None or polarity is None:
                continue

            # Character offsets are stored as strings; tolerate missing values.
            start = end = None
            if first + 3 < n_cols:
                try:
                    start, end = int(row[first + 2]), int(row[first + 3])
                except (TypeError, ValueError):
                    start = end = None

            if start is not None and o_text[start:end] == term:
                text = o_text[:start] + '$T$' + o_text[end:]
            else:
                text = o_text.replace(term, '$T$')

            data_lines += [text, term, pol_to_no(polarity)]

    return data_lines

In [None]:
xml_seg_data = xml_seg_maker(df)

# Pin the encoding: review text may contain non-ASCII characters, and the
# default encoding of open() depends on the platform.
with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/val.xml.seg","w", encoding="utf-8") as f:
    f.write('\n'.join(xml_seg_data))

# Create BERT+txt

In [None]:
# Names of the term column of every aspect slot, in slot order.
asp_cols = ["aspect_term_{}".format(ii) for ii in range(1, aspect_number)]

In [None]:
def sent_conv(sentiment):
    """Translate a polarity label to its tag: POS, NEG or NEU.

    Any other label yields None.
    """
    tags = {"positive": "POS", "negative": "NEG", "neutral": "NEU"}
    return tags.get(sentiment)

In [None]:
def txt_maker(df):
    """Build one ``text####token=label ...`` line per row of ``df``.

    The sentence is tokenised with nltk.word_tokenize, a few tokens are
    cleaned up, and every token is labelled ``=O`` or ``=T-<tag>`` where the
    tag comes from sent_conv for the aspect's polarity. Reads the module-level
    lists ``asp_cols`` and ``pol_cols``.

    Returns:
        List of strings, one per row: original text, "####", the label string.
    """

    data_lines = []

    for line in df.index:

        text = df.loc[line,"text"]
        tokens = nltk.word_tokenize(text)

        # correct tokens: strip a trailing "-" (len > 2) or a leading "'"
        # (len > 3) and remove bare quote/parenthesis tokens.
        # NOTE(review): `tokens` is modified while it is being enumerated, so
        # the token following a removed one is skipped by this loop, and
        # remove() deletes the first equal element, not necessarily this one.
        # Both tests use the original `tok`, so when both apply the second
        # assignment wins and the trailing "-" is kept. Confirm this is intended.
        for no,tok in enumerate(tokens):
            if tok[-1:] == "-" and len(tok)>2:
                tokens[no] = tok[:-1]
            if tok[:1] == "'" and len(tok)>3:
                tokens[no] = tok[1:]
            if tok in ["'","(",")"]:
                tokens.remove(tok)

        # create aspect-polarity dict (aspect term -> tag) and remember the
        # longest aspect length in whitespace-separated words
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = sent_conv(df.loc[line,pol_cols[col]])
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())


        label = ""
        # check for one-word-aspects: a token equal to an aspect term gets
        # its tag, everything else gets "=O"
        for tok in tokens:
            if tok in asp_sent_dict.keys():
                label += tok + "=T-" + asp_sent_dict[tok] + " "
            else:
                label += tok + "=O "
        label = label[:-1]  # drop the trailing space

        # check for multi-word-aspects: for every window of 2..max_asp_len
        # tokens that spells an aspect term, rewrite its "=O" labels.
        # NOTE(review): str.replace acts on the whole label string, so every
        # textual occurrence of old_label is rewritten, not only this window.
        for ii in range(2,max_asp_len+1):
            for no,tok in enumerate(tokens):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok in asp_sent_dict.keys():
                    new_pol = asp_sent_dict[new_tok]
                    old_label = " ".join([tokens[no+xx]+"=O" for xx in range(ii) if no+xx < len(tokens)])
                    new_label = " ".join([tokens[no+xx]+"=T-"+new_pol for xx in range(ii) if no+xx < len(tokens)])
                    label = label.replace(old_label, new_label)
  
        data_lines += [text+"####"+label]

    return data_lines

In [None]:
txt_data = txt_maker(df)

# Pin the encoding: review text may contain non-ASCII characters, and the
# default encoding of open() depends on the platform.
with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/val.txt","w", encoding="utf-8") as f:
    f.write('\n'.join(txt_data))

# Create RGATjson

In [None]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/50/ae/a70a58ce6b4e2daad538688806ee0f238dbe601954582a74ea57cde6c532/stanza-1.2-py3-none-any.whl (282kB)
[K     |█▏                              | 10kB 12.0MB/s eta 0:00:01[K     |██▎                             | 20kB 18.3MB/s eta 0:00:01[K     |███▌                            | 30kB 9.9MB/s eta 0:00:01[K     |████▋                           | 40kB 7.9MB/s eta 0:00:01[K     |█████▉                          | 51kB 4.8MB/s eta 0:00:01[K     |███████                         | 61kB 5.2MB/s eta 0:00:01[K     |████████▏                       | 71kB 5.6MB/s eta 0:00:01[K     |█████████▎                      | 81kB 6.1MB/s eta 0:00:01[K     |██████████▌                     | 92kB 5.7MB/s eta 0:00:01[K     |███████████▋                    | 102kB 4.9MB/s eta 0:00:01[K     |████████████▉                   | 112kB 4.9MB/s eta 0:00:01[K     |██████████████                  | 122kB 4.9MB/s eta 0:00:

In [None]:
import stanza

In [None]:
# Download the default English models and build the pipeline that the JSON
# functions below call as `nlp` (tokenisation, POS tags, lemmas, dependency parse).
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 34.2MB/s]                    
2021-05-10 11:08:35 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [01:18<00:00, 5.26MB/s]
2021-05-10 11:10:01 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-05-10 11:10:01 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-05-10 11:10:01 INFO: Use device: cpu
2021-05-10 11:10:01 INFO: Loading: tokenize
2021-05-10 11:10:01 INFO: Loading: pos
2021-05-10 11:10:02 INFO: Loading: lemma
2021-05-10 11:10:02 INFO: Loading: depparse
2021-05-10 11:10:02 INFO: Done loading processors!


In [None]:
from more_itertools import locate

In [None]:
import json

## First run: collect the manual aspect positions interactively

In [None]:
def json_make_pos(df):
    """Interactively record token positions for aspects that cannot be located automatically.

    For every filled aspect slot of every row, the sentence and the aspect term
    are tokenised with the module-level stanza pipeline ``nlp`` and the index of
    the term's first and last token inside the sentence is derived. Whenever
    that fails, or the located token span differs from the term's own tokens,
    the sentence tokens are printed and the start and end token index are
    requested through ``input()``.

    All manually entered positions are written as JSON, keyed by sentence text
    and then by aspect term, to ``aspect_positions/mams_val.json``.

    Args:
        df: frame with a "text" column and the aspect_term_<n> /
            aspect_from_<n> columns for n in 1 .. aspect_number-1.
    """
    manual_pos = {}

    for ii in df.index:

        text = df.loc[ii,"text"]
        tokens = [token.text for sentence in nlp(text).sentences for token in sentence.tokens]

        for xx in range(1,aspect_number):
            term = df.loc[ii,"aspect_term_"+str(xx)]
            if term is None:
                continue

            # construct aspect position on token level: for each aspect token,
            # all sentence positions holding an equal token
            asp_toks = [token.text for sentence in nlp(term).sentences for token in sentence.tokens]
            asp_ind = [list(locate(tokens, lambda a: a == tok)) for tok in asp_toks]

            # for aspects appearing only once in text, take the correct position
            # otherwise set to None
            from_index = asp_ind[0][0] if len(asp_ind[0]) == 1 else None
            to_index = asp_ind[-1][0] if len(asp_ind[-1]) == 1 else None

            # if both start and end pos are unknown,
            # e.g. for single-word aspects,
            # take character positions for help
            if from_index is None and to_index is None and len(asp_ind[0]) != 0:
                print(ii, ": ", text)
                print("original term: ",term)
                all_char_from = [i for i in range(len(text)) if text.startswith(asp_toks[0], i)]
                print("all start chars: ",all_char_from)
                corr_char_from = int(df.loc[ii, "aspect_from_"+str(xx)])
                print("correct start char: ",corr_char_from)
                print("text beginning at correct start char: ", text[corr_char_from:])
                print("tokens: ",tokens)
                print("original asp tokens: ", asp_toks)
                print("original asp indices: ",asp_ind)
                # Only the first and the last occurrence can be resolved this way.
                if corr_char_from == max(all_char_from):
                    from_index = asp_ind[0][-1]
                elif corr_char_from == min(all_char_from):
                    from_index = asp_ind[0][0]

            # in case of missing start/end positions,
            # try to find "to"/"from" using aspect token number as distance
            if from_index is None and to_index is not None:
                from_index = to_index - len(asp_toks) +1
            if to_index is None and from_index is not None:
                to_index = from_index + len(asp_toks) -1

            # correct tokenization errors in aspect term tokenization
            if from_index is None or to_index is None or asp_toks != tokens[from_index:to_index+1]:

                print("Tokenization Error in line ",ii,"!")
                for pos, tok in enumerate(tokens):
                    print(pos, tok)
                print("aspect term: ", term)
                print("original asp tokens: ", asp_toks)

                from_index = int(input("start position?"))
                to_index = int(input("end position?"))

                # add manually stated positions to dict for reproducibility;
                # setdefault keeps positions already recorded for other
                # aspects of the same sentence instead of overwriting them
                manual_pos.setdefault(text, {})[term] = {"from": from_index, "to": to_index}
                print(manual_pos)

                print("Final aspect tokens: ", tokens[from_index:to_index+1])

    with open("/content/drive/My Drive/Masterarbeit/Data/preprocessing/aspect_positions/mams_val.json","w") as f:
        json.dump(manual_pos, f)

json_make_pos(df)

Tokenization Error in line  18 !
0 The
1 servers
2 ,
3 casual
4 in
5 their
6 striped
7 button
8 -downs
9 ,
10 anticipate
11 and
12 fulfill
13 needs
14 as
15 if
16 they
17 were
18 trained
19 as
20 mind
21 readers
22 .
aspect term:  striped button-downs
original asp tokens:  ['striped', 'button-downs']
start position?6
end position?8
{'The servers, casual in their striped button-downs, anticipate and fulfill needs as if they were trained as mind readers.': {'striped button-downs': {'from': 6, 'to': 8}}}
Final aspect tokens:  ['striped', 'button', '-downs']
Tokenization Error in line  141 !
0 Other
1 small
2 dishes
3 ,
4 such
5 as
6 the
7 fragrant
8 sesame
9 oil
10 -
11 scented
12 raw
13 spicy
14 tuna
15 ,
16 are
17 excellent
18 with
19 a
20 beer-sized
21 mug
22 of
23 cold
24 oolong
25 tea
26 .
aspect term:  fragrant sesame oil-scented raw spicy tuna
original asp tokens:  ['fragrant', 'sesame', 'oil', '-scented', 'raw', 'spicy', 'tuna']
start position?7
end position?14
{'The servers, casu

## Reproduction run: build the dataset from the saved manual positions

In [None]:
def json_maker(df, manual_pos):
    """Build the list of per-sentence dicts written to the JSON dataset file.

    Every row yields a dict with the keys "token", "pos", "head", "deprel"
    (taken from the module-level stanza pipeline ``nlp``) and "aspects", a list
    with one ``{"term", "polarity", "from", "to"}`` dict per filled aspect slot.
    "from" is the index of the aspect's first token and "to" is the index of
    its last token plus one.

    Args:
        df: frame with "text" and the aspect_term_<n> / aspect_polarity_<n> /
            aspect_from_<n> columns for n in 1 .. aspect_number-1.
        manual_pos: ``{text: {term: {"from": int, "to": int}}}`` with manually
            fixed token positions, used when the automatic lookup does not
            reproduce the aspect tokens.

    Raises:
        ValueError: if an aspect can neither be located automatically nor is
            present in ``manual_pos`` (previously an opaque TypeError from
            ``None + 1``).
    """
    new_data = []

    for ii in df.index:

        text = df.loc[ii,"text"]

        # Parse the sentence once and reuse the result for all four fields.
        doc = nlp(text)
        tokens = [token.text for sentence in doc.sentences for token in sentence.tokens]
        words = [word for sentence in doc.sentences for word in sentence.words]

        new_dict = {}
        new_dict["token"] = tokens
        new_dict["pos"] = [word.xpos for word in words]
        new_dict["head"] = [str(word.head) for word in words]
        new_dict["deprel"] = [word.deprel for word in words]

        new_dict["aspects"] = []
        for xx in range(1,aspect_number):
            term = df.loc[ii,"aspect_term_"+str(xx)]
            if term is None:
                continue

            asp_dict = {}
            asp_dict["term"] = term
            asp_dict["polarity"] = df.loc[ii,"aspect_polarity_"+str(xx)]

            # construct aspect position on token level: for each aspect token,
            # all sentence positions holding an equal token
            asp_toks = [token.text for sentence in nlp(term).sentences for token in sentence.tokens]
            asp_ind = [list(locate(tokens, lambda a: a == tok)) for tok in asp_toks]

            # for aspects appearing only once in text, take the correct position
            # otherwise set to None
            from_index = asp_ind[0][0] if len(asp_ind[0]) == 1 else None
            to_index = asp_ind[-1][0] if len(asp_ind[-1]) == 1 else None

            # if both start and end pos are unknown,
            # e.g. for single-word aspects,
            # take character positions for help
            if from_index is None and to_index is None and len(asp_ind[0]) != 0:

                all_char_from = [i for i in range(len(text)) if text.startswith(asp_toks[0], i)]
                corr_char_from = int(df.loc[ii, "aspect_from_"+str(xx)])

                # Only the first and the last occurrence can be resolved this way.
                if corr_char_from == max(all_char_from):
                    from_index = asp_ind[0][-1]
                elif corr_char_from == min(all_char_from):
                    from_index = asp_ind[0][0]

            # in case of missing start/end positions,
            # try to find "to"/"from" using aspect token number as distance
            if from_index is None and to_index is not None:
                from_index = to_index - len(asp_toks) +1
            if to_index is None and from_index is not None:
                to_index = from_index + len(asp_toks) -1

            # correct tokenization errors in aspect term tokenization
            if from_index is None or to_index is None or asp_toks != tokens[from_index:to_index+1]:
                if text in manual_pos.keys() and term in manual_pos[text].keys():
                    from_index = manual_pos[text][term]["from"]
                    to_index = manual_pos[text][term]["to"]

            if from_index is None or to_index is None:
                raise ValueError(
                    "no token position for aspect {!r} in row {}; add it to manual_pos".format(term, ii)
                )

            asp_dict["from"] = from_index
            asp_dict["to"] = to_index + 1

            new_dict["aspects"] += [asp_dict]

        new_data += [new_dict]

    return new_data

In [None]:
# Read the manually recorded positions; the context manager closes the file
# handle, which the bare open() call never did.
with open("/content/drive/My Drive/Masterarbeit/Data/preprocessing/aspect_positions/mams_val.json") as pos_obj:
    loaded_pos = json.load(pos_obj)

json_data = json_maker(df, loaded_pos)

with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/val.json","w") as f:
    json.dump(json_data, f)

# Create LCF-ATEPCdat

In [None]:
def pol_to_no_shifted(sentiment):
    """Map a polarity label to a non-negative class number, returned as a string.

    "negative" -> "0", "neutral" -> "1", "positive" -> "2".

    Raises:
        ValueError: for any other label. Previously such a label fell through
            every branch and crashed with an UnboundLocalError on ``pol``.
    """
    classes = {"positive": 2, "negative": 0, "neutral": 1}
    try:
        return str(classes[sentiment])
    except (KeyError, TypeError):
        raise ValueError("unknown sentiment label: {!r}".format(sentiment))

In [None]:
def dat_maker(df):
    """Build the chunks of the ``.dat`` file: one labelled copy of the sentence per aspect.

    Each token becomes a line ``<token> <O|B-ASP|I-ASP> <polarity>``. All
    aspects of the sentence are marked B-ASP/I-ASP in every copy, but only the
    aspect the copy belongs to gets its polarity number from
    pol_to_no_shifted; all other lines keep -1. Reads the module-level lists
    ``asp_cols`` and ``pol_cols``.

    Returns:
        List of strings alternating between a label block and "\\n".
    """
    
    data_lines = []

    for line in df.index:

        text = df.loc[line,"text"]
        tokens = nltk.word_tokenize(text)

        # correct tokens: strip a trailing "-" (len > 2) or a leading "'"
        # (len > 3) and remove bare quote/parenthesis tokens.
        # NOTE(review): `tokens` is modified while it is being enumerated, so
        # the token following a removed one is skipped by this loop, and
        # remove() deletes the first equal element. Confirm this is intended.
        for no,tok in enumerate(tokens):
            if tok[-1:] == "-" and len(tok)>2:
                tokens[no] = tok[:-1]
            if tok[:1] == "'" and len(tok)>3:
                tokens[no] = tok[1:]
            if tok in ["'","(",")"]:
                tokens.remove(tok)

        # create aspect-polarity dict (aspect term -> polarity number string)
        # and remember the longest aspect length in whitespace-separated words
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = pol_to_no_shifted(df.loc[line,pol_cols[col]])
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())

        label = ""
        # check for one-word-aspects; polarity stays -1 at this stage
        for tok in tokens:
            if tok in asp_sent_dict.keys():
                label += tok + " B-ASP -1\n"
            else:
                label += tok + " O -1\n"

        # check for multi-word-aspects: mark the first token of a matching
        # window B-ASP and the remaining ones I-ASP.
        # NOTE(review): str.replace acts on the whole label string, so equal
        # "<token> O -1" lines elsewhere in the sentence are rewritten too.
        for ii in range(2,max_asp_len+1):
            for no,tok in enumerate(tokens):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok not in tokens and new_tok in asp_sent_dict.keys():
                    label = label.replace(tokens[no]+" O -1",tokens[no]+" B-ASP -1")
                    for xx in range(1,ii):
                        label = label.replace(tokens[no+xx]+" O -1",tokens[no+xx]+" I-ASP -1")

        # create duplicates of review in case of more than one aspect: each
        # copy carries the polarity of exactly one aspect
        for key, val in asp_sent_dict.items():
            if key in tokens:
                # single-token aspect
                new_label = label.replace(key+" B-ASP -1", key+" B-ASP "+val)
                data_lines += [new_label]
                data_lines += ["\n"]
            else:
                # multi-token aspect: one copy is emitted for every window
                # (of any length 2..max_asp_len) that spells the aspect
                for ii in range(2,max_asp_len+1):
                    for no,tok in enumerate(tokens):
                        new_tok = " ".join(tokens[no:no+ii])
                        if new_tok == key:
                            new_label = label.replace(tokens[no]+" B-ASP -1",tokens[no]+" B-ASP "+val)
                            for xx in range(1,ii):
                                new_label = new_label.replace(tokens[no+xx]+" I-ASP -1",tokens[no+xx]+" I-ASP "+val)

                            data_lines += [new_label]
                            data_lines += ["\n"]

    return data_lines

In [None]:
dat_data = dat_maker(df)

# Pin the encoding: review text may contain non-ASCII characters, and the
# default encoding of open() depends on the platform.
with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/val.dat","w", encoding="utf-8") as f:
    f.write(''.join(dat_data))

# Create GRACEtxt

In [None]:
# Names of the term column of every aspect slot, in slot order.
asp_cols = ["aspect_term_{}".format(ii) for ii in range(1, aspect_number)]

In [None]:
import nltk

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('conll2000')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Create chunks and pos tags. Source: https://towardsdatascience.com/chunking-in-nlp-decoded-b4a71b2b4e24


In [None]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.corpus import conll2000

In [None]:
def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into per-sentence lists of (POS tag, chunk tag) pairs.

    Each tree is flattened with tree2conlltags and the word is dropped from
    every (word, tag, chunk) triple.
    """
    tag_chunk_sents = []
    for tree in chunk_sents:
        triples = tree2conlltags(tree)
        tag_chunk_sents.append([(pos_tag, chunk_tag) for _word, pos_tag, chunk_tag in triples])
    return tag_chunk_sents
    
def combined_tagger(train_data, taggers, backoff=None):
    """Chain tagger classes so that each one backs off to the one built before it.

    Every entry of ``taggers`` is called as ``tagger(train_data, backoff=...)``;
    the tagger built last is returned (or ``backoff`` itself when ``taggers``
    is empty).
    """
    chain = backoff
    for make_tagger in taggers:
        chain = make_tagger(train_data, backoff=chain)
    return chain

class NGramTagChunker(ChunkParserI):
    """Chunker that predicts chunk tags from POS tags with chained n-gram taggers."""

    def __init__(self,train_sentences,tagger_classes=[UnigramTagger,BigramTagger]):
        """Train the backoff chain of ``tagger_classes`` on (POS tag, chunk tag) pairs."""
        pos_chunk_pairs = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(pos_chunk_pairs, tagger_classes)

    def parse(self,tagged_sentence):
        """Return the chunk tree for a list of (word, POS tag) pairs, or None if it is empty."""
        if not tagged_sentence:
            return None
        pos_sequence = [pos for _word, pos in tagged_sentence]
        # The tagger treats POS tags as its "words" and chunk tags as its "tags".
        predicted_chunks = [chunk for _pos, chunk in self.chunk_tagger.tag(pos_sequence)]
        triples = [
            (word, pos, chunk)
            for (word, pos), chunk in zip(tagged_sentence, predicted_chunks)
        ]
        return conlltags2tree(triples)

In [None]:
# Train the chunker on the chunked sentences of the CoNLL-2000 corpus;
# grace_txt_maker uses `ntc` to produce the chunk tags.
data = conll2000.chunked_sents()
ntc = NGramTagChunker(data)

Convert tags into BIOES scheme. Source: https://gist.github.com/allanj/5ad206f7f4645c0269b68fb2065712f4

In [None]:
def iob_iobes(tags):
    """
    Convert a tag sequence from IOB2 (BIO) to IOBES.

    A B- tag not followed by an I- tag becomes S-, and an I- tag not followed
    by another I- tag becomes E-. 'O' and all other B-/I- tags are kept.
    Raises Exception for a tag that is neither 'O' nor prefixed with B or I.
    """
    converted = []
    last = len(tags) - 1
    for idx, tag in enumerate(tags):
        if tag == 'O':
            converted.append(tag)
            continue

        prefix = tag.split('-')[0]
        if prefix not in ('B', 'I'):
            raise Exception('Invalid IOB format!')

        # Does the current chunk continue with the next tag?
        continues = idx < last and tags[idx + 1].split('-')[0] == 'I'
        if continues:
            converted.append(tag)
        elif prefix == 'B':
            converted.append(tag.replace('B-', 'S-'))
        else:
            converted.append(tag.replace('I-', 'E-'))
    return converted

In [None]:
def grace_txt_maker(df):
    """Build one block of token lines per row for the GRACE text file.

    Every token becomes a line
    ``<token> <POS> <chunk> <B_AP|I_AP|O> <POLARITY|O> <B_AP+POLARITY|I_AP+POLARITY|O>``.
    POS tags come from nltk.pos_tag, chunk tags from the module-level chunker
    ``ntc`` converted with iob_iobes, and the polarity is the upper-cased label
    from the frame. Reads the module-level lists ``asp_cols`` and ``pol_cols``.

    Returns:
        List of strings, one multi-line block per row.
    """
    
    data_lines = []

    for line in df.index:

        text = df.text[line]
        tokens = nltk.word_tokenize(text)

        # correct tokens: strip a trailing "-" (len > 2) or a leading "'"
        # (len > 3) and remove bare quote/parenthesis tokens.
        # NOTE(review): `tokens` is modified while it is being enumerated, so
        # the token following a removed one is skipped by this loop, and
        # remove() deletes the first equal element. Confirm this is intended.
        for no,tok in enumerate(tokens):
            if tok[-1:] == "-" and len(tok)>2:
                tokens[no] = tok[:-1]
            if tok[:1] == "'" and len(tok)>3:
                tokens[no] = tok[1:]
            if tok in ["'","(",")"]:
                tokens.remove(tok)

        # create pos tags (on the corrected tokens)
        pos_tags = nltk.pos_tag(tokens)

        # create chunk/phrase tags and convert them from IOB to IOBES
        full_tags = tree2conlltags(ntc.parse(pos_tags))
        chunks_list = [full_tags[ii][2] for ii in range(len(full_tags))]
        new_chunks = iob_iobes(chunks_list)

        # create aspect-polarity dict (aspect term -> upper-cased polarity)
        # and remember the longest aspect length in whitespace-separated words
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = df.loc[line,pol_cols[col]].upper()
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())

        label = ""
        # check for one-word-aspects
        for pos,tok in enumerate(tokens):
            label += tok + " " + pos_tags[pos][1] + " " + new_chunks[pos]
            if tok in asp_sent_dict.keys():
                label +=  " B_AP " + asp_sent_dict[tok] + " B_AP+" + asp_sent_dict[tok] + "\n"
            else:
                label += " O O O \n"

        # check for multi-word-aspects: rewrite the " O O O " lines of a
        # matching window to B_AP (first token) and I_AP (remaining tokens).
        # NOTE(review): str.replace acts on the whole label string, so an
        # identical "<token> <POS> <chunk> O O O" line elsewhere is rewritten too.
        for ii in range(2,max_asp_len+1):
            for no,tok in enumerate(tokens):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok not in tokens and new_tok in asp_sent_dict.keys():
                    new_pol = asp_sent_dict[new_tok]
                    label = label.replace(tokens[no]+ " " + pos_tags[no][1] + " " + new_chunks[no] + " O O O \n",
                                          tokens[no]+ " " + pos_tags[no][1] + " " + new_chunks[no] + " B_AP " + \
                                          new_pol + " B_AP+" + new_pol + "\n")
                    for xx in range(1,ii):
                        label = label.replace(tokens[no+xx] + " " + pos_tags[no+xx][1] + " " + new_chunks[no+xx] + " O O O \n",
                                          tokens[no+xx]+ " " + pos_tags[no+xx][1] + " " + new_chunks[no+xx] + " I_AP " + \
                                          new_pol + " I_AP+" + new_pol + "\n")

        data_lines += [label]

    return data_lines

In [None]:
grace_txt_data = grace_txt_maker(df)

# Pin the encoding: review text may contain non-ASCII characters, and the
# default encoding of open() depends on the platform.
with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/grace_val.txt","w", encoding="utf-8") as f:
    f.write('\n'.join(grace_txt_data))