In [1]:
# Mount Google Drive under /content/drive/ so the data paths used below resolve.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Data Import

In [2]:
import pandas as pd
import xml.etree.ElementTree as ET

In [3]:
# Parse the original MAMS training file into an ElementTree.
parsedXML = ET.parse("/content/drive/My Drive/Masterarbeit/Data/Original/MAMS/mams_train.xml")

In [4]:
# Exclusive upper bound for aspect slot numbers: the cells below iterate
# range(1, aspect_number), i.e. slots 1..11.
aspect_number = 12

In [5]:
# Column layout: id, text, then (term, polarity, from, to) for every aspect slot.
dfcols = ['id', 'text']
for ii in range(1, aspect_number):
    dfcols.append("aspect_term_{}".format(ii))
    dfcols.append("aspect_polarity_{}".format(ii))
    dfcols.append("aspect_from_{}".format(ii))
    dfcols.append("aspect_to_{}".format(ii))

# Collect all rows first and build the frame once: growing a DataFrame row by
# row copies it on every iteration, and DataFrame.append was removed in pandas 2.0.
rows = []
for sentence in parsedXML.getroot():
    sentence_id = sentence.attrib.get('id')  # not named `id`, which would shadow the builtin
    text = sentence.find('text').text
    line = [sentence_id, text]

    for asp in sentence.iter('aspectTerm'):
        line += [
            asp.attrib.get("term"),
            asp.attrib.get("polarity"),
            asp.attrib.get("from"),
            asp.attrib.get("to"),
        ]

    if len(line) > len(dfcols):
        raise ValueError(
            "sentence has more aspect terms than the {} available slots: {!r}".format(
                aspect_number - 1, text
            )
        )

    # Pad unused aspect slots with None so every row has the same width.
    line += [None] * (len(dfcols) - len(line))
    rows.append(line)

# dtype=object keeps the raw attribute strings and the None padding as they are.
df = pd.DataFrame(rows, columns=dfcols, dtype=object)

In [6]:
# Display the parsed frame.
df

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10,aspect_term_11,aspect_polarity_11,aspect_from_11,aspect_to_11
0,,The decor is not special at all but their food...,decor,negative,4,9,food,positive,42,46,prices,positive,59,65,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,"when tables opened up, the manager sat another...",tables,neutral,5,11,manager,negative,27,34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,Though the menu includes some unorthodox offer...,menu,neutral,11,15,peanut butter roll,negative,54,72,classics,positive,93,101,sushi,positive,145,150,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,"service is good although a bit in your face, w...",service,positive,0,7,food,neutral,78,82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,PS- I just went for brunch on Saturday and the...,brunch,neutral,20,26,eggs served with onions,positive,47,70,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4292,,"For dinner, I love the churrasco and halibut w...",dinner,neutral,4,10,churrasco,positive,23,32,halibut with of course black beans,positive,37,71,rice,positive,83,87,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4293,,"Was there for dinner last night, and the food ...",dinner,neutral,14,20,food,positive,41,45,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4294,,The menu sounded good but the grilled eggplant...,menu,neutral,4,8,grilled eggplant roll,negative,30,51,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4295,,Service is coddling and correct and there's no...,Service,positive,0,7,cheese,neutral,87,93,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Drop duplicates

In [7]:
# Every column except "id" takes part in the duplicate comparison.
cols_wo_id = df.columns.drop("id")
# keep=False marks all members of a duplicate group, not only the repeats.
df[df.duplicated(cols_wo_id, keep=False)].sort_values("text")

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10,aspect_term_11,aspect_polarity_11,aspect_from_11,aspect_to_11


In [8]:
# Remove rows that repeat another row in everything but "id" and renumber the index.
df = df.drop_duplicates(subset=cols_wo_id, ignore_index=True)

In [9]:
# How often each sentence text occurs; show the first ten entries.
text_counts = df["text"].value_counts()
text_counts.head(10)

The waiter placed the wrong entree in front of us each time.                                                                                                                                   1
We get our beers and then are told from a confuzed waitress that she realized that she can't give us the happy hour drink prices if we are not sitting at the bar.                             1
In the middle of dessert, the waitress silently passes by and drops the check.                                                                                                                 1
Upon his arrival the staff checked in his bags and put them in the storage room, but when my bf asked for a check-in ticket, the host refused to give one and threw the bags back at us.       1
In summer, the small outdoor garden is an ideal place to sip coffee while reading under a canopy of trees and sky.                                                                             1
The lobster is good but over-priced

# First Descriptive Analysis


## Aspects per sentence

In [10]:
# Names of the polarity column of every aspect slot.
pol_cols = ["aspect_polarity_{}".format(ii) for ii in range(1, aspect_number)]

In [11]:
# Slots are filled left to right, so the number of non-null values in slot k
# minus the number in slot k+1 is the number of sentences with exactly k aspects.
total_asp = 0
prev = 0
last_col = "aspect_polarity_" + str(aspect_number - 1)

for no, col in enumerate(pol_cols):
    filled = sum(df[col].value_counts())  # non-null polarity entries in this slot

    if col != "aspect_polarity_1":
        exact = prev - filled
        print("sentences with exactly ", no, "aspects:", exact)
        total_asp += no * exact

    # The last slot has no successor to subtract, so report it directly.
    if col == last_col:
        print("sentences with exactly ", no+1, "aspects:", filled)
        total_asp += (no+1) * filled

    prev = filled

print("total no of aspects: ", total_asp)

sentences with exactly  1 aspects: 0
sentences with exactly  2 aspects: 2568
sentences with exactly  3 aspects: 1169
sentences with exactly  4 aspects: 364
sentences with exactly  5 aspects: 126
sentences with exactly  6 aspects: 48
sentences with exactly  7 aspects: 13
sentences with exactly  8 aspects: 6
sentences with exactly  9 aspects: 1
sentences with exactly  10 aspects: 1
sentences with exactly  11 aspects: 1
total no of aspects:  11186


## Sentiment Frequency

In [12]:
# Count each polarity label per aspect slot, then add the slots up per label.
df_pol = df[pol_cols]
df_pol_counts = df_pol.apply(lambda column: column.value_counts())
df_pol_counts.sum(axis=1)

negative    2764.0
neutral     5042.0
positive    3380.0
dtype: float64

In [13]:
# Total number of polarity labels over all slots.
sum(df_pol_counts.sum(axis=1))

11186.0

## Sentences with more than one distinct sentiment

In [14]:
# Count the sentences whose aspects carry at least two different polarity labels.
multi_counter = 0
for line in df.index:
    distinct_sentiments = {
        df.loc[line, col] for col in pol_cols if df.loc[line, col] != None
    }
    if len(distinct_sentiments) > 1:
        multi_counter += 1
multi_counter

4297

# Remove "conflict"

In [15]:
# Blank out every aspect whose polarity is "conflict": polarity, term and both offsets.
for line in range(len(df)):
    for col in pol_cols:
        if df.loc[line,col] == "conflict":
            # Take the whole slot number after the last underscore. Slicing the
            # final character only would turn slots 10 and 11 into "0" and "1".
            number = col.rsplit("_", 1)[-1]
            df.loc[line,col] = None
            df.loc[line,"aspect_term_"+number] = None
            df.loc[line,"aspect_to_"+number] = None
            df.loc[line,"aspect_from_"+number] = None

In [16]:
# Display the frame after the "conflict" entries were blanked.
df

Unnamed: 0,id,text,aspect_term_1,aspect_polarity_1,aspect_from_1,aspect_to_1,aspect_term_2,aspect_polarity_2,aspect_from_2,aspect_to_2,aspect_term_3,aspect_polarity_3,aspect_from_3,aspect_to_3,aspect_term_4,aspect_polarity_4,aspect_from_4,aspect_to_4,aspect_term_5,aspect_polarity_5,aspect_from_5,aspect_to_5,aspect_term_6,aspect_polarity_6,aspect_from_6,aspect_to_6,aspect_term_7,aspect_polarity_7,aspect_from_7,aspect_to_7,aspect_term_8,aspect_polarity_8,aspect_from_8,aspect_to_8,aspect_term_9,aspect_polarity_9,aspect_from_9,aspect_to_9,aspect_term_10,aspect_polarity_10,aspect_from_10,aspect_to_10,aspect_term_11,aspect_polarity_11,aspect_from_11,aspect_to_11
0,,The decor is not special at all but their food...,decor,negative,4,9,food,positive,42,46,prices,positive,59,65,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,"when tables opened up, the manager sat another...",tables,neutral,5,11,manager,negative,27,34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,Though the menu includes some unorthodox offer...,menu,neutral,11,15,peanut butter roll,negative,54,72,classics,positive,93,101,sushi,positive,145,150,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,"service is good although a bit in your face, w...",service,positive,0,7,food,neutral,78,82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,PS- I just went for brunch on Saturday and the...,brunch,neutral,20,26,eggs served with onions,positive,47,70,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4292,,"For dinner, I love the churrasco and halibut w...",dinner,neutral,4,10,churrasco,positive,23,32,halibut with of course black beans,positive,37,71,rice,positive,83,87,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4293,,"Was there for dinner last night, and the food ...",dinner,neutral,14,20,food,positive,41,45,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4294,,The menu sounded good but the grilled eggplant...,menu,neutral,4,8,grilled eggplant roll,negative,30,51,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4295,,Service is coddling and correct and there's no...,Service,positive,0,7,cheese,neutral,87,93,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Check for wrong positions

In [17]:
# Verify that text[from:to] reproduces every annotated aspect term; collect the
# row numbers where it does not.
mistakes = []
for ii in range(len(df)):
    sentence_text = df.text[ii]
    for xx in range(1, aspect_number):
        slot = str(xx)
        actual_term = df.loc[ii, "aspect_term_" + slot]
        if actual_term == None:
            continue
        start = int(df.loc[ii, "aspect_from_" + slot])
        end = int(df.loc[ii, "aspect_to_" + slot])
        pos_term = sentence_text[start:end]
        if actual_term != pos_term:
            mistakes += [ii]
            print(actual_term, pos_term)
mistakes

[]

# Check for wrong aspect terms

In [18]:
import nltk

In [19]:
# Download NLTK's punkt tokenizer package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
# Report aspect terms containing a token that does not occur among the sentence
# tokens, neither as-is nor with a trailing hyphen.
for ii in range(len(df)):
    tokens = nltk.word_tokenize(df.text[ii])
    for xx in range(1, aspect_number):
        actual_term = df.loc[ii, "aspect_term_{}".format(xx)]
        if actual_term == None:
            continue
        for asp_part in nltk.word_tokenize(actual_term):
            found = asp_part in tokens or asp_part + "-" in tokens
            if not found:
                print(ii,"-",xx,":",tokens)
                print(actual_term, asp_part)

1191 - 2 : ['I', 'liked', 'the', 'beer', 'selection', '!']
beer s s
1203 - 4 : ['No', 'table', 'space', 'and', 'one', 'of', 'the', 'angry', 'neighbors', 'decided', 'to', 'take', 'matters', 'into', 'his', 'own', 'hands', 'by', 'throwing', 'a', 'bucket', 'of', 'water', 'out', 'his', 'window', 'and', 'onto', 'the', 'patrons', 'waiting', 'for', 'their', 'tables', '.']
table s s
1535 - 1 : ['My', 'only', 'complaint', 'might', 'be', 'the', 'beer', 'selection', '-', 'they', 'did', "n't", 'really', 'have', 'any', 'dark', 'beers', ',', 'which', 'I', 'like', 'to', 'have', 'with', 'a', 'steak', '.']
beer s s
1594 - 4 : ['I', 'tried', 'the', 'Crab', 'Croquettes', '(', 'delicious', ',', 'and', 'yes', ',', 'the', 'sauce', 'IS', 'hot', ')', 'and', 'the', 'Salade', 'Jumelle', '(', 'a', 'good', ',', 'basic', 'mesculen', 'salad', ')', '.']
Salad Salad
1773 - 3 : ['Place', 'was', 'empty', ',', 'waitress', 'was', 'rude', ',', 'told', 'us', 'that', 'even', 'though', 'there', 'were', 'drink', 'specials', 'a

In [21]:
# Row 1191, aspect 2: the check above flagged the term "beer s"; keep "beer"
# and pull the end offset back by 2 characters.
end_col = "aspect_to_2"
df.loc[1191, "aspect_term_2"] = "beer"
df.loc[1191, end_col] = str(int(df.loc[1191, end_col]) - 2)

In [22]:
# Row 1203, aspect 4: the check above flagged the term "table s"; keep "table"
# and pull the end offset back by 2 characters.
end_col = "aspect_to_4"
df.loc[1203, "aspect_term_4"] = "table"
df.loc[1203, end_col] = str(int(df.loc[1203, end_col]) - 2)

In [23]:
# Row 1535, aspect 1: the check above flagged the term "beer s"; keep "beer"
# and pull the end offset back by 2 characters.
end_col = "aspect_to_1"
df.loc[1535, "aspect_term_1"] = "beer"
df.loc[1535, end_col] = str(int(df.loc[1535, end_col]) - 2)

In [24]:
# Row 1594, aspect 4: the annotated term "Salad" is cut short, the sentence
# token is "Salade"; extend the term and the end offset by 1 character.
end_col = "aspect_to_4"
df.loc[1594, "aspect_term_4"] = "Salade"
df.loc[1594, end_col] = str(int(df.loc[1594, end_col]) + 1)

In [25]:
# Row 1773, aspect 3: manual correction - set the term to "drink" and pull the
# end offset back by 2 characters.
end_col = "aspect_to_3"
df.loc[1773, "aspect_term_3"] = "drink"
df.loc[1773, end_col] = str(int(df.loc[1773, end_col]) - 2)

In [26]:
# Row 2938, aspect 1: manual correction - set the term to "Cheeses" and extend
# the end offset by 1 character.
end_col = "aspect_to_1"
df.loc[2938, "aspect_term_1"] = "Cheeses"
df.loc[2938, end_col] = str(int(df.loc[2938, end_col]) + 1)

In [27]:
# Row 3865, aspect 3: manual correction - set the term to "clam" and pull the
# end offset back by 2 characters.
end_col = "aspect_to_3"
df.loc[3865, "aspect_term_3"] = "clam"
df.loc[3865, end_col] = str(int(df.loc[3865, end_col]) - 2)

In [28]:
# Row 3903, aspect 3: manual correction - set the term to "beet" and pull the
# end offset back by 2 characters.
end_col = "aspect_to_3"
df.loc[3903, "aspect_term_3"] = "beet"
df.loc[3903, end_col] = str(int(df.loc[3903, end_col]) - 2)

Re-check for wrong positions

In [29]:
# Repeat the offset check after the manual corrections: text[from:to] must
# reproduce every annotated aspect term.
mistakes = []
for ii in range(len(df)):
    sentence_text = df.text[ii]
    for xx in range(1, aspect_number):
        slot = str(xx)
        actual_term = df.loc[ii, "aspect_term_" + slot]
        if actual_term == None:
            continue
        start = int(df.loc[ii, "aspect_from_" + slot])
        end = int(df.loc[ii, "aspect_to_" + slot])
        pos_term = sentence_text[start:end]
        if actual_term != pos_term:
            mistakes += [ii]
            print(actual_term, pos_term)
mistakes

[]

# Descriptive Analysis


## Aspects per sentence

In [30]:
# Names of the polarity column of every aspect slot.
pol_cols = ["aspect_polarity_{}".format(ii) for ii in range(1, aspect_number)]

In [31]:
# Slots are filled left to right, so the number of non-null values in slot k
# minus the number in slot k+1 is the number of sentences with exactly k aspects.
total_asp = 0
prev = 0
last_col = "aspect_polarity_" + str(aspect_number - 1)

for no, col in enumerate(pol_cols):
    filled = sum(df[col].value_counts())  # non-null polarity entries in this slot

    if col != "aspect_polarity_1":
        exact = prev - filled
        print("sentences with exactly ", no, "aspects:", exact)
        total_asp += no * exact

    # The last slot has no successor to subtract, so report it directly.
    if col == last_col:
        print("sentences with exactly ", no+1, "aspects:", filled)
        total_asp += (no+1) * filled

    prev = filled

print("total no of aspects: ", total_asp)

sentences with exactly  1 aspects: 0
sentences with exactly  2 aspects: 2568
sentences with exactly  3 aspects: 1169
sentences with exactly  4 aspects: 364
sentences with exactly  5 aspects: 126
sentences with exactly  6 aspects: 48
sentences with exactly  7 aspects: 13
sentences with exactly  8 aspects: 6
sentences with exactly  9 aspects: 1
sentences with exactly  10 aspects: 1
sentences with exactly  11 aspects: 1
total no of aspects:  11186


## Sentiment Frequency

In [32]:
# Count each polarity label per aspect slot, then add the slots up per label.
df_pol = df[pol_cols]
df_pol_counts = df_pol.apply(lambda column: column.value_counts())
df_pol_counts.sum(axis=1)

negative    2764.0
neutral     5042.0
positive    3380.0
dtype: float64

In [33]:
# Total number of polarity labels over all slots.
sum(df_pol_counts.sum(axis=1))

11186.0

## Sentences with more than one distinct sentiment

In [34]:
# Count the sentences whose aspects carry at least two different polarity labels.
multi_counter = 0
for line in df.index:
    distinct_sentiments = {
        df.loc[line, col] for col in pol_cols if df.loc[line, col] != None
    }
    if len(distinct_sentiments) > 1:
        multi_counter += 1
multi_counter

4297

# Save as xml

In [None]:
def xml_maker(df,aspect_number):
    """Serialize the sentence frame into a <sentences> XML document.

    Each row becomes a <sentence id=...> element holding a <text> child and an
    <aspectTerms> child. Every aspect slot 1..aspect_number-1 whose term is not
    None adds an <aspectTerm> with the attributes from, polarity, term and to.

    Args:
        df: frame with the columns "id", "text" and, per slot k,
            "aspect_term_k", "aspect_polarity_k", "aspect_from_k", "aspect_to_k".
        aspect_number: exclusive upper bound of the slot numbers.

    Returns:
        The serialized document as returned by ET.tostring (bytes).
    """
    root = ET.Element('sentences')
    slots = [str(xx) for xx in range(1, aspect_number)]

    for line in df.index:
        entry = ET.SubElement(root, "sentence")
        entry.set("id", str(df["id"][line]))

        ET.SubElement(entry, "text").text = str(df["text"][line])

        asp_child = ET.SubElement(entry, "aspectTerms")
        for slot in slots:
            if df.loc[line, "aspect_term_" + slot] == None:
                continue
            asp_subchild = ET.SubElement(asp_child, "aspectTerm")
            # Attributes are set in this fixed order: from, polarity, term, to.
            for attr in ("from", "polarity", "term", "to"):
                asp_subchild.set(attr, str(df["aspect_" + attr + "_" + slot][line]))

    return ET.tostring(root)

In [None]:
# Serialize the cleaned frame and store it as the final training XML.
xml_data = xml_maker(df, aspect_number)

xml_path = "/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/train.xml"
with open(xml_path, "w") as xml_file:
    xml_file.write(xml_data.decode('utf-8'))

# Create xml.seg

In [None]:
def pol_to_no(sentiment):
    """Convert a polarity label into its numeric class, returned as a string.

    "positive" -> "1", "negative" -> "-1", "neutral" -> "0".

    Args:
        sentiment: polarity label.

    Returns:
        The numeric class as a string.

    Raises:
        ValueError: if sentiment is not one of the three labels. Without this
            check an unknown label would hit an unassigned local variable.
    """
    classes = {"positive": 1, "negative": -1, "neutral": 0}
    try:
        pol = classes[sentiment]
    except (KeyError, TypeError):
        raise ValueError("unknown sentiment label: {!r}".format(sentiment))

    return str(pol)

In [None]:
def _mask_aspect(text, term, start, end):
    """Return text with the annotated aspect occurrence replaced by ``$T$``.

    The character offsets select the single annotated occurrence. When they are
    missing, not numeric, or do not point at term, every occurrence of term is
    replaced instead.
    """
    try:
        start, end = int(start), int(end)
    except (TypeError, ValueError):
        return text.replace(term, '$T$')
    if text[start:end] != term:
        return text.replace(term, '$T$')
    return text[:start] + '$T$' + text[end:]


def xml_seg_maker(df):
    """Build the entries of the ".xml.seg" file.

    For every aspect of every sentence three entries are appended in order:
    the sentence with the aspect masked as ``$T$``, the aspect term, and the
    polarity class returned by ``pol_to_no``.

    Masking uses the annotated character offsets, so a term that also occurs
    elsewhere in the sentence (or inside another word) is masked only at the
    annotated position.

    Args:
        df: frame with an "id" column followed by "text" and the per-slot
            columns in the order term, polarity, from, to.

    Returns:
        A flat list of strings, three per aspect.
    """
    df_wo_id = df.drop(columns="id").reset_index(drop=True)
    # Use the column count rather than the first row, which fails on an empty frame.
    n_cols = len(df_wo_id.columns)
    data_lines = []

    for ii in df_wo_id.index:

        line = list(df_wo_id.loc[ii])
        o_text = line[0]

        # After dropping "id": column 0 is the text, then groups of four
        # (term, polarity, from, to) starting at column 1.
        for base in range(1, n_cols, 4):
            term, pol, start, end = (line[base:base + 4] + [None] * 3)[:4]
            if term == None:
                continue
            data_lines += [_mask_aspect(o_text, term, start, end), term, pol_to_no(pol)]

    return data_lines

In [None]:
# Write the .xml.seg entries, one per line.
xml_seg_data = xml_seg_maker(df)

seg_path = "/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/train.xml.seg"
with open(seg_path, "w") as seg_file:
    seg_file.write('\n'.join(xml_seg_data))

# Create BERT+txt

In [None]:
# Names of the term column of every aspect slot.
asp_cols = ["aspect_term_{}".format(ii) for ii in range(1, aspect_number)]

In [None]:
def sent_conv(sentiment):
    """Translate a polarity label into its three-letter tag.

    "positive" -> "POS", "negative" -> "NEG", "neutral" -> "NEU".
    Any other value yields None.
    """
    tags = (("positive", "POS"), ("negative", "NEG"), ("neutral", "NEU"))
    for label, tag in tags:
        if sentiment == label:
            return tag
    return None

In [None]:
def txt_maker(df):
    """Build the lines of the "text####labels" training file.

    Each sentence yields one line: the raw text, "####", and the space-separated
    tokens, each suffixed with "=O" or, for tokens belonging to an aspect term,
    "=T-" plus the tag from ``sent_conv``.

    Args:
        df: frame with a "text" column and the aspect term / polarity columns
            named in the module-level ``asp_cols`` and ``pol_cols``.

    Returns:
        A list with one string per row of df.
    """
    data_lines = []

    for line in df.index:

        text = df.loc[line,"text"]

        # correct tokens
        # Build a new list instead of removing from the list being enumerated:
        # removing in place shifts the remaining items and skips the token
        # that follows each removed one.
        tokens = []
        for raw_tok in nltk.word_tokenize(text):
            if raw_tok in ["'","(",")"]:
                continue
            tok = raw_tok
            if raw_tok[-1:] == "-" and len(raw_tok)>2:
                tok = raw_tok[:-1]
            # Checked against the raw token; when both rules apply this one wins.
            if raw_tok[:1] == "'" and len(raw_tok)>3:
                tok = raw_tok[1:]
            tokens.append(tok)

        # create aspect-polarity dict
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = sent_conv(df.loc[line,pol_cols[col]])
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())


        label = ""
        # check for one-word-aspects
        for tok in tokens:
            if tok in asp_sent_dict.keys():
                label += tok + "=T-" + asp_sent_dict[tok] + " "
            else:
                label += tok + "=O "
        label = label[:-1]

        # check for multi-word-aspects
        # Slide a window of ii tokens over the sentence; where the joined window
        # equals an aspect term, swap its "=O" labels for "=T-<tag>" labels.
        for ii in range(2,max_asp_len+1):
            for no,tok in enumerate(tokens):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok in asp_sent_dict.keys():
                    new_pol = asp_sent_dict[new_tok]
                    old_label = " ".join([tokens[no+xx]+"=O" for xx in range(ii) if no+xx < len(tokens)])
                    new_label = " ".join([tokens[no+xx]+"=T-"+new_pol for xx in range(ii) if no+xx < len(tokens)])
                    label = label.replace(old_label, new_label)

        data_lines += [text+"####"+label]

    return data_lines

In [None]:
# Write the "text####labels" lines, one sentence per line.
txt_data = txt_maker(df)

txt_path = "/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/train.txt"
with open(txt_path, "w") as txt_file:
    txt_file.write('\n'.join(txt_data))

# Create RGATjson

In [None]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/50/ae/a70a58ce6b4e2daad538688806ee0f238dbe601954582a74ea57cde6c532/stanza-1.2-py3-none-any.whl (282kB)
[K     |█▏                              | 10kB 17.0MB/s eta 0:00:01[K     |██▎                             | 20kB 24.0MB/s eta 0:00:01[K     |███▌                            | 30kB 11.6MB/s eta 0:00:01[K     |████▋                           | 40kB 10.1MB/s eta 0:00:01[K     |█████▉                          | 51kB 8.1MB/s eta 0:00:01[K     |███████                         | 61kB 8.7MB/s eta 0:00:01[K     |████████▏                       | 71kB 8.7MB/s eta 0:00:01[K     |█████████▎                      | 81kB 8.8MB/s eta 0:00:01[K     |██████████▌                     | 92kB 8.7MB/s eta 0:00:01[K     |███████████▋                    | 102kB 8.6MB/s eta 0:00:01[K     |████████████▉                   | 112kB 8.6MB/s eta 0:00:01[K     |██████████████                  | 122kB 8.6MB/s eta 0:0

In [None]:
import stanza

In [None]:
# Download the English Stanza models and build a pipeline with the tokenize,
# pos, lemma and depparse processors.
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 25.3MB/s]                    
2021-05-10 07:26:30 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [01:59<00:00, 3.45MB/s]
2021-05-10 07:28:39 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-05-10 07:28:39 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-05-10 07:28:39 INFO: Use device: cpu
2021-05-10 07:28:39 INFO: Loading: tokenize
2021-05-10 07:28:40 INFO: Loading: pos
2021-05-10 07:28:40 INFO: Loading: lemma
2021-05-10 07:28:40 INFO: Loading: depparse
2021-05-10 07:28:40 INFO: Done loading processors!


In [None]:
from more_itertools import locate

## in case of creating for the first time

In [None]:
import json

In [None]:
def json_make_pos(df):
    """Interactively collect token positions for aspects that cannot be aligned.

    For every aspect the function tries to locate the aspect's tokens among the
    sentence tokens. Whenever that fails, it prints the numbered tokens and asks
    for the start and end token position via ``input()``. All manually entered
    positions are stored as ``{text: {term: {"from": ..., "to": ...}}}`` and
    written to the aspect_positions JSON file at the end.

    Args:
        df: frame with a "text" column and the per-slot "aspect_term_k" /
            "aspect_from_k" columns.

    Returns:
        None. The result is the JSON file written as a side effect.
    """
    manual_pos = {}

    for ii in df.index:

        text = df.loc[ii,"text"]
        tokens = [token.text for sentence in nlp(text).sentences for token in sentence.tokens]

        for xx in range(1,aspect_number):
            if df.loc[ii,"aspect_term_"+str(xx)] != None:
                term = df.loc[ii,"aspect_term_"+str(xx)]

                # construct aspect position on token level
                asp_toks = [token.text for sentence in nlp(term).sentences for token in sentence.tokens]
                asp_ind = [list(locate(tokens, lambda a: a == tok)) for tok in asp_toks]

                # for aspects appearing only once in text, take the correct position
                # otherwise set to None
                if len(asp_ind[0]) == 1:
                    from_index = asp_ind[0][0]
                else:
                    from_index = None
                if len(asp_ind[-1]) == 1:
                    to_index = asp_ind[-1][0]
                else:
                    to_index = None

                # if both start and end pos are unknown,
                # e.g. for single-word aspects,
                # take character positions for help
                if from_index is None and to_index is None and len(asp_ind[0]) != 0:
                    print(ii, ": ", text)
                    print("original term: ",term)
                    all_char_from = [i for i in range(len(text)) if text.startswith(asp_toks[0], i)]
                    print("all start chars: ",all_char_from)
                    corr_char_from = int(df.loc[ii, "aspect_from_"+str(xx)])
                    print("correct start char: ",corr_char_from)
                    print("text beginning at correct start char: ", text[corr_char_from:])
                    print("tokens: ",tokens)
                    print("original asp tokens: ", asp_toks)
                    print("original asp indices: ",asp_ind)
                    # Only the first and the last occurrence can be resolved this way.
                    if corr_char_from == max(all_char_from):
                        from_index = asp_ind[0][-1]
                    elif corr_char_from == min(all_char_from):
                        from_index = asp_ind[0][0]

                # in case of missing start/end positions,
                # try to find "to"/"from" using aspect token number as distance
                if from_index is None and to_index is not None:
                    from_index = to_index - len(asp_toks) +1
                if to_index is None and from_index is not None:
                    to_index = from_index + len(asp_toks) -1

                # correct tokenization errors in aspect term tokenization
                if from_index is None or to_index is None or asp_toks != tokens[from_index:to_index+1]:

                    print("Tokenization Error in line ",ii,"!")
                    for pos, tok in enumerate(tokens):
                        print(pos, tok)
                    print("aspect term: ", term)
                    print("original asp tokens: ", asp_toks)

                    from_index = int(input("start position?"))
                    to_index = int(input("end position?"))

                    # add manually stated positions to dict for reproducibility
                    # setdefault keeps positions entered earlier for another
                    # aspect of the same sentence instead of resetting them.
                    manual_pos.setdefault(text, {})[term] = {"from": from_index, "to": to_index}
                    print(manual_pos)

                    print("Final aspect tokens: ", tokens[from_index:to_index+1])

    with open("/content/drive/My Drive/Masterarbeit/Data/preprocessing/aspect_positions/mams_train.json","w") as f:
        json.dump(manual_pos, f)

# Interactive run: asks for token positions via input() whenever an aspect
# cannot be aligned automatically, then writes the collected positions to JSON.
json_make_pos(df)

Tokenization Error in line  62 !
0 The
1 menu
2 features
3 mild
4 versions
5 of
6 Lone
7 Star
8 state
9 favorites
10 ,
11 from
12 double
13 -
14 basted
15 baby
16 back
17 ribs
18 and
19 steak
20 fajitas
21 to
22 red-
23 beef
24 chili
25 and
26 deep
27 -
28 fried
29 onions
30 .
aspect term:  red-beef chili
original asp tokens:  ['red', '-', 'beef', 'chili']
start position?22
end position?24
{'The menu features mild versions of Lone Star state favorites, from double-basted baby back ribs and steak fajitas to red-beef chili and deep-fried onions.': {'red-beef chili': {'from': 22, 'to': 24}}}
Final aspect tokens:  ['red-', 'beef', 'chili']
Tokenization Error in line  109 !
0 Staple
1 entrees
2 like
3 moussaka
4 more
5 than
6 make
7 the
8 grade
9 ,
10 but
11 the
12 selection
13 of
14 clay
15 pot-cooked
16 dishes
17 --
18 tender
19 lamb
20 with
21 orzo
22 ,
23 or
24 fish
25 of
26 the
27 day
28 with
29 eggplant
30 and
31 zucchini--
32 take
33 comfort
34 food
35 to
36 a
37 new
38 level
39 .
as

## in case of reproducing the dataset

In [None]:
def json_maker(df, manual_pos):
    """Build the per-sentence records for the JSON dataset.

    Every record holds the sentence tokens ("token"), their xpos tags ("pos"),
    dependency heads as strings ("head"), dependency relations ("deprel") and
    a list "aspects" with term, polarity and the token span of each aspect
    ("from" inclusive, "to" exclusive).

    Args:
        df: frame with a "text" column and the per-slot aspect columns.
        manual_pos: ``{text: {term: {"from": ..., "to": ...}}}`` with manually
            determined token positions, used when automatic alignment fails.

    Returns:
        A list with one dict per row of df.

    Raises:
        ValueError: if an aspect can neither be aligned automatically nor
            found in manual_pos.
    """
    new_data = []

    for ii in df.index:

        text = df.loc[ii,"text"]

        # Run the pipeline once per sentence and reuse the document for all fields.
        doc = nlp(text)
        tokens = [token.text for sentence in doc.sentences for token in sentence.tokens]
        words = [word for sentence in doc.sentences for word in sentence.words]

        new_dict = {}
        new_dict["token"] = tokens
        new_dict["pos"] = [word.xpos for word in words]
        new_dict["head"] = [str(word.head) for word in words]
        new_dict["deprel"] = [word.deprel for word in words]

        new_dict["aspects"] = []
        for xx in range(1,aspect_number):
            if df.loc[ii,"aspect_term_"+str(xx)] != None:
                asp_dict = {}
                term = df.loc[ii,"aspect_term_"+str(xx)]
                asp_dict["term"] = term
                asp_dict["polarity"] = df.loc[ii,"aspect_polarity_"+str(xx)]

                # construct aspect position on token level
                asp_toks = [token.text for sentence in nlp(term).sentences for token in sentence.tokens]
                asp_ind = [list(locate(tokens, lambda a: a == asp_tok)) for asp_tok in asp_toks]

                # for aspects appearing only once in text, take the correct position
                # otherwise set to None
                if len(asp_ind[0]) == 1:
                    from_index = asp_ind[0][0]
                else:
                    from_index = None
                if len(asp_ind[-1]) == 1:
                    to_index = asp_ind[-1][0]
                else:
                    to_index = None

                # if both start and end pos are unknown,
                # e.g. for single-word aspects,
                # take character positions for help
                if from_index is None and to_index is None and len(asp_ind[0]) != 0:

                    all_char_from = [i for i in range(len(text)) if text.startswith(asp_toks[0], i)]
                    corr_char_from = int(df.loc[ii, "aspect_from_"+str(xx)])

                    # Only the first and the last occurrence can be resolved this way.
                    if corr_char_from == max(all_char_from):
                        from_index = asp_ind[0][-1]
                    elif corr_char_from == min(all_char_from):
                        from_index = asp_ind[0][0]

                # in case of missing start/end positions,
                # try to find "to"/"from" using aspect token number as distance
                if from_index is None and to_index is not None:
                    from_index = to_index - len(asp_toks) +1
                if to_index is None and from_index is not None:
                    to_index = from_index + len(asp_toks) -1

                # correct tokenization errors in aspect term tokenization
                if from_index is None or to_index is None or asp_toks != tokens[from_index:to_index+1]:
                    if text in manual_pos.keys() and term in manual_pos[text].keys():
                        from_index = manual_pos[text][term]["from"]
                        to_index = manual_pos[text][term]["to"]

                if from_index is None or to_index is None:
                    # Fail with context instead of a TypeError on `None + 1`.
                    raise ValueError(
                        "no token position for aspect {!r} in row {}: {!r}".format(term, ii, text)
                    )

                asp_dict["from"] = from_index
                asp_dict["to"] = to_index + 1  # stored end is exclusive

                new_dict["aspects"] += [asp_dict]

        new_data += [new_dict]

    return new_data

In [None]:
# Load the manually entered token positions; the context manager closes the
# file handle, which a bare open() would leave open.
with open("/content/drive/My Drive/Masterarbeit/Data/preprocessing/aspect_positions/mams_train.json") as pos_obj:
    loaded_pos = json.load(pos_obj)

json_data = json_maker(df, loaded_pos)

with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/train.json","w") as f:
    json.dump(json_data, f)

# Create LCF-ATEPCdat

In [None]:
def pol_to_no_shifted(sentiment):
    """Convert a polarity label into its non-negative class, returned as a string.

    "negative" -> "0", "neutral" -> "1", "positive" -> "2".

    Args:
        sentiment: polarity label.

    Returns:
        The numeric class as a string.

    Raises:
        ValueError: if sentiment is not one of the three labels. Without this
            check an unknown label would hit an unassigned local variable.
    """
    classes = {"positive": 2, "negative": 0, "neutral": 1}
    try:
        pol = classes[sentiment]
    except (KeyError, TypeError):
        raise ValueError("unknown sentiment label: {!r}".format(sentiment))

    return str(pol)

In [None]:
def dat_maker(df):
    """Build the token-per-line blocks of the ".dat" file.

    Every token becomes a line "<token> <tag> <polarity>" with tag O, B-ASP or
    I-ASP and polarity -1. For each aspect of a sentence one copy of the
    sentence's lines is emitted in which that aspect's polarity is replaced by
    the class from ``pol_to_no_shifted``; each copy is followed by a "\\n" entry.

    Args:
        df: frame with a "text" column and the aspect term / polarity columns
            named in the module-level ``asp_cols`` and ``pol_cols``.

    Returns:
        A list of strings to be concatenated into the file content.
    """
    data_lines = []

    for line in df.index:

        text = df.loc[line,"text"]

        # correct tokens
        # Build a new list instead of removing from the list being enumerated:
        # removing in place shifts the remaining items and skips the token
        # that follows each removed one.
        tokens = []
        for raw_tok in nltk.word_tokenize(text):
            if raw_tok in ["'","(",")"]:
                continue
            tok = raw_tok
            if raw_tok[-1:] == "-" and len(raw_tok)>2:
                tok = raw_tok[:-1]
            # Checked against the raw token; when both rules apply this one wins.
            if raw_tok[:1] == "'" and len(raw_tok)>3:
                tok = raw_tok[1:]
            tokens.append(tok)

        # create aspect-polarity dict
        asp_sent_dict = {}
        max_asp_len = 0
        for col in range(len(asp_cols)):
            aspect = df.loc[line,asp_cols[col]]
            if aspect != None:
                asp_sent_dict[aspect] = pol_to_no_shifted(df.loc[line,pol_cols[col]])
                if len(aspect.split()) > max_asp_len:
                    max_asp_len = len(aspect.split())

        label = ""
        # check for one-word-aspects
        for tok in tokens:
            if tok in asp_sent_dict.keys():
                label += tok + " B-ASP -1\n"
            else:
                label += tok + " O -1\n"

        # check for multi-word-aspects
        for ii in range(2,max_asp_len+1):
            # Only full windows of ii tokens: a truncated window at the end of
            # the sentence would index past the token list below.
            for no in range(len(tokens) - ii + 1):
                new_tok = " ".join(tokens[no:no+ii])
                if new_tok not in tokens and new_tok in asp_sent_dict.keys():
                    label = label.replace(tokens[no]+" O -1",tokens[no]+" B-ASP -1")
                    for xx in range(1,ii):
                        label = label.replace(tokens[no+xx]+" O -1",tokens[no+xx]+" I-ASP -1")

        # create duplicates of review in case of more than one aspect
        for key, val in asp_sent_dict.items():
            if key in tokens:
                new_label = label.replace(key+" B-ASP -1", key+" B-ASP "+val)
                data_lines += [new_label]
                data_lines += ["\n"]
            else:
                for ii in range(2,max_asp_len+1):
                    for no in range(len(tokens) - ii + 1):
                        new_tok = " ".join(tokens[no:no+ii])
                        if new_tok == key:
                            new_label = label.replace(tokens[no]+" B-ASP -1",tokens[no]+" B-ASP "+val)
                            for xx in range(1,ii):
                                new_label = new_label.replace(tokens[no+xx]+" I-ASP -1",tokens[no+xx]+" I-ASP "+val)

                            data_lines += [new_label]
                            data_lines += ["\n"]

    return data_lines

In [None]:
# Write the .dat blocks; the entries already carry their own line breaks.
dat_data = dat_maker(df)

dat_path = "/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/train.dat"
with open(dat_path, "w") as dat_file:
    dat_file.write(''.join(dat_data))

# Create GRACEtxt

In [None]:
# Names of the term column of every aspect slot.
asp_cols = ["aspect_term_{}".format(ii) for ii in range(1, aspect_number)]

In [None]:
import nltk

In [None]:
# Fetch the NLTK resources used by the cells below.
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('conll2000')  # chunked corpus the n-gram chunker is trained on
nltk.download('punkt')  # tokenizer models used by nltk.word_tokenize

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Create chunk (phrase) tags and POS tags. Source: https://towardsdatascience.com/chunking-in-nlp-decoded-b4a71b2b4e24


In [None]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.corpus import conll2000

In [None]:
def conll_tag_chunks(chunk_sents):
    """Reduce chunked sentences to sequences of (POS tag, chunk tag) pairs.

    Each element of ``chunk_sents`` is converted with ``tree2conlltags`` into
    (word, POS tag, chunk tag) triples; the word is then dropped.

    Returns:
        A list with one list of (POS tag, chunk tag) tuples per sentence.
    """
    pos_chunk_sents = []
    for tree in chunk_sents:
        triples = tree2conlltags(tree)
        pos_chunk_sents.append([(pos_tag, chunk_tag) for _word, pos_tag, chunk_tag in triples])
    return pos_chunk_sents
    
def combined_tagger(train_data, taggers, backoff=None):
    """Build a chain of taggers in which each one backs off to the previous.

    Args:
        train_data: Training data passed unchanged to every tagger class.
        taggers: Iterable of callables invoked as ``cls(train_data, backoff=...)``.
        backoff: Tagger used as backoff for the first entry of ``taggers``.

    Returns:
        The tagger built from the last entry of ``taggers``, or ``backoff``
        itself when ``taggers`` is empty.
    """
    chain_head = backoff
    for tagger_cls in taggers:
        # The tagger built so far becomes the fallback of the next one.
        chain_head = tagger_cls(train_data, backoff=chain_head)
    return chain_head

class NGramTagChunker(ChunkParserI):
    """Chunker that predicts chunk tags from POS tags with chained n-gram taggers."""

    def __init__(self,train_sentences,tagger_classes=[UnigramTagger,BigramTagger]):
        """Train the chunk tagger.

        Args:
            train_sentences: Chunked sentences, reduced to (POS tag, chunk tag)
                sequences via ``conll_tag_chunks`` before training.
            tagger_classes: Tagger classes chained through ``combined_tagger``;
                each one backs off to the tagger built before it.
        """
        pos_chunk_sents = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(pos_chunk_sents, tagger_classes)

    def parse(self,tagged_sentence):
        """Chunk a POS-tagged sentence.

        Args:
            tagged_sentence: Sequence of (word, POS tag) pairs.

        Returns:
            The result of ``conlltags2tree`` on the (word, POS tag, chunk tag)
            triples, or ``None`` when ``tagged_sentence`` is empty.
        """
        if not tagged_sentence:
            return None

        # The chunk tagger only sees the POS tags, never the words themselves.
        pos_sequence = [pos for _word, pos in tagged_sentence]
        tagged_pos = self.chunk_tagger.tag(pos_sequence)

        triples = []
        for (word, pos), (_tagged_pos, chunk) in zip(tagged_sentence, tagged_pos):
            triples.append((word, pos, chunk))
        return conlltags2tree(triples)

In [None]:
# Train the POS-tag -> chunk-tag chunker on the chunked sentences of the
# CoNLL-2000 corpus. `ntc` is read as a module-level global by grace_txt_maker.
data = conll2000.chunked_sents()
ntc = NGramTagChunker(data)

Convert tags into BIOES scheme. Source: https://gist.github.com/allanj/5ad206f7f4645c0269b68fb2065712f4

In [None]:
def iob_iobes(tags):
    """
    IOB2 (BIO) -> IOBES

    A ``B-`` tag that is not followed by an ``I-`` tag becomes ``S-`` (single),
    and an ``I-`` tag that is not followed by another ``I-`` tag becomes ``E-``
    (end). ``O`` tags and all remaining ``B-``/``I-`` tags are kept unchanged.

    Args:
        tags: Sequence of tag strings such as ``'O'``, ``'B-NP'``, ``'I-NP'``.

    Returns:
        A new list with the converted tags, same length as ``tags``.

    Raises:
        Exception: If a tag is neither ``'O'`` nor starts with ``B`` or ``I``
            before its first ``-``.
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
            continue

        prefix = tag.split('-')[0]
        if prefix not in ('B', 'I'):
            raise Exception('Invalid IOB format!')

        # The current chunk continues only if the next tag is an inside tag.
        continues = i + 1 < len(tags) and tags[i + 1].split('-')[0] == 'I'
        if continues:
            new_tags.append(tag)
        elif prefix == 'B':
            # count=1: rewrite the prefix only, never a later 'B-' inside the
            # label itself (e.g. 'B-SUB-NP' must not become 'S-SUS-NP').
            new_tags.append(tag.replace('B-', 'S-', 1))
        else:
            new_tags.append(tag.replace('I-', 'E-', 1))
    return new_tags

In [None]:
def _clean_grace_tokens(tokens):
    """Return a cleaned copy of ``tokens`` without mutating the input.

    Tokens equal to ``'``, ``(`` or ``)`` are dropped. A trailing ``-`` is
    stripped from tokens longer than two characters and a leading ``'`` from
    tokens longer than three characters (both checks use the raw token).
    """
    cleaned = []
    for tok in tokens:
        if tok in ("'", "(", ")"):
            continue
        strip_dash = tok[-1:] == "-" and len(tok) > 2
        strip_quote = tok[:1] == "'" and len(tok) > 3
        if strip_dash:
            tok = tok[:-1]
        if strip_quote:
            tok = tok[1:]
        cleaned.append(tok)
    return cleaned


def _aspect_tags(tokens, asp_sent_dict, max_asp_len):
    """Return one ``(prefix, polarity)`` pair, or ``None``, per token position.

    Single tokens that are aspect keys get ``B_AP``. Afterwards every window
    of 2..max_asp_len consecutive tokens whose space-joined text is an aspect
    key gets ``B_AP`` on its first token and ``I_AP`` on the rest; positions
    that already carry a tag are left untouched.
    """
    tags = [None] * len(tokens)

    for pos, tok in enumerate(tokens):
        if tok in asp_sent_dict:
            tags[pos] = ("B_AP", asp_sent_dict[tok])

    for width in range(2, max_asp_len + 1):
        # Only full-width windows: a window truncated at the sentence end must
        # not match a shorter aspect and then index past the last token.
        for start in range(len(tokens) - width + 1):
            phrase = " ".join(tokens[start:start + width])
            if phrase not in asp_sent_dict:
                continue
            polarity = asp_sent_dict[phrase]
            for offset in range(width):
                if tags[start + offset] is None:
                    prefix = "B_AP" if offset == 0 else "I_AP"
                    tags[start + offset] = (prefix, polarity)

    return tags


def grace_txt_maker(df):
    """Build one GRACE-formatted text block per row of ``df``.

    Every token of a row's ``text`` becomes one output line of the form
    ``token POS chunk aspect polarity aspect+polarity``; tokens outside any
    aspect end in ``O O O``. Tagging is done per token position, so only the
    tokens that actually form an aspect are tagged.

    Relies on the module-level names ``ntc`` (chunker), ``iob_iobes``,
    ``asp_cols`` and ``pol_cols`` (parallel lists of aspect-term and
    aspect-polarity column names).

    Args:
        df: DataFrame with a ``text`` column plus the columns named in
            ``asp_cols`` / ``pol_cols``; missing aspects are None or NaN.

    Returns:
        A list of strings, one per row, each holding that row's token lines.
    """
    data_lines = []

    for line in df.index:

        text = df.text[line]
        tokens = _clean_grace_tokens(nltk.word_tokenize(text))

        # create pos tags
        pos_tags = nltk.pos_tag(tokens)

        # create chunk/phrase tags and convert them to the BIOES scheme
        full_tags = tree2conlltags(ntc.parse(pos_tags))
        new_chunks = iob_iobes([chunk for _word, _pos, chunk in full_tags])

        # create aspect-polarity dict
        asp_sent_dict = {}
        max_asp_len = 0
        for col, asp_col in enumerate(asp_cols):
            aspect = df.loc[line, asp_col]
            if pd.isna(aspect):
                continue
            asp_sent_dict[aspect] = df.loc[line, pol_cols[col]].upper()
            max_asp_len = max(max_asp_len, len(aspect.split()))

        aspect_tags = _aspect_tags(tokens, asp_sent_dict, max_asp_len)

        rows = []
        for pos, tok in enumerate(tokens):
            row = tok + " " + pos_tags[pos][1] + " " + new_chunks[pos]
            if aspect_tags[pos] is None:
                row += " O O O \n"
            else:
                prefix, polarity = aspect_tags[pos]
                row += " " + prefix + " " + polarity + " " + prefix + "+" + polarity + "\n"
            rows.append(row)

        data_lines.append("".join(rows))

    return data_lines

In [None]:
# Build one GRACE-formatted block per review in the training frame.
grace_txt_data = grace_txt_maker(df)

# Every block already ends with a newline after its last token line, so
# joining the blocks with '\n' leaves an empty line between two reviews.
with open("/content/drive/My Drive/Masterarbeit/Data/Final/MAMS/grace_train.txt","w") as f:
    f.write('\n'.join(grace_txt_data))