In [1]:
import pandas as pd
import os
import gzip
from tqdm.autonotebook import tqdm
import re



In [None]:
def en_contraction_removal(text: str) -> str:
    apostrophe_handled = re.sub("’", "'", text)
    # from https://gist.githubusercontent.com/tthustla/74e99a00541264e93c3bee8b2b49e6d8/raw/599100471e8127d6efad446717dc951a10b69777/yatwapart1_01.py
    contraction_mapping = {
                    "i.e.": 'for example',
                    "e.g.": 'for example',
                    "youre": "you are",
                    "youll": "you will",
                    "theyre": "they are", "theyll": "they will",
                    "weve": "we have",
                    "shouldnt": "should not",
                    "dont": "do not",
                    "doesnt": "does not", "doesn": "does not",
                    "didnt": "did not",
                    "wasn": "was not",
                    "arent": "are not", "aren": "are not",
                    "aint": "is not", "isnt": "is not", "isn": "is not",
                    "wouldnt": "would not", "wouldn": "would not",
                    "ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" }
    expanded = ' '.join([contraction_mapping[t.lower()] if t.lower() in contraction_mapping else t for t in apostrophe_handled.split(" ")])
    return expanded

In [None]:
import hunspell
hobj = hunspell.HunSpell('/Library/Spelling/en_US.dic', '/Library/Spelling/en_US.aff')
known_words = ['GMO', 'GMOs', 'Wal-Mart', 'Publix', 'Glyphosate', 'co2', 'Waitrose', '<URL>', 'certifier', 'TLDR', 'Coca~Cola', 'Quora', 'sci-fi']

for w in known_words:
    hobj.add(w)

In [None]:
url_regex = r'(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&\(\)\*\+,;=.]+'

def replace_urls_regex(sentence: str, url_token: str = '<URL>') -> str:
    return re.sub(url_regex, url_token, sentence)

def replace_urls(words, url_token: str = '<URL>'):
    return [url_token if (w.lower().startswith('www') or w.lower().startswith('http')) else w for w in words]

def spellcheck_sentence(row) -> str:
    sent = row['Sentence']
    #print(sent)
    to_remove = [',', '(', ')', ':', '?', '&', '/', '*']
    for tr in to_remove:
        sent = sent.replace(tr, ' ')
        
    sent = sent.replace('€™', "'")
    sent = sent.replace('�', "'")
    sent = en_contraction_removal(sent)
    sent = sent.replace("'", ' ')
    sent = replace_urls_regex(sent)


    tokens = sent.split(' ')
    result = []
    for t in tokens:
        if t == ' ':
            continue
        if not hobj.spell(t):
            suggestions = hobj.suggest(t)
            if not suggestions:
                result.append(t)
            else:
                if suggestions[0] == 'e':
                    result.append(t)
                    continue
                result.append(suggestions[0])
                print(f'{t} -> {suggestions[0]}')
        else:
            result.append(t)
    return ' '.join(result)
        
#spellcheck_sentence('This is a tset with a wong wod. Adn now anotheer one why does this notjn workd')

# Spell Checking

In [None]:
splits = ['train', 'validation', 'test']
path = os.path.join(os.getcwd(), 'data', 'data', 'organic2019')

for s in splits:
    print('Split: ' + str(s))
    fn = os.path.join(path, s + '.csv')
    df = pd.read_csv(fn, sep='|') #
    df['Sentence'] = df.apply(spellcheck_sentence, axis=1)
    
    fn = os.path.join(path, s + '_sp.csv')
    df.to_csv(fn, sep='|', index=False)

# Exploration

In [50]:
path = os.path.join(os.getcwd(), 'data', 'data', 'germeval2017')
header = ['url', 'text', 'relevance', 'document sentiment', 'aspect sentiments']
test2 = pd.read_csv(os.path.join(path, 'test_TIMESTAMP2.tsv'), sep='\t', header=None, names=header, index_col=False)
test1 = pd.read_csv(os.path.join(path, 'test_TIMESTAMP1.tsv'), sep='\t', header=None, names=header, index_col=False)
train = pd.read_csv(os.path.join(path, 'train_v1.4.tsv'), sep='\t', header=None, names=header, index_col=False)
val = pd.read_csv(os.path.join(path, 'dev_v1.4.tsv'), sep='\t', header=None, names=header, index_col=False)

test1['split'] = test1.apply(lambda r: 'test1', axis=1)
test2['split'] = test2.apply(lambda r: 'test2', axis=1)
train['split'] = train.apply(lambda r: 'train', axis=1)
val['split'] = val.apply(lambda r: 'val', axis=1)


complete = test2.append(test1).append(train).append(val)
complete              

Unnamed: 0,url,text,relevance,document sentiment,aspect sentiments,split
0,http://twitter.com/\_Kloetzchen\_/statuses/798...,"Wenn die Bahn so voll ist, dass man lieber noc...",True,negative,Auslastung_und_Platzangebot:negative Auslastun...,test2
1,http://kinderglueck-der-familienblog.de/reisen...,Familienurlaub im Kinderhotel in Österreich • ...,False,neutral,,test2
2,http://news-koblenz.de,14.11.2016 – 13:53 Berlin-Mitte (ots) - Anläss...,True,negative,Sonstige_Unregelmässigkeiten:negative Sonstige...,test2
3,http://www.galopponline.de/news/galopp-news/gr...,"02 - Allgemeines Forum Diskussionen, Fragen un...",True,neutral,Allgemein:neutral,test2
4,http://twitter.com/Zischke/statuses/7980841919...,Menschen die ihre reservierten Plätze in der B...,True,negative,Atmosphäre:negative,test2
5,http://www.gutefrage.net/frage/wie-viel-kostet...,Wie viel kostet einen Monatskarte bei der deut...,True,neutral,Allgemein:neutral,test2
6,https://www.kurzurlaub.de/angebote/123646-oste...,Hallo! Ich habe mir eine eBahncard in die Navi...,True,negative,Ticketkauf:negative DB_App_und_Website:neutral...,test2
7,http://www.radiokoeln.de/koeln/rk/1245912/nrw-...,Wenn man so etwas bewußt also absichtlich und ...,True,negative,Allgemein:negative Ticketkauf:negative,test2
8,http://twitter.com/placetogo/statuses/79829362...,S-Bahn 23:30 HH: einmal Maruscha genannt werde...,True,neutral,Allgemein:neutral,test2
9,http://twitter.com/SputnikMagazin/statuses/798...,Er wollte schlichten: Mann in S-Bahn brutal ve...,True,negative,Atmosphäre:neutral Sicherheit:negative,test2


In [37]:
test2.shape

(1842, 5)

### Split sentiments

In [56]:
complete_split = pd.DataFrame()

for _, r in tqdm(complete.iterrows(), total=complete.shape[0]):
    asp_sent = r['aspect sentiments']
    if pd.isnull(asp_sent) or asp_sent == '':
        complete_split = complete_split.append(r)
        continue
    asp_sent = asp_sent.split(" ")
    #print(asp_sent)
    aspects = []
    for as_pair in asp_sent:
        if as_pair == '' or len(as_pair.split(':')) != 2:
            continue

        as_pair = as_pair.split(':')
        aspect = as_pair[0]
        
        if len(aspect.split('#')) > 0:
            aspect = aspect.split('#')[0]
        
        # prevent duplicates
        if aspect in aspects:
            continue

        aspects.append(aspect)
        
        #print('\t' + str(as_pair))
        sentiment = as_pair[1]
        rd = r.to_dict()
        rd['specific_sentiment'] = sentiment
        rd['specific_aspect'] = aspect
        rd['asp_sent'] = f'{aspect}-{sentiment}'
        complete_split = complete_split.append(rd, ignore_index=True)

    
complete_split

A Jupyter Widget

Unnamed: 0,asp_sent,aspect sentiments,document sentiment,relevance,specific_aspect,specific_sentiment,split,text,url
0,Auslastung_und_Platzangebot-negative,Auslastung_und_Platzangebot:negative Auslastun...,negative,1.0,Auslastung_und_Platzangebot,negative,test2,"Wenn die Bahn so voll ist, dass man lieber noc...",http://twitter.com/\_Kloetzchen\_/statuses/798...
1,,,neutral,0.0,,,test2,Familienurlaub im Kinderhotel in Österreich • ...,http://kinderglueck-der-familienblog.de/reisen...
2,Sonstige_Unregelmässigkeiten-negative,Sonstige_Unregelmässigkeiten:negative Sonstige...,negative,1.0,Sonstige_Unregelmässigkeiten,negative,test2,14.11.2016 – 13:53 Berlin-Mitte (ots) - Anläss...,http://news-koblenz.de
3,Allgemein-neutral,Allgemein:neutral,neutral,1.0,Allgemein,neutral,test2,"02 - Allgemeines Forum Diskussionen, Fragen un...",http://www.galopponline.de/news/galopp-news/gr...
4,Atmosphäre-negative,Atmosphäre:negative,negative,1.0,Atmosphäre,negative,test2,Menschen die ihre reservierten Plätze in der B...,http://twitter.com/Zischke/statuses/7980841919...
5,Allgemein-neutral,Allgemein:neutral,neutral,1.0,Allgemein,neutral,test2,Wie viel kostet einen Monatskarte bei der deut...,http://www.gutefrage.net/frage/wie-viel-kostet...
6,Ticketkauf-negative,Ticketkauf:negative DB_App_und_Website:neutral...,negative,1.0,Ticketkauf,negative,test2,Hallo! Ich habe mir eine eBahncard in die Navi...,https://www.kurzurlaub.de/angebote/123646-oste...
7,DB_App_und_Website-neutral,Ticketkauf:negative DB_App_und_Website:neutral...,negative,1.0,DB_App_und_Website,neutral,test2,Hallo! Ich habe mir eine eBahncard in die Navi...,https://www.kurzurlaub.de/angebote/123646-oste...
8,Allgemein-negative,Allgemein:negative Ticketkauf:negative,negative,1.0,Allgemein,negative,test2,Wenn man so etwas bewußt also absichtlich und ...,http://www.radiokoeln.de/koeln/rk/1245912/nrw-...
9,Ticketkauf-negative,Allgemein:negative Ticketkauf:negative,negative,1.0,Ticketkauf,negative,test2,Wenn man so etwas bewußt also absichtlich und ...,http://www.radiokoeln.de/koeln/rk/1245912/nrw-...


In [57]:
complete_split.to_pickle(os.path.join(path, 'merge.pkl'))

In [106]:
train_split = complete_split['split'] == 'train'
val_split = complete_split['split'] == 'val'
test1_split = complete_split['split'] == 'test1'
test2_split = complete_split['split'] == 'test2'

t2_agg = complete_split[test2_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'diachronic test'})
t1_agg = complete_split[test1_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'synchronic test'})
tr_agg = complete_split[train_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'train'})
vl_agg = complete_split[val_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'validation'})


cnt_agg = t1_agg.merge(t2_agg, on=['specific_aspect', 'specific_sentiment']).merge(tr_agg, on=['specific_aspect', 'specific_sentiment']).merge(vl_agg, on=['specific_aspect', 'specific_sentiment'])


In [None]:
cnt_agg = cnt_agg.set_index(['specific_aspect', 'specific_sentiment'])
cnt_agg[['train', 'validation', 'synchronic test', 'diachronic test']]

In [129]:
t2_sum = t2_agg[['diachronic test', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
t1_sum = t1_agg[['synchronic test', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
tr_sum = tr_agg[['train', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
va_sum = vl_agg[['validation', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
va_sum.sort_values(by='specific_aspect')

Unnamed: 0,specific_aspect,validation
0,Allgemein,1475
1,Atmosphäre,139
2,Auslastung_und_Platzangebot,33
3,Barrierefreiheit,17
4,Connectivity,23
5,DB_App_und_Website,23
6,Design,4
7,Gastronomisches_Angebot,4
8,Gepäck,3
9,Image,7


In [124]:
df_sum = t1_sum.merge(t2_sum, on=['specific_aspect']).merge(tr_sum, on=['specific_aspect']).merge(va_sum, on=['specific_aspect'])
df_sum

Unnamed: 0,specific_aspect,synchronic test,diachronic test,train,validation
0,Allgemein,1398,1024,12138,1475
1,Atmosphäre,148,53,1046,139
2,Auslastung_und_Platzangebot,35,20,251,33
3,Barrierefreiheit,9,2,64,17
4,Connectivity,36,73,257,23
5,DB_App_und_Website,28,18,185,23
6,Design,4,2,31,4
7,Gastronomisches_Angebot,3,3,44,4
8,Gepäck,2,6,18,3
9,Informationen,58,35,330,34
