In [1]:
import pandas as pd
import os
import gzip
from tqdm.autonotebook import tqdm
import re
tqdm.pandas()



In [2]:
import hunspell
hobj = hunspell.HunSpell('/Library/Spelling/de_DE.dic', '/Library/Spelling/de_DE.aff')
known_words = ['@DB_Bahn', 'ÖPNV', 'Hashtag', 'GDL', 'Hbf', 'Fahrplanwechsel', 'co2', 'Waitrose', '<URL>', 'certifier', 'TLDR', 'Coca~Cola', 'Quora', 'sci-fi']

for w in known_words:
    hobj.add(w)
    
# also add the english one
hobj_en = hunspell.HunSpell('/Library/Spelling/en_US.dic', '/Library/Spelling/en_US.aff')

In [9]:
url_regex = r'(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&\(\)\*\+,;=.]+'

def replace_urls_regex(sentence: str, url_token: str = '<URL>') -> str:
    return re.sub(url_regex, url_token, sentence)

def replace_urls(words, url_token: str = '<URL>'):
    return [url_token if (w.lower().startswith('www') or w.lower().startswith('http')) else w for w in words]


def clean_text_without_sp(row) -> str:
    sent = ''
    try:
        sent = row['text']
        sent = replace_urls_regex(sent)    

        #print(sent)
        to_remove = [',', '(', ')', ':', '?', '&', '/', '*', '!', ';', '"', '.', '+']
        for tr in to_remove:
            sent = sent.replace(tr, ' ')

        sent = sent.replace('€™', "'")
        sent = sent.replace('�', "'")
        sent = en_contraction_removal(sent)
        sent = sent.replace("'", ' ')
    except Exception as err:
        print('Could not clean sentence: ' + str(err))
        
        try:
            sent = row['text']
            return sent
        except Exception as err:
            print('Could not get sentence from row. Returning empty sentence: ' + str(err))
            return ''
    return sent

def clean_text(row) -> str:
    sent = ''
    try:
        sent = spellcheck_sentence(row)
    except Exception as err:
        print('Could not spellcheck sentence: ' + str(err))
        
        try:
            sent = row['text']
            return sent
        except Exception as err:
            print('Could not get sentence from row. Returning empty sentence: ' + str(err))
            return ''
    return sent
        

def spellcheck_sentence(row) -> str:
    sent = row['text']
    sent = replace_urls_regex(sent)


    #print(sent)
    to_remove = [',', '(', ')', ':', '?', '&', '/', '*', '!']
    for tr in to_remove:
        sent = sent.replace(tr, ' ')
        
    sent = sent.replace('€™', "'")
    sent = sent.replace('�', "'")
    sent = sent.replace("'", ' ')

    tokens = sent.split(' ')
    result = []
    for t in tokens:
        if t == ' ':
            continue
            
        correct = True
        try:
            correct = hobj.spell(t) or hobj_en.spell(t)

        except Exception as err:
            print('Could not get spell checking for token ' + str(t))
            # do not keep token in case of error
            continue
            
        if not correct:
            suggestions = hobj.suggest(t)
            if not suggestions:
                result.append(t)
            else:
                if suggestions[0] == 'e':
                    result.append(t)
                    continue
                result.append(suggestions[0])
                #print(f'{t} -> {suggestions[0]}')
        else:
            result.append(t)
    return ' '.join(result)


        
spellcheck_sentence({'text': 'Das ist ein TExt, mit einer url https://github.com/wooorm/dictionaries'})

'Das ist ein Text  mit einer URL <URL>'

# Spell Checking

In [10]:
path = os.path.join(os.getcwd(), 'data', 'data', 'germeval2017')
header = ['url', 'text', 'relevance', 'document sentiment', 'aspect sentiments']

In [11]:
splits = ['dev_v1.4', 'train_v1.4', 'test_TIMESTAMP1', 'test_TIMESTAMP2']
path = os.path.join(os.getcwd(), 'data', 'data', 'germeval2017')

for s in splits:
    print('Split: ' + str(s))
    fn = os.path.join(path, s + '.tsv')
    df = pd.read_csv(fn, sep='\t', header=None, names=header, index_col=False) #
    df['text'] = df.progress_apply(clean_text, axis=1)
    
    fn = os.path.join(path, s + '_sp.csv')
    df.to_csv(fn, sep='\t', index=False, header=False)

Split: dev_v1.4


HBox(children=(IntProgress(value=0, max=2584), HTML(value='')))

Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Split: train_v1.4


HBox(children=(IntProgress(value=0, max=20941), HTML(value='')))

Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string o

HBox(children=(IntProgress(value=0, max=2566), HTML(value='')))

Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Could not spellcheck sentence: expected string or bytes-like object
Split: test_TIMESTAMP2


HBox(children=(IntProgress(value=0, max=1842), HTML(value='')))

# Exploration

In [None]:

test2 = pd.read_csv(os.path.join(path, 'test_TIMESTAMP2.tsv'), sep='\t', header=None, names=header, index_col=False)
test1 = pd.read_csv(os.path.join(path, 'test_TIMESTAMP1.tsv'), sep='\t', header=None, names=header, index_col=False)
train = pd.read_csv(os.path.join(path, 'train_v1.4.tsv'), sep='\t', header=None, names=header, index_col=False)
val = pd.read_csv(os.path.join(path, 'dev_v1.4.tsv'), sep='\t', header=None, names=header, index_col=False)

test1['split'] = test1.apply(lambda r: 'test1', axis=1)
test2['split'] = test2.apply(lambda r: 'test2', axis=1)
train['split'] = train.apply(lambda r: 'train', axis=1)
val['split'] = val.apply(lambda r: 'val', axis=1)


complete = test2.append(test1).append(train).append(val)
complete              

In [None]:
test2.shape

### Split sentiments

In [None]:
complete_split = pd.DataFrame()

for _, r in tqdm(complete.iterrows(), total=complete.shape[0]):
    asp_sent = r['aspect sentiments']
    if pd.isnull(asp_sent) or asp_sent == '':
        complete_split = complete_split.append(r)
        continue
    asp_sent = asp_sent.split(" ")
    #print(asp_sent)
    aspects = []
    for as_pair in asp_sent:
        if as_pair == '' or len(as_pair.split(':')) != 2:
            continue

        as_pair = as_pair.split(':')
        aspect = as_pair[0]
        
        if len(aspect.split('#')) > 0:
            aspect = aspect.split('#')[0]
        
        # prevent duplicates
        if aspect in aspects:
            continue

        aspects.append(aspect)
        
        #print('\t' + str(as_pair))
        sentiment = as_pair[1]
        rd = r.to_dict()
        rd['specific_sentiment'] = sentiment
        rd['specific_aspect'] = aspect
        rd['asp_sent'] = f'{aspect}-{sentiment}'
        complete_split = complete_split.append(rd, ignore_index=True)

    
complete_split

In [None]:
complete_split.to_pickle(os.path.join(path, 'merge.pkl'))

In [None]:
train_split = complete_split['split'] == 'train'
val_split = complete_split['split'] == 'val'
test1_split = complete_split['split'] == 'test1'
test2_split = complete_split['split'] == 'test2'

t2_agg = complete_split[test2_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'diachronic test'})
t1_agg = complete_split[test1_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'synchronic test'})
tr_agg = complete_split[train_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'train'})
vl_agg = complete_split[val_split].groupby(['specific_aspect', 'specific_sentiment'], as_index=False).count().rename(columns={'asp_sent':'validation'})


cnt_agg = t1_agg.merge(t2_agg, on=['specific_aspect', 'specific_sentiment']).merge(tr_agg, on=['specific_aspect', 'specific_sentiment']).merge(vl_agg, on=['specific_aspect', 'specific_sentiment'])


In [None]:
cnt_agg = cnt_agg.set_index(['specific_aspect', 'specific_sentiment'])
cnt_agg[['train', 'validation', 'synchronic test', 'diachronic test']]

In [None]:
t2_sum = t2_agg[['diachronic test', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
t1_sum = t1_agg[['synchronic test', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
tr_sum = tr_agg[['train', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
va_sum = vl_agg[['validation', 'specific_aspect']].groupby('specific_aspect', as_index=False).sum()
va_sum.sort_values(by='specific_aspect')

In [None]:
df_sum = t1_sum.merge(t2_sum, on=['specific_aspect']).merge(tr_sum, on=['specific_aspect']).merge(va_sum, on=['specific_aspect'])
df_sum