In [1]:
import pandas as pd
import os
import gzip
from tqdm.autonotebook import tqdm



In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
max_sentiment = 20000
def process_df(df, aspect):
    df = df.drop(['reviewerID', 'asin', 'reviewerName', 'unixReviewTime', 'reviewTime'], axis=1)
        
    # add sentiment col
    df['sentiment'] = df.apply(process_sentiment, axis=1)
    
    # balance dataset 
    df = balance_dataset(df)
    
    # add aspect col
    df['aspect'] = df.apply(lambda r: aspect, axis=1)
    return df
    
def process_sentiment(row):
    if row['overall'] >= 4.0:
        return 'positive'
    if row['overall'] <= 2.0:
        return 'negative'
    return 'neutral'


def balance_dataset(df):
    pos_series = df['sentiment']=='positive'
    neu_series = df['sentiment']=='neutral'
    neg_series = df['sentiment']=='negative'
    
    pos_diff = df[pos_series]['sentiment'].count() - max_sentiment
    neu_diff = df[neu_series]['sentiment'].count() - max_sentiment
    neg_diff = df[neg_series]['sentiment'].count() - max_sentiment
    
    diffs = [pos_diff, neu_diff, neg_diff]
    print(diffs)

    
    # if the sum of all divs is negative, we can not balance the dataset
    if sum(diffs) < 0:
        print('No balancing possible')
        return df
    
    pos_sel = None
    neg_sel = None
    neu_sel = None
    
    remaining = pd.DataFrame()
    
    # pos has enough 
    if pos_diff >= 0:
        pos_sel = df[pos_series][:max_sentiment]
        remaining = remaining.append(df[pos_series][max_sentiment:], ignore_index=True)
        print(f'Pos enough: Remaining Count: {remaining["overall"].count()}')
    
    if neu_diff >= 0:
        neu_sel = df[neu_series][:max_sentiment]
        remaining = remaining.append(df[neu_series][max_sentiment:], ignore_index=True)
        print(f'Neutral enough: Remaining Count: {remaining["overall"].count()}')

        
    if neg_diff >= 0:
        neg_sel = df[neg_series][:max_sentiment]
        remaining = remaining.append(df[neg_series][max_sentiment:], ignore_index=True)
        print(f'Neg enough: Remaining Count: {remaining["overall"].count()}')

    
    # pos. has not enough
    cur_remaining_idx = 0
    if pos_diff < 0:
        pos_sel = df[pos_series]
        pos_sel = pos_sel.append(remaining[cur_remaining_idx:(-pos_diff)+cur_remaining_idx])
        cur_remaining_idx += -pos_diff
        print(f'Fill positive - Pos Selection: {pos_sel["overall"].count()} - Remainin Idx: {cur_remaining_idx}')
        
    if neu_diff < 0:
        neu_sel = df[neu_series]
        neu_sel = neu_sel.append(remaining[cur_remaining_idx:(-neu_diff)+cur_remaining_idx])
        cur_remaining_idx += -neu_diff
        print(f'Fill Neutral - Neu Selection: {neu_sel["overall"].count()} - Remainin Idx: {cur_remaining_idx}')

        
    if neg_diff < 0:
        neg_sel = df[neg_series]
        neg_sel = neg_sel.append(remaining[cur_remaining_idx:(-neg_diff)+cur_remaining_idx])
        cur_remaining_idx += -neg_diff
        print(f'Fill Negative - Neg Selection: {neg_sel["overall"].count()} - Remainin Idx: {cur_remaining_idx}')

        
    df = pos_sel.append(neu_sel).append(neg_sel)
    return df

In [5]:
root_path = os.path.join(os.getcwd(), 'data', 'data', 'amazon')

aspects = [
    'Apps_for_Android',
    'Baby',
    'Beauty',
    'CDs_and_Vinyl',
    'Cell_Phones_and_Accessories',
    'Clothing_Shoes_and_Jewelry',
    'Digital_Music',
    'Electronics',
    'Grocery_and_Gourmet_Food',
    'Health_and_Personal_Care',
    'Home_and_Kitchen',
    'Kindle_Store',
    'Movies_and_TV',
    'Office_Products',
    'Pet_Supplies',
    'Sports_and_Outdoors',
    'Tools_and_Home_Improvement',
    'Toys_and_Games',
    'Video_Games',
    'Books'
]

In [7]:
df = None

for a in tqdm(aspects):
    fn = f'reviews_{a}_5.json.gz'
    path = os.path.join(root_path, fn)
    print('Parse ' + path)
    
    a_df = getDF(path)
    a_df = process_df(a_df, a)
    a_df.to_pickle(os.path.join(root_path, a + '_processed.pkl'))
    if df is None:
        df = a_df
    else:
        df = df.append(a_df)

A Jupyter Widget

Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_Apps_for_Android_5.json.gz
[524718, 65121, 103098]
Pos enough: Remaining Count: 524718
Neutral enough: Remaining Count: 589839
Neg enough: Remaining Count: 692937
Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_Baby_5.json.gz
[106525, -2745, -2988]
Pos enough: Remaining Count: 106525
Fill Neutral - Neu Selection: 20000 - Remainin Idx: 2745
Fill Negative - Neg Selection: 20000 - Remainin Idx: 5733
Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_Beauty_5.json.gz
[134272, 2248, 1982]
Pos enough: Remaining Count: 134272
Neutral enough: Remaining Count: 136520
Neg enough: Remaining Count: 138502
Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_CDs_and_Vinyl_5.json.gz
[883002, 81824, 72766]
Pos eno

In [8]:
df.to_csv(os.path.join(root_path, 'dataset_processed.csv'))
df.to_pickle(os.path.join(root_path, 'dataset_processed.pkl'))

In [9]:
df.groupby('aspect').count()

Unnamed: 0_level_0,helpful,reviewText,overall,summary,sentiment
aspect,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Apps_for_Android,60000,60000,60000,60000,60000
Baby,60000,60000,60000,60000,60000
Beauty,60000,60000,60000,60000,60000
Books,60000,60000,60000,60000,60000
CDs_and_Vinyl,60000,60000,60000,60000,60000
Cell_Phones_and_Accessories,60000,60000,60000,60000,60000
Clothing_Shoes_and_Jewelry,60000,60000,60000,60000,60000
Digital_Music,60000,60000,60000,60000,60000
Electronics,60000,60000,60000,60000,60000
Grocery_and_Gourmet_Food,60000,60000,60000,60000,60000


In [None]:
#df.sentiment.describe()

# Generate 80 - 20 - 10 Splits

In [3]:
path = os.path.join(os.getcwd(), 'data', 'data', 'amazon', 'raw')
df = pd.read_pickle(os.path.join(path, 'dataset_processed.pkl'))
df

Unnamed: 0,helpful,reviewText,overall,summary,sentiment,aspect
1,"[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,positive,Apps_for_Android
2,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,positive,Apps_for_Android
3,"[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,positive,Apps_for_Android
4,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,positive,Apps_for_Android
5,"[3, 3]",THis is just so cute and a great app for littl...,5.0,so cute,positive,Apps_for_Android
6,"[3, 4]",I watch my great grandson 4 days a week and it...,5.0,Terrific!,positive,Apps_for_Android
7,"[0, 0]",This app is wild and crazy. Little ones love ...,5.0,Five Little Monkeys,positive,Apps_for_Android
8,"[3, 4]",love love love this app. I was going through d...,5.0,love but to quite,positive,Apps_for_Android
9,"[0, 0]","Very cute, with alot of items to move about. ...",5.0,Cute,positive,Apps_for_Android
10,"[1, 2]",WELL THE CHILDREN LOVE IT AFTER AWHILE YOU GET...,4.0,MONKEYS,positive,Apps_for_Android


In [13]:
# we need an extra attribute to stratify on which is a combination of aspect-sentiment
df['aspectSentiment'] = df.apply(lambda r: f'{r["aspect"]}-{r["sentiment"]})', axis=1)
df

Unnamed: 0,helpful,reviewText,overall,summary,sentiment,aspect,aspectSentiment
1,"[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,positive,Apps_for_Android,Apps_for_Android-positive)
2,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,positive,Apps_for_Android,Apps_for_Android-positive)
3,"[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,positive,Apps_for_Android,Apps_for_Android-positive)
4,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,positive,Apps_for_Android,Apps_for_Android-positive)
5,"[3, 3]",THis is just so cute and a great app for littl...,5.0,so cute,positive,Apps_for_Android,Apps_for_Android-positive)
6,"[3, 4]",I watch my great grandson 4 days a week and it...,5.0,Terrific!,positive,Apps_for_Android,Apps_for_Android-positive)
7,"[0, 0]",This app is wild and crazy. Little ones love ...,5.0,Five Little Monkeys,positive,Apps_for_Android,Apps_for_Android-positive)
8,"[3, 4]",love love love this app. I was going through d...,5.0,love but to quite,positive,Apps_for_Android,Apps_for_Android-positive)
9,"[0, 0]","Very cute, with alot of items to move about. ...",5.0,Cute,positive,Apps_for_Android,Apps_for_Android-positive)
10,"[1, 2]",WELL THE CHILDREN LOVE IT AFTER AWHILE YOU GET...,4.0,MONKEYS,positive,Apps_for_Android,Apps_for_Android-positive)


In [14]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.1, random_state=42, stratify=df[['aspectSentiment']])
train, val = train_test_split(train, test_size=0.2, random_state=42, stratify=train[['aspectSentiment']])



In [15]:
val.count()['overall']/df.count()['overall']

0.18000046930336944

In [17]:
train.groupby('sentiment').count()

Unnamed: 0_level_0,helpful,reviewText,overall,summary,aspect,aspectSentiment
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
negative,242913,242913,242913,242913,242913,242913
neutral,251767,251767,251767,251767,251767,251767
positive,364465,364465,364465,364465,364465,364465


In [19]:
path = os.path.join(os.getcwd(), 'data', 'data', 'amazon')

split_path = os.path.join(path, 'splits')
train.to_pickle(os.path.join(split_path, 'train.pkl'))
test.to_pickle(os.path.join(split_path, 'test.pkl'))
val.to_pickle(os.path.join(split_path, 'val.pkl'))

train.to_csv(os.path.join(split_path, 'train.csv'))
test.to_csv(os.path.join(split_path, 'test.csv'))
val.to_csv(os.path.join(split_path, 'val.csv'))

# Preprocessing & Spell Checking

In [2]:
import hunspell
import re

In [3]:
def en_contraction_removal(text: str) -> str:
    apostrophe_handled = re.sub("’", "'", text)
    # from https://gist.githubusercontent.com/tthustla/74e99a00541264e93c3bee8b2b49e6d8/raw/599100471e8127d6efad446717dc951a10b69777/yatwapart1_01.py
    contraction_mapping = {
                    "i.e.": 'for example',
                    "e.g.": 'for example',
                    "youre": "you are",
                    "youll": "you will",
                    "theyre": "they are", "theyll": "they will",
                    "weve": "we have",
                    "shouldnt": "should not",
                    "dont": "do not",
                    "doesnt": "does not", "doesn": "does not",
                    "didnt": "did not",
                    "wasn": "was not",
                    "arent": "are not", "aren": "are not",
                    "aint": "is not", "isnt": "is not", "isn": "is not",
                    "wouldnt": "would not", "wouldn": "would not",
                    "ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" }
    expanded = ' '.join([contraction_mapping[t.lower()] if t.lower() in contraction_mapping else t for t in apostrophe_handled.split(" ")])
    return expanded

In [4]:
hobj = hunspell.HunSpell('/Library/Spelling/en_US.dic', '/Library/Spelling/en_US.aff')


known_words = ['wirelessly', 'hitman', 'Wal-Mart', 'noob', 'subwoofer', 'WTF', 'Waitrose', '<URL>', 'axe', 'TLDR', 'Coca~Cola', 'NPC', 'sci-fi', 'PS3', 'PSX', 'Clooney', 'Schumacher', 'PS2', 'XBOX']


for w in known_words:
    hobj.add(w)

In [5]:
url_regex = r'(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&\(\)\*\+,;=.]+'

def replace_urls_regex(sentence: str, url_token: str = '<URL>') -> str:
    return re.sub(url_regex, url_token, sentence)

def replace_urls(words, url_token: str = '<URL>'):
    return [url_token if (w.lower().startswith('www') or w.lower().startswith('http')) else w for w in words]

def spellcheck_sentence(row) -> str:
    sent = row['reviewText']
    #print(sent)
    to_remove = [',', '(', ')', ':', '?', '&', '/', '*', '!']
    for tr in to_remove:
        sent = sent.replace(tr, ' ')
        
    sent = sent.replace('€™', "'")
    sent = sent.replace('�', "'")
    sent = en_contraction_removal(sent)
    sent = sent.replace("'", ' ')
    sent = replace_urls_regex(sent)


    tokens = sent.split(' ')
    result = []
    for t in tokens:
        if t == ' ':
            continue
        if not hobj.spell(t):
            suggestions = hobj.suggest(t)
            if not suggestions:
                result.append(t)
            else:
                if suggestions[0] == 'e':
                    result.append(t)
                    continue
                result.append(suggestions[0])
                #print(f'{t} -> {suggestions[0]}')
        else:
            result.append(t)
    return ' '.join(result)
        
#spellcheck_sentence('This is a tset with a wong wod. Adn now anotheer one why does this notjn workd')

In [None]:
splits = ['train', 'val', 'test']
path = os.path.join(os.getcwd(), 'data', 'data', 'amazon')
tqdm.pandas()

for s in splits:
    print('Split: ' + str(s))
    fn = os.path.join(path, s + '.pkl')
    df = pd.read_pickle(fn)
    df['reviewText'] = df.progress_apply(spellcheck_sentence, axis=1)
    
    fn = os.path.join(path, s + '_sp.pkl')
    df.to_pkl(fn)
    df.to_csv(os.path.join(path, s + '_sp.csv'), sep='|', index=False)

Split: train


HBox(children=(IntProgress(value=0, max=859145), HTML(value='')))