In [8]:
import pandas as pd
import os
import gzip
from tqdm.autonotebook import tqdm

In [9]:
def parse(path):
    """Lazily yield one record per line of a gzip-compressed file.

    Each line is evaluated as a Python literal expression (the files are
    expected to hold one dict per line) and the resulting object is yielded.

    Args:
        path: Location of the ``.gz`` file to read.

    Yields:
        The object obtained by evaluating each line of the file.
    """
    # The context manager closes the file handle when the generator is
    # exhausted, closed early, or garbage-collected; the bare gzip.open()
    # call used previously leaked the handle.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE: eval() executes arbitrary code contained in the
            # file. Only use this on trusted dumps; consider
            # ast.literal_eval / json.loads if the data format allows it.
            yield eval(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their running position in the file, so the
    resulting frame has one row per record and a 0-based integer index.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [10]:
# Per-sentiment-class row cap read by balance_dataset(): each of the three
# sentiment selections is sized to this many rows.
max_sentiment = 20000
def process_df(df, aspect):
    """Prepare one raw review frame for the combined dataset.

    Steps, in order:
      1. drop the reviewer/product/time metadata columns,
      2. derive a ``sentiment`` label per row via ``process_sentiment``,
      3. resize the frame with ``balance_dataset``,
      4. tag every row with ``aspect``.

    Args:
        df: Raw review frame; must contain the dropped metadata columns and
            an ``overall`` column.
        aspect: Value written into the new ``aspect`` column for every row.

    Returns:
        The processed DataFrame.
    """
    df = df.drop(
        columns=['reviewerID', 'asin', 'reviewerName', 'unixReviewTime', 'reviewTime']
    )

    # add sentiment col
    df['sentiment'] = df.apply(process_sentiment, axis=1)

    # balance dataset
    df = balance_dataset(df)

    # add aspect col: a scalar is broadcast to every row, which avoids one
    # Python-level lambda call per row and also works on an empty frame.
    df['aspect'] = aspect
    return df
    
def process_sentiment(row):
    """Translate a row's ``overall`` rating into a sentiment label.

    Ratings of 4.0 and above are 'positive', ratings of 2.0 and below are
    'negative', and everything else is 'neutral'.
    """
    rating = row['overall']
    if rating >= 4.0:
        label = 'positive'
    elif rating <= 2.0:
        label = 'negative'
    else:
        label = 'neutral'
    return label


def balance_dataset(df, max_per_class=None):
    """Resize ``df`` to exactly ``3 * max_per_class`` rows, class by class.

    The frame is split by its ``sentiment`` column into positive, neutral and
    negative rows. A class with at least ``max_per_class`` rows contributes
    its first ``max_per_class`` rows; the rest go into a shared surplus pool
    (in positive, neutral, negative order). A class with fewer rows keeps all
    of them and is topped up with consecutive rows from the surplus pool.
    Note that the filler rows keep their own ``sentiment`` value, so the
    result has a fixed size but is not necessarily balanced by label.

    If the frame has fewer than ``3 * max_per_class`` rows across the three
    classes, it is returned unchanged.

    Rows taken from their own class keep their original index labels; filler
    rows carry their position in the surplus pool, so labels may repeat.

    Args:
        df: Frame with at least the columns ``sentiment`` and ``overall``.
        max_per_class: Target size of each class selection. Defaults to the
            module-level ``max_sentiment``.

    Returns:
        The resized DataFrame (positive selection, then neutral, then
        negative), or ``df`` itself when balancing is not possible.
    """
    if max_per_class is None:
        max_per_class = max_sentiment

    # (sentiment label, prefix of the "enough" log line, prefix of the "fill" log line)
    classes = (
        ('positive', 'Pos', 'Fill positive - Pos'),
        ('neutral', 'Neutral', 'Fill Neutral - Neu'),
        ('negative', 'Neg', 'Fill Negative - Neg'),
    )

    groups = {label: df[df['sentiment'] == label] for label, _, _ in classes}
    diffs = [len(groups[label]) - max_per_class for label, _, _ in classes]
    print(diffs)

    # if the sum of all diffs is negative, we can not balance the dataset
    if sum(diffs) < 0:
        print('No balancing possible')
        return df

    selections = {}

    # Classes with enough rows: keep the head, pool the tail.
    surplus_parts = []
    remaining_count = 0
    for (label, enough_name, _), diff in zip(classes, diffs):
        if diff >= 0:
            group = groups[label]
            selections[label] = group.iloc[:max_per_class]
            tail = group.iloc[max_per_class:]
            surplus_parts.append(tail)
            remaining_count += tail['overall'].count()
            print(f'{enough_name} enough: Remaining Count: {remaining_count}')

    # sum(diffs) >= 0 guarantees at least one class had enough rows, so
    # surplus_parts is never empty here. DataFrame.append no longer exists
    # in pandas 2.x; pd.concat is the supported replacement.
    remaining = pd.concat(surplus_parts, ignore_index=True)

    # Classes with too few rows: take everything and top up from the pool.
    cur_remaining_idx = 0
    for (label, _, fill_name), diff in zip(classes, diffs):
        if diff < 0:
            missing = -diff
            filler = remaining.iloc[cur_remaining_idx:cur_remaining_idx + missing]
            selections[label] = pd.concat([groups[label], filler])
            cur_remaining_idx += missing
            filled_count = selections[label]['overall'].count()
            print(f'{fill_name} Selection: {filled_count} - Remainin Idx: {cur_remaining_idx}')

    return pd.concat([selections[label] for label, _, _ in classes])

In [11]:
# Directory holding the review dumps: <current working dir>/data/data/amazon.
# The processed dataset is written back into the same directory.
root_path = os.path.join(os.getcwd(), 'data', 'data', 'amazon')

# Category names; each one is substituted into the file name pattern
# 'reviews_<name>_5.json.gz' by the loading loop and is also stored as the
# row's `aspect` value.
aspects = [
    'Apps_for_Android',
    'Baby',
    'Beauty',
    'Books',
    'CDs_and_Vinyl',
    'Cell_Phones_and_Accessories',
    'Clothing_Shoes_and_Jewelry',
    'Digital_Music',
    'Electronics',
    'Grocery_and_Gourmet_Food',
    'Health_and_Personal_Care',
    'Home_and_Kitchen',
    'Kindle_Store',
    'Movies_and_TV',
    'Office_Products',
    'Pet_Supplies',
    'Sports_and_Outdoors',
    'Tools_and_Home_Improvement',
    'Toys_and_Games',
    'Video_Games'
]

In [None]:
# Load and process every aspect file, then stack the results into `df`.
# Frames are collected in a list and concatenated once: DataFrame.append no
# longer exists in pandas 2.x, and repeated appends re-copy all earlier rows.
df = None
aspect_frames = []

try:
    for a in tqdm(aspects):
        fn = f'reviews_{a}_5.json.gz'
        path = os.path.join(root_path, fn)
        print('Parse ' + path)

        a_df = getDF(path)
        a_df = process_df(a_df, a)
        aspect_frames.append(a_df)
finally:
    # Runs even if the loop is interrupted or a file fails, so `df` still
    # holds every aspect completed so far (as the incremental version did).
    if aspect_frames:
        df = pd.concat(aspect_frames)

A Jupyter Widget

Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_Apps_for_Android_5.json.gz
[524718, 65121, 103098]
Pos enough: Remaining Count: 524718
Neutral enough: Remaining Count: 589839
Neg enough: Remaining Count: 692937
Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_Baby_5.json.gz
[106525, -2745, -2988]
Pos enough: Remaining Count: 106525
Fill Neutral - Neu Selection: 20000 - Remainin Idx: 2745
Fill Negative - Neg Selection: 20000 - Remainin Idx: 5733
Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_Beauty_5.json.gz
[134272, 2248, 1982]
Pos enough: Remaining Count: 134272
Neutral enough: Remaining Count: 136520
Neg enough: Remaining Count: 138502
Parse C:\Users\felix\OneDrive\Studium\Studium\6. Semester\MA\Project\ABSA-Transformer\data\data\amazon\reviews_Books_5.json.gz


In [7]:
# Persist the combined frame next to the raw data, once as CSV and once as a pickle.
csv_path = os.path.join(root_path, 'dataset_processed.csv')
df.to_csv(csv_path)

pkl_path = os.path.join(root_path, 'dataset_processed.pkl')
df.to_pickle(pkl_path)

In [6]:
# Sanity check: non-null value count per column for each aspect.
df.groupby('aspect').count()

Unnamed: 0_level_0,helpful,reviewText,overall,summary,sentiment
aspect,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Apps_for_Android,60000,60000,60000,60000,60000
Baby,60000,60000,60000,60000,60000
Beauty,60000,60000,60000,60000,60000


In [None]:
#df.sentiment.describe()

# balance dataset

In [None]:
# Remove the `helpful` column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is
# already gone.
df = df.drop(columns='helpful', errors='ignore')

In [None]:
# Summary statistics for the rating and the two label columns;
# include='all' makes describe() cover the non-numeric columns as well.
df[['overall', 'sentiment', 'aspect']].describe(include='all')

In [None]:
# Non-null value count per column for each aspect.
df.groupby('aspect').count()

In [None]:


# NOTE(review): `neg_series` is only assigned inside balance_dataset() in the
# visible cells, so this line looks like leftover scratch that raises
# NameError on a fresh kernel - confirm and remove or define the mask here.
df[neg_series]['sentiment'].count()

In [None]:
# Scratch check of list slicing: concatenates everything before index 1 with
# everything from index 2 onwards, i.e. the list without its element at index 1.
# NOTE(review): this rebinds the name `a`, which the loading loop also uses
# as its loop variable.
a = [1, 2, 3]
a[:1]+a[2:]

In [None]:
# NOTE(review): `balance_sentiment` is not defined in the visible cells; the
# balancing helper defined in this notebook is `balance_dataset` - confirm
# which function was intended before running this cell.
df = balance_sentiment(df)
df.count()











In [None]:
# NOTE(review): `neg` is not defined in any visible cell; this looks like
# leftover scratch that depends on hidden kernel state - confirm or remove.
neg['overall'].count()