# Imports

In [1]:
import gzip
import json
import csv
import re

# Constants

In [2]:
TRAIN = '../data/raw/music_reviews_train.json.gz'
DEV   = '../data/raw/music_reviews_dev.json.gz'
TEST  = '../data/raw/music_reviews_test_masked.json.gz'
HARD_12 = '../data/raw/group12.json.gz'
HARD = '../data/raw/phase2_testData-masked.json.gz'
SEW = '../data/raw/Arts_Crafts_and_Sewing.json.gz'
GAMES = '../data/raw/Video_Games.json.gz'

# Functions

In [3]:
review_keys = set(['image', 'vote'])

def loader(PATH):
    review_list = []
    for line in gzip.open(PATH):
        review_data = json.loads(line)
        temp = {'image': 0, # set image binary to 0
                'reviewText' : '<NULL>',
                'summary' : '<NULL>',
                'style' : '<NULL>',
                'vote' : 0} # set votes to zero, will be overwritten if there are upvotes
        for key in review_data:
            if key == 'image':
                temp[key] = 1 # if there is an image present, set the binary to 1
            else:
                if key == 'sentiment':
                    if review_data[key] == 'positive':
                        temp[key] = 1
                    elif review_data[key] == 'negative': 
                        temp[key] = 0                    
                else:
                    review_keys.update([key])
                    temp[key] =  str(review_data[key])
        review_list.append(temp)
    return review_list

#######################################################################

def set_making(data, test = False):
    """Function to separate data into X and y.
    Input: 
    - A list of dictionaries of music reviews. 
    - Each item in the list is a dictionary for an individual review.
    Output:
    - X: a list of concatenated summary and text of movie reviews.
    - y: a list of corresponding sentiment labels.
    """
    X = []
    y = []
    
    for i in data:
        X.append(i['summary'] + ' ' +i['reviewText'])
        if test == False and 'sentiment' in i:
            y.append(i['sentiment'])
        elif test == False and 'overall' in i:
            rating = float(i['overall'])
            y.append(int(rating))
    
    if test:
        return X
    else: return X, y


# Load Data

In [4]:
train_data = loader(TRAIN) # Training
dev_data = loader(DEV)     # Validation
test_data = loader(TEST)   # Test

In [5]:
hard_12 = loader(HARD_12)   # Our Hard Cases
hard = loader(HARD)   # Hard Cases

In [6]:
sew = loader(SEW) # Sewing Reviews
games = loader(GAMES) # Video Game reviews

In [7]:
len(train_data), len(dev_data), len(test_data), len(hard_12), len(hard), len(sew), len(games)

(100000, 10000, 10000, 700, 3591, 494485, 497577)

In [8]:
missing = {}
count = 0

for i in test_data + test_data + dev_data:
    temp = i.keys()
    for r in review_keys:
        if r not in temp:
            count += 1
            if r in missing.keys():
                missing[r] += 1
            else: missing[r] = 1
print('Done')
print(count)
print(missing)

Done
240000
{'overall': 30000, 'group': 30000, 'catagory': 30000, 'Unnamed: 0': 30000, 'level_0': 30000, 'reviewerName': 30000, 'index': 30000, 'category': 30000}


In [9]:
# Checking for missing text or summary
count = 0
for dp in train_data:
    if dp['reviewText'] == '<NULL>' and dp['summary'] == '<NULL>':
        #print(dp)
        count += 1
        
(count)

3

In [10]:
# Adding up review counts, sanity check
99918 + 51 + 28 + 3 # r&s, r&!s, s&!r, !r&!s

100000

In [11]:
# Percentage of reviews after removing those with missing reviewText and/or summary
99918/1000

99.918

In [12]:
games[400:500]

[{'image': 0,
  'reviewText': 'Its a charger. Not name brand. But it worked. Would buy again if I needed to. Need four more words.',
  'summary': 'It worked',
  'style': '<NULL>',
  'vote': 0,
  'overall': '5.0',
  'verified': 'True',
  'reviewTime': '01 25, 2013',
  'reviewerID': 'ALJO1MOF8TMNO',
  'asin': '9882106463',
  'reviewerName': 'Kris',
  'unixReviewTime': '1359072000'},
 {'image': 0,
  'reviewText': 'I bought an old game boy micro off the internet and ordered this charger separate. I worried it would not work, but it fully charged the unit within an hour. Great product!',
  'summary': 'Well made',
  'style': '<NULL>',
  'vote': 0,
  'overall': '5.0',
  'verified': 'True',
  'reviewTime': '01 12, 2013',
  'reviewerID': 'A32DO6STACZ65I',
  'asin': '9882106463',
  'reviewerName': 'lanceandjulie',
  'unixReviewTime': '1357948800'},
 {'image': 0,
  'reviewText': "This Is Eaiser Than the first one, but it's pretty fun i liked it. You get to play some original characters like, Bad 

In [13]:
int(float(sew[0]['overall']))

4

# Make datasets

In [14]:
X_train, y_train = set_making(train_data)

len(X_train), len(y_train)

(100000, 100000)

In [15]:
X_dev, y_dev = set_making(dev_data)

len(X_dev), len(y_dev)

(10000, 10000)

In [16]:
X_test = set_making(test_data, test = True)

len(X_test)

10000

In [17]:
X_hard_12 = set_making(hard_12, test = True)
len(X_hard_12)

700

In [18]:
X_hard = set_making(hard, test = True)
len(X_hard)

3591

In [19]:
X_sew, y_sew = set_making(sew)
len(X_sew)

494485

In [20]:
X_games, y_games = set_making(games)
len(X_games)

497577

# Save datasets to csv files

In [13]:
with open('../data/interim/train.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
        
    # writing data rows 
    writer.writerows((x for x in zip(X_train, y_train))) 

In [14]:
with open('../data/interim/dev.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 

    # writing data rows 
    writer.writerows((x for x in zip(X_dev, y_dev))) 

In [15]:
with open('../data/interim/test.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
    
    # writing data rows 
    writer.writerows(([x] for x in X_test)) 

In [48]:
with open('../data/interim/hard_12.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
    
    # writing data rows 
    writer.writerows(([x] for x in X_hard_12)) 

In [49]:
with open('../data/interim/hard.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
    
    # writing data rows 
    writer.writerows(([x] for x in X_hard)) 

In [57]:
with open('../data/interim/sew.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
        
    # writing data rows 
    writer.writerows((x for x in zip(X_sew, y_sew))) 

In [58]:
with open('../data/interim/games.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
        
    # writing data rows 
    writer.writerows((x for x in zip(X_games, y_games))) 