# Imports

In [1]:
import gzip
import json
import csv
import re

# Constants

In [2]:
# Relative paths of the gzipped review files, one JSON object per line.
TRAIN = '../data/raw/music_reviews_train.json.gz'  # training split
DEV = '../data/raw/music_reviews_dev.json.gz'  # validation split
TEST = '../data/raw/music_reviews_test_masked.json.gz'  # test split

# Functions

In [3]:
# Union of the field names loader() has seen across every file read so far,
# seeded with 'image' and 'vote'. 'sentiment' is never added to this set.
review_keys = {'image', 'vote'}


def loader(PATH):
    """Read a gzipped JSON-lines review file into a list of flat dicts.

    Every returned dict starts from the same defaults, so these keys always
    exist: 'image' (0), 'reviewText', 'summary', 'style' ('<NULL>') and
    'vote' (0). Fields present in the record then overwrite the defaults:

    - 'image': set to 1 whenever the key is present, whatever its value.
    - 'sentiment': 'positive' -> 1, 'negative' -> 0; any other value is
      dropped, so the output dict then has no 'sentiment' key.
    - anything else: stored as str(value), and the field name is added to
      the module-level review_keys set.

    Args:
        PATH: path of the gzip-compressed file, one JSON object per line.

    Returns:
        A list with one dict per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    review_list = []
    # The context manager closes the gzip handle even if a line fails to parse.
    with gzip.open(PATH) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            review_data = json.loads(line)
            temp = {'image': 0,  # flipped to 1 below if the record has images
                    'reviewText': '<NULL>',
                    'summary': '<NULL>',
                    'style': '<NULL>',
                    'vote': 0}  # replaced by the record's vote value, if any
            for key, value in review_data.items():
                if key == 'image':
                    temp[key] = 1
                elif key == 'sentiment':
                    if value == 'positive':
                        temp[key] = 1
                    elif value == 'negative':
                        temp[key] = 0
                else:
                    review_keys.add(key)
                    temp[key] = str(value)
            review_list.append(temp)
    return review_list

#######################################################################

def set_making(data, test=False):
    """Split review records into model inputs X and sentiment labels y.

    Args:
        data: list of review dicts. Each must hold string values under
            'summary' and 'reviewText', plus a 'sentiment' entry unless
            ``test`` is truthy.
        test: when truthy, the records are treated as unlabelled and only
            X is built. Any falsy value (False, None, 0) means labelled.

    Returns:
        X alone when ``test`` is truthy, otherwise the tuple (X, y), where
        X is the list of "<summary> <reviewText>" strings of the music
        reviews and y is the list of their sentiment labels, both in the
        order of ``data``.

    Raises:
        KeyError: if a record lacks 'summary' or 'reviewText', or lacks
            'sentiment' while ``test`` is falsy.
    """
    X = [record['summary'] + ' ' + record['reviewText'] for record in data]
    # A single truthiness check decides both whether labels are collected and
    # what is returned, so the two can never disagree.
    if test:
        return X
    y = [record['sentiment'] for record in data]
    return X, y

# Load Data

In [4]:
# Load the splits in the order training, validation, test.
train_data, dev_data, test_data = (loader(path) for path in (TRAIN, DEV, TEST))

In [5]:
# Number of records in the training, validation and test splits, in that order.
len(train_data), len(dev_data), len(test_data)

(100000, 10000, 10000)

In [6]:
# Audit the loaded records: for every field name collected in review_keys,
# count how many records do not carry that field.
# missing maps field name -> number of records lacking it; count is the total.
missing = {}
count = 0

# Each split is scanned exactly once, the training split included. Iterating
# the splits one after another also avoids building a concatenated copy.
for split in (train_data, test_data, dev_data):
    for record in split:
        for field in review_keys:
            if field not in record:
                count += 1
                missing[field] = missing.get(field, 0) + 1
print('Done')
print(count)
print(missing)

Done
0
{}


In [7]:
# Count training reviews with neither text nor summary, i.e. records where
# both fields still hold the '<NULL>' placeholder.
count = sum(
    1
    for review in train_data
    if review['reviewText'] == '<NULL>' and review['summary'] == '<NULL>'
)

count

3

In [8]:
# Adding up review counts, sanity check: the four groups should sum to the
# total number of training reviews.
# Presumably r = reviewText and s = summary, so the terms are, in order:
# both present, text only, summary only, neither present.
# NOTE(review): the first three counts are hard-coded; the visible cells only
# compute the last one (3).
99918 + 51 + 28 + 3 # r&s, r&!s, s&!r, !r&!s

100000

In [9]:
# Percentage of reviews after removing those with missing reviewText and/or summary
# 99918 is the hard-coded count of reviews that have both fields. Dividing by
# 1000 is the same as 99918 / 100000 * 100, with 100000 the review total.
99918/1000

99.918

# Make datasets

In [10]:
# Training inputs ("<summary> <reviewText>" strings) and their sentiment labels.
X_train, y_train = set_making(train_data)

# Both lists should have the same length.
len(X_train), len(y_train)

(100000, 100000)

In [11]:
# Validation inputs ("<summary> <reviewText>" strings) and their sentiment labels.
X_dev, y_dev = set_making(dev_data)

# Both lists should have the same length.
len(X_dev), len(y_dev)

(10000, 10000)

In [12]:
# Test inputs only: with test = True, set_making returns X and reads no labels.
X_test = set_making(test_data, test = True)

# Number of test inputs.
len(X_test)

10000

# Save datasets to csv files

In [13]:
# newline='' is required by the csv module for files it writes: the writer
# emits its own line terminators, and newlines inside quoted review text must
# not be translated by the file object. The explicit UTF-8 encoding keeps
# non-ASCII review text writable regardless of the platform default.
with open('../data/interim/train.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # plain csv.writer: rows are written positionally, without a header row
    writer = csv.writer(csvfile)

    # one row per review: the text first, then its sentiment label
    writer.writerows(zip(X_train, y_train))

In [14]:
# newline='' is required by the csv module for files it writes: the writer
# emits its own line terminators, and newlines inside quoted review text must
# not be translated by the file object. The explicit UTF-8 encoding keeps
# non-ASCII review text writable regardless of the platform default.
with open('../data/interim/dev.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # plain csv.writer: rows are written positionally, without a header row
    writer = csv.writer(csvfile)

    # one row per review: the text first, then its sentiment label
    writer.writerows(zip(X_dev, y_dev))

In [15]:
# newline='' is required by the csv module for files it writes: the writer
# emits its own line terminators, and newlines inside quoted review text must
# not be translated by the file object. The explicit UTF-8 encoding keeps
# non-ASCII review text writable regardless of the platform default.
with open('../data/interim/test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # plain csv.writer: rows are written positionally, without a header row
    writer = csv.writer(csvfile)

    # one single-column row per review; each text is wrapped in a list so the
    # writer does not treat the string as a sequence of one-character fields
    writer.writerows([text] for text in X_test)