This notebook was used to:
* extract the data from the original music dataset
* create interim csv files of the data with only the desired columns

# Imports

In [None]:
import gzip
import json
import csv
import re

# Constants

In [None]:
# Gzipped JSON review files, one per data split, all under the raw-data folder.
_RAW_DIR = '../data/raw/'
TRAIN = _RAW_DIR + 'music_reviews_train.json.gz'
DEV = _RAW_DIR + 'music_reviews_dev.json.gz'
TEST = _RAW_DIR + 'music_reviews_test.json.gz'
#HARD_12 = '../data/raw/group12.json.gz'
#HARD = '../data/raw/phase2_testData-masked.json.gz'
#SEW = '../data/raw/Arts_Crafts_and_Sewing.json.gz'
#GAMES = '../data/raw/Video_Games.json.gz'

# Functions

In [None]:
# Every key seen across the loaded reviews. Seeded with 'image' and 'vote'
# and extended in place by loader() as new keys are encountered.
review_keys = {'image', 'vote'}


def loader(PATH):
    """Read a gzipped JSON-lines review file into a list of dictionaries.

    Each non-blank line of the file is parsed as one JSON object (one review).

    Args:
        PATH: path of the ``.json.gz`` file to read.

    Returns:
        A list with one dict per review. Every dict starts from these defaults:
        ``image`` = 0, ``vote`` = 0 and ``'<NULL>'`` for ``reviewText``,
        ``summary`` and ``style``. Keys present in the review then overwrite
        them:

        * ``image``     -> 1 (only presence is recorded, not the value)
        * ``sentiment`` -> 1 for ``'positive'``, 0 for ``'negative'``; any
          other value leaves ``sentiment`` unset
        * anything else -> ``str(value)``

    Side effects:
        Adds every key other than ``image`` / ``sentiment`` to the
        module-level ``review_keys`` set.
    """
    review_list = []
    # The context manager closes the gzip handle even if parsing fails.
    with gzip.open(PATH) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            review_data = json.loads(line)
            temp = {'image': 0,  # image flag, set to 1 if an image is present
                    'reviewText': '<NULL>',
                    'summary': '<NULL>',
                    'style': '<NULL>',
                    'vote': 0}  # overwritten below if the review has votes
            for key, value in review_data.items():
                if key == 'image':
                    temp[key] = 1
                elif key == 'sentiment':
                    if value == 'positive':
                        temp[key] = 1
                    elif value == 'negative':
                        temp[key] = 0
                else:
                    review_keys.add(key)
                    temp[key] = str(value)
            review_list.append(temp)
    return review_list

#######################################################################

def set_making(data, test=False):
    """Split loaded music reviews into model inputs (X) and labels (y).

    Args:
        data: list of review dictionaries; each must hold string values
            under 'summary' and 'reviewText'.
        test: when true, no labels are collected and only X is returned.

    Returns:
        X when ``test`` is true, otherwise the tuple ``(X, y)`` where
        - X: list of "<summary> <reviewText>" strings, and
        - y: list of labels, taken from 'sentiment' when present, otherwise
          from 'overall' converted to an int.

        In labelled mode a review with neither 'sentiment' nor 'overall' is
        left out of both lists, so X[i] always corresponds to y[i].
    """
    X = []
    y = []

    for review in data:
        text = review['summary'] + ' ' + review['reviewText']
        if test:
            X.append(text)
            continue

        if 'sentiment' in review:
            label = review['sentiment']
        elif 'overall' in review:
            # 'overall' may be a string such as '5.0', so go through float
            label = int(float(review['overall']))
        else:
            # No label available: skip the review so that texts and labels
            # stay aligned when they are later zipped together.
            continue

        X.append(text)
        y.append(label)

    if test:
        return X
    return X, y


# Load Data

In [None]:
# Load the three splits in order: training, validation, test.
train_data, dev_data, test_data = [loader(path) for path in (TRAIN, DEV, TEST)]

In [None]:
#hard_12 = loader(HARD_12)   # Our Hard Cases
#hard = loader(HARD)   # Hard Cases

In [None]:
#sew = loader(SEW) # Sewing Reviews
#games = loader(GAMES) # Video Game reviews

In [None]:
# Number of reviews in the train / dev / test splits
tuple(len(split) for split in (train_data, dev_data, test_data))

In [None]:
# For each known key, tally how many training reviews do not contain it.
missing = {}

for review in train_data:
    for key in review_keys:
        if key in review:
            continue
        missing[key] = missing.get(key, 0) + 1

# Total number of (review, key) gaps across the whole training split.
count = sum(missing.values())
print('Done')
print(count)
print(missing)

In [None]:
# Count the test reviews that have neither review text nor a summary
count = sum(
    1
    for dp in test_data
    if dp['reviewText'] == '<NULL>' and dp['summary'] == '<NULL>'
)

count

In [None]:
# Adding up review counts, sanity check: the four category counts below
# sum to 100000.
# NOTE(review): presumably r = reviewText present and s = summary present,
# with the counts copied by hand from earlier runs — confirm.
99918 + 51 + 28 + 3 # r&s, r&!s, s&!r, !r&!s

In [None]:
# Percentage of reviews after removing those with missing reviewText and/or summary
# Dividing by 1000 is the same as 99918 / 100000 * 100, i.e. a percentage
# of a 100000-review total.
# NOTE(review): both numbers are hard-coded — confirm they match the data.
99918/1000

# Make datasets

In [None]:
# Build training texts and labels, then show the length of each list.
(X_train, y_train) = set_making(train_data, test=False)

tuple(map(len, (X_train, y_train)))

In [None]:
# Build validation texts and labels, then show the length of each list.
(X_dev, y_dev) = set_making(dev_data, test=False)

tuple(map(len, (X_dev, y_dev)))

In [None]:
# Build test texts and labels, then show the length of each list.
(X_test, y_test) = set_making(test_data, test=False)

tuple(map(len, (X_test, y_test)))

In [None]:
#X_hard_12 = set_making(hard_12, test = True)
#len(X_hard_12)

In [None]:
#X_hard = set_making(hard, test = True)
#len(X_hard)

In [None]:
'''X_sew, y_sew = set_making(sew)
len(X_sew)''' ;

In [None]:
'''X_games, y_games = set_making(games)
len(X_games)'''; 

# Save datasets to csv files

In [None]:
# newline='' lets the csv module control line endings (no blank rows on
# Windows, and newlines embedded in review text stay inside their quoted
# field); the explicit encoding makes the output independent of the
# platform default.
with open('../data/interim/music_train.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # plain csv writer: no header row is written
    writer = csv.writer(csvfile)

    # one row per review: (text, label)
    writer.writerows(zip(X_train, y_train))

In [None]:
# newline='' lets the csv module control line endings (no blank rows on
# Windows, and newlines embedded in review text stay inside their quoted
# field); the explicit encoding makes the output independent of the
# platform default.
with open('../data/interim/music_dev.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # plain csv writer: no header row is written
    writer = csv.writer(csvfile)

    # one row per review: (text, label)
    writer.writerows(zip(X_dev, y_dev))

In [None]:
# newline='' lets the csv module control line endings (no blank rows on
# Windows, and newlines embedded in review text stay inside their quoted
# field); the explicit encoding makes the output independent of the
# platform default.
with open('../data/interim/music_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # plain csv writer: no header row is written
    writer = csv.writer(csvfile)

    # one row per review: (text, label)
    writer.writerows(zip(X_test, y_test))

In [None]:
'''with open('../data/interim/hard_12.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
    
    # writing data rows 
    writer.writerows(([x] for x in X_hard_12)) ''';

In [None]:
'''with open('../data/interim/hard.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
    
    # writing data rows 
    writer.writerows(([x] for x in X_hard)) ''';

In [None]:
'''with open('../data/interim/sew.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
        
    # writing data rows 
    writer.writerows((x for x in zip(X_sew, y_sew))) ''' ;

In [None]:
'''with open('../data/interim/games.csv', 'w') as csvfile: 
    # creating a csv dict writer object 
    writer = csv.writer(csvfile) 
        
    # writing data rows 
    writer.writerows((x for x in zip(X_games, y_games))) ''';