In [1]:
import gzip
import json
import pandas as pd
from random import sample

In [2]:
# Both input archives live in the same folder, so the folder is spelled out once.
DATA_DIR = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data"
# Gzip-compressed review records.
reviewpath = DATA_DIR + "\\Software_5.json.gz"
# Gzip-compressed product metadata records.
metapath = DATA_DIR + "\\meta_Software.json.gz"

In [3]:
def read_review_data(path, cleaning_function):
    """Load a gzip-compressed, newline-delimited JSON file as a list of cleaned records.

    The file is decoded as UTF-8 and consumed one line at a time, so the full
    decompressed payload is never held in memory at once.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file; each line is parsed as one JSON document.
    cleaning_function : callable
        Called on every successfully parsed record; its return value is what
        is stored in the result.

    Returns
    -------
    list
        ``cleaning_function(record)`` for each kept line, in file order.
        Lines that are not valid JSON (blank lines included) and lines whose
        JSON value is ``null`` are skipped.
    """
    def convert_json(json_string):
        # Best-effort parse: a malformed line yields None and is skipped below
        # instead of aborting the whole load.
        try:
            return json.loads(json_string)
        except ValueError:
            return None

    data = []
    # newline='\n' keeps "\n" as the only record separator, so a stray "\r"
    # inside a line is left for the JSON parser rather than splitting the line.
    with gzip.open(path, 'rt', encoding='utf-8', newline='\n') as fin:
        for line in fin:
            item = convert_json(line)
            if item is not None:
                data.append(cleaning_function(item))
    return data
def clean_meta_data_json(json_dict):
    """Reduce a metadata record to its ``brand`` and ``asin`` entries.

    Both keys are required: a record lacking either one raises ``KeyError``.
    The returned dict is new; the input record is not modified.
    """
    kept_keys = ("brand", "asin")
    return {key: json_dict[key] for key in kept_keys}
def clean_review_json(json_dict):
    """Project a review record onto the fields used downstream.

    Keeps ``overall``, ``summary``, ``reviewText`` and ``asin`` (in that order);
    a field absent from the input is filled with an empty string. Newline
    characters are stripped from ``reviewText`` only. They are removed outright,
    not replaced by a space, so text on either side of a line break is joined.
    """
    cleaned = {}
    for field in ("overall", "summary", "reviewText", "asin"):
        cleaned[field] = json_dict.get(field, "")
    cleaned["reviewText"] = cleaned["reviewText"].replace("\n", "")
    return cleaned
def generate_sentiment_category(rating):
    """Translate a star rating into a sentiment label.

    Ratings of 5 and 4 give ``"POS"``, 3 gives ``"NEU"``, 2 and 1 give ``"NEG"``.
    Any other value returns ``None``.
    """
    ratings_by_label = (
        ("POS", (5.0, 4.0)),
        ("NEU", (3.0,)),
        ("NEG", (2.0, 1.0)),
    )
    label_for = {}
    for label, ratings in ratings_by_label:
        for value in ratings:
            label_for[value] = label
    return label_for.get(rating)
def balance_classes(df, target_col = "sentiment", random_state = None):
    """Down-sample every class to the size of the rarest one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to balance. Rows are dropped by index label, so the index is
        assumed to be unique.
    target_col : str, default "sentiment"
        Column holding the class labels.
    random_state : int or None, default None
        Seed for choosing which rows to drop. ``None`` uses the module-level
        ``sample`` (so the result varies between runs unless the global
        generator was seeded); any other value makes the selection repeatable.

    Returns
    -------
    pandas.DataFrame
        ``df`` with randomly chosen surplus rows removed so that every class
        counted by ``value_counts`` has the same number of rows. Rows whose
        label is missing are not counted and are therefore left in place.
        When nothing needs dropping, ``df`` itself is returned.
    """
    # Count on the requested column; hard-coding 'sentiment' here made the
    # target_col argument ineffective for any other column name.
    class_distribution = df[target_col].value_counts().to_dict()
    if not class_distribution:
        # Nothing to balance (empty frame or no non-missing labels).
        return df
    least_count = min(class_distribution.values())

    if random_state is None:
        pick = sample
    else:
        from random import Random
        pick = Random(random_state).sample

    indices_to_drop = []
    for class_label, count in class_distribution.items():
        if count > least_count:
            class_indices = df.index[df[target_col] == class_label].tolist()
            indices_to_drop.extend(pick(class_indices, count - least_count))

    if not indices_to_drop:
        return df
    return df.drop(indices_to_drop)

### Read review and metadata

In [4]:
# Parse each archive straight into a DataFrame and discard exact duplicate rows.
metadata = pd.DataFrame(read_review_data(metapath, clean_meta_data_json)).drop_duplicates()
reviewdata = pd.DataFrame(read_review_data(reviewpath, clean_review_json)).drop_duplicates()

In [17]:
# Spot-check one metadata record (the row at position 49) as a plain dict.
metadata.iloc[49].to_dict()

{'brand': 'McGraw-Hill Education', 'asin': '0077734343'}

### Generate sentiment classes

In [5]:
# Attach the brand to each review, label it, balance the labels, then number the surviving rows.
merged_data = (
    reviewdata
    .merge(metadata, how="left", on="asin")
    # Derive the label from the star rating before the rating column is dropped.
    .assign(sentiment=lambda frame: frame["overall"].map(generate_sentiment_category))
    .pipe(balance_classes)
    .drop(columns="overall")
    # Ids run 1..N over the rows that remain after balancing.
    .assign(reviewId=lambda frame: range(1, len(frame) + 1))
)

### Save the dataset

In [10]:
# Write the final dataset to CSV. index=False leaves the DataFrame's row labels
# out of the file, so only the data columns are written.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\dataset.csv"
merged_data.to_csv(path, index = False)

In [11]:
# Display the final balanced dataset for a visual check.
merged_data

Unnamed: 0,summary,reviewText,asin,brand,sentiment,reviewId
2,This is excellent software for those who want ...,If you've been wanting to learn how to create ...,0321719816,Peach Pit Press,POS,1
10,Competent introduction to Dreamweaver and web ...,I waited to complete the entire course before ...,0321719816,Peach Pit Press,NEU,2
11,Learn Adobe Photoshop Lightroom 3 by Video (Le...,As someone who has just upgraded from Lightroo...,0321700945,Peach Pit Press,POS,3
14,For Highly Motivated and Patient People,There are over 100 video lessons here. Most us...,0321700945,Peach Pit Press,NEU,4
20,Good Intro to Flash CS5,This was the first Learn by Video series cours...,0321719824,Peach Pit Press,POS,5
...,...,...,...,...,...,...
11956,Four Stars,great buy,B01E6C5D1S,IOLO Technologies,POS,4436
11957,They put out a good product...all they need is...,"The product itself is quite good, in depth fix...",B01E6C5D1S,IOLO Technologies,POS,4437
11961,Works great for my purposes!,"I am a total amateur when it comes to editing,...",B01FFVDY9M,Corel,POS,4438
11963,"Might not be for the ""novice""",This software has SO much going on. Theres a ...,B01HAP3NUG,Pinnacle Systems,NEU,4439
