In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset archives referenced by the cells below can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import gzip
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import networkx as nx
from statsmodels.tsa.seasonal import seasonal_decompose
import collections
import itertools
import time
import math

In [None]:
# Parse the gzipped JSON-lines Video Games metadata: each line of the archive
# holds one JSON object, which becomes one dict in the resulting list.
with gzip.open('/content/drive/My Drive/meta_Video_Games.json.gz', 'r') as gz_stream:
    meta_videogames = [json.loads(raw_line.strip()) for raw_line in gz_stream]

In [None]:
# Show the first metadata record to see which fields a product entry carries.
meta_videogames[0]

{'category': ['Video Games', 'PC', 'Games'],
 'tech1': '',
 'description': [],
 'fit': '',
 'title': 'Reversi Sensory Challenger',
 'also_buy': [],
 'tech2': '',
 'brand': 'Fidelity Electronics',
 'feature': [],
 'rank': ['>#2,623,937 in Toys &amp; Games (See Top 100 in Toys &amp; Games)',
  '>#39,015 in Video Games &gt; PC Games'],
 'also_view': [],
 'main_cat': 'Toys &amp; Games',
 'similar_item': '',
 'date': '',
 'price': '',
 'asin': '0042000742',
 'imageURL': ['https://images-na.ssl-images-amazon.com/images/I/31nTxlNh1OL._SS40_.jpg'],
 'imageURLHighRes': ['https://images-na.ssl-images-amazon.com/images/I/31nTxlNh1OL.jpg']}

In [None]:
# Parse the gzipped JSON-lines Video Games reviews: each line of the archive
# holds one JSON object, which becomes one dict in the resulting list.
with gzip.open('/content/drive/My Drive/Video_Games.json.gz', 'r') as gz_stream:
    videogames_data = [json.loads(raw_line.strip()) for raw_line in gz_stream]

In [None]:
# Show the first review record to see which fields a review entry carries.
videogames_data[0]

{'overall': 1.0,
 'verified': True,
 'reviewTime': '06 9, 2014',
 'reviewerID': 'A21ROB4YDOZA5P',
 'asin': '0439381673',
 'reviewerName': 'Mary M. Clark',
 'reviewText': 'I used to play this game years ago and loved it. I found this did not work on my computer even though it said it would work with Windows 7.',
 'summary': 'Did not like this',
 'unixReviewTime': 1402272000}

In [None]:
# Tabular view of the Video Games reviews only (one row per review dict).
# NOTE(review): `data_df` is reassigned from the combined review list in the
# Data Engineering cell further down, so this frame is superseded there.
data_df = pd.DataFrame(data=videogames_data)

In [None]:
# Parse the gzipped JSON-lines Musical Instruments metadata: each line of the
# archive holds one JSON object, which becomes one dict in the resulting list.
with gzip.open('/content/drive/My Drive/meta_Musical_Instruments.json.gz', 'r') as gz_stream:
    meta_musical_instruments = [json.loads(raw_line.strip()) for raw_line in gz_stream]

In [None]:
# Parse the gzipped JSON-lines Musical Instruments reviews: each line of the
# archive holds one JSON object, which becomes one dict in the resulting list.
with gzip.open('/content/drive/My Drive/Musical_Instruments.json.gz', 'r') as gz_stream:
    musical_instruments_data = [json.loads(raw_line.strip()) for raw_line in gz_stream]

In [None]:
# Parse the gzipped JSON-lines Appliances metadata: each line of the archive
# holds one JSON object, which becomes one dict in the resulting list.
with gzip.open('/content/drive/My Drive/meta_Appliances.json.gz', 'r') as gz_stream:
    meta_appliances = [json.loads(raw_line.strip()) for raw_line in gz_stream]

In [None]:
# Parse the gzipped JSON-lines Appliances reviews: each line of the archive
# holds one JSON object, which becomes one dict in the resulting list.
with gzip.open('/content/drive/My Drive/Appliances.json.gz', 'r') as gz_stream:
    appliances_data = [json.loads(raw_line.strip()) for raw_line in gz_stream]

## **Data Engineering**

In [None]:
# Pool the per-category records (lists of dicts parsed from the JSON-lines
# archives) into a single review list and a single metadata list.
data_list = [*videogames_data, *musical_instruments_data, *appliances_data]
meta_data_list = [*meta_videogames, *meta_musical_instruments, *meta_appliances]

# Build the dataframes and keep only the columns used downstream.
data_df = pd.DataFrame(data_list).loc[:, ['asin', 'overall', 'verified', 'reviewTime', 'reviewerID']]
meta_data_df = pd.DataFrame(meta_data_list).loc[:, ['asin', 'main_cat']]

# Full outer join on the product id: rows without a partner on the other side
# are kept, with NaN in the columns that side would have supplied.
# NOTE(review): if `meta_data_df` holds repeated 'asin' values, every matching
# review row is repeated once per metadata row - confirm whether the metadata
# needs de-duplicating before this join.
merged_df = data_df.merge(meta_data_df, how='outer', on='asin')

# Put the product-level columns first, followed by the review-level ones.
merged_df = merged_df.loc[:, ['asin', 'main_cat', 'overall', 'verified', 'reviewTime', 'reviewerID']]

In [None]:
# Discard every row that has a missing value in any column (this removes the
# unmatched rows that the outer join filled with NaN).
merged_df = merged_df.dropna(axis=0, how='any')

Reduce the size of the dataframe: we are only interested in the 'Video Games', 'Musical Instruments' and 'Appliances' categories ('Gift Cards' was also considered but is currently excluded in the code below).

In [None]:
# Categories of interest. 'Gift Cards' is deliberately left out of the list.
categories_of_interest = ['Video Games', 'Musical Instruments', 'Appliances']

# Keep only rows whose 'main_cat' is one of the categories of interest.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below does not raise pandas' SettingWithCopyWarning (assigning
# into the result of a boolean filter is otherwise flagged as a write to a
# possible copy of a slice).
merged_df = merged_df[merged_df['main_cat'].isin(categories_of_interest)].copy()

# Parse 'reviewTime' strings such as '06 9, 2014' (month day, year) into
# datetime values.
merged_df['reviewTime'] = pd.to_datetime(merged_df['reviewTime'], format="%m %d, %Y")

# To reduce the size of the dataset further, `time_dataset` keeps only the
# reviews dated from 2016-09-01 through 2018-09-30 (both bounds inclusive).
start_date = pd.to_datetime('2016-09-01')
end_date = pd.to_datetime('2018-09-30')
in_time_window = (merged_df['reviewTime'] >= start_date) & (merged_df['reviewTime'] <= end_date)

# Build the filtered frame with a fresh 0..n-1 index in a single expression
# instead of mutating a filtered slice in place.
time_dataset = merged_df[in_time_window].reset_index(drop=True)

In [None]:
# Shrink `merged_df` further: keep only the reviews dated within calendar
# year 2018 (2018-01-01 through 2018-12-31, both bounds inclusive).
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2018-12-31')

# Apply the date window and renumber the surviving rows from 0.
merged_df = merged_df[merged_df['reviewTime'].between(start_date, end_date)].reset_index(drop=True)

In [None]:
# Persist the filtered `merged_df` to Drive as CSV, without the index column.
merged_df.to_csv(path_or_buf='/content/drive/MyDrive/merged_df.csv', index=False)

In [None]:
# Persist `time_dataset` to Drive as CSV, without the index column.
time_dataset.to_csv(path_or_buf='/content/drive/MyDrive/time_dataset.csv', index=False)