## Mount drive and define constants

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# project data can be read and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive/')

# Project data locations on the mounted Drive. Every sub-directory is derived
# from data_dir so that relocating the project only needs a one-line change.
data_dir = "/content/drive/MyDrive/ALDA_Project/data"
books_dir = f"{data_dir}/books"                                # batched CSVs of the Books reviews
kindle_dir = f"{data_dir}/kindle"                              # batched CSVs of the Kindle reviews
kindle_preprocessed_dir = f"{data_dir}/kindle_preprocessed"    # Kindle batches after text preprocessing
books_preprocessed_dir = f"{data_dir}/books_preprocessed"      # Books batches after text preprocessing

Mounted at /content/drive/


In [None]:
import pandas as pd
import gzip
import os
import json
from tqdm import tqdm

## Convert JSON to CSV and divide data into batches

In [None]:
# Review fields that convertToCSV keeps, in output column order; every other
# field of a review record is dropped.
necessary_columns = [
  'overall',
  'reviewerID',
  'asin',
  'reviewText',
  'summary',
]

def parse(path):
  """Yield one decoded JSON value per non-blank line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The result of ``json.loads`` for each line, in file order.
  """
  # The context manager closes the file once the generator is exhausted or
  # closed; previously the handle was opened and never released.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip whitespace-only lines (e.g. a trailing newline at end of file)
      # instead of failing inside json.loads.
      if not l.strip():
        continue
      yield json.loads(l)

def convertToCSV(path, out_dir ,filename, batch_size = 1_000_000):
  """Split a gzipped JSON-lines review dump into CSV files of ``batch_size`` rows.

  Records are streamed from ``path/filename`` through ``parse``, reduced to
  ``necessary_columns`` and written into ``path/out_dir`` as ``<stem>_<n>.csv``,
  where ``<stem>`` is ``filename`` up to its first dot and ``<n>`` is the
  batch number counted from 0. The CSV index column holds each record's
  zero-based position in the input stream.

  Missing values are replaced before writing: ``overall`` with 0 and
  ``reviewText`` / ``summary`` with the string "NA".
  NOTE: ``pandas.read_csv`` parses the literal "NA" as a missing value by
  default, so readers need ``keep_default_na=False`` to see the placeholder.

  Args:
    path: Directory containing ``filename``; ``out_dir`` is created inside it.
    out_dir: Name of the output directory, relative to ``path``.
    filename: Name of the gzip-compressed JSON-lines file to convert.
    batch_size: Maximum number of records per CSV file. Must be >= 1.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  tmp_path = os.path.join(path, filename)
  out_root = os.path.join(path, out_dir)
  os.makedirs(out_root, exist_ok=True)
  stem = filename.split('.')[0]

  def _saveBatch(records, first_index, batch_number):
    """Write one batch of records to its numbered CSV file."""
    tmp_df = pd.DataFrame(records, index=range(first_index, first_index + len(records)))
    # reindex (rather than [...] selection) keeps the fixed column layout even
    # when no record in this batch carries one of the fields.
    tmp_df = tmp_df.reindex(columns=necessary_columns)
    # A single non-chained fillna; the chained inplace form is deprecated and
    # has no effect once pandas copy-on-write is enabled.
    tmp_df = tmp_df.fillna({'overall': 0, 'reviewText': "NA", 'summary': "NA"})
    output_filename = f"{stem}_{batch_number}.csv"
    out_path = os.path.join(out_root, output_filename)
    print("Saving ", output_filename)
    tmp_df.to_csv(out_path)

  pending = []        # records collected for the batch currently being built
  first_index = 0     # stream position of pending[0]
  batch_number = 0
  for i, d in enumerate(tqdm(parse(tmp_path))):
    pending.append(d)
    if len(pending) == batch_size:
      _saveBatch(pending, first_index, batch_number)
      print("Last Saved", i)
      # Start a fresh buffer right away so a stream that ends exactly on a
      # batch boundary does not get its last batch written a second time.
      pending = []
      first_index = i + 1
      batch_number += 1

  # Flush the trailing partial batch; nothing is written for empty input.
  if pending:
    _saveBatch(pending, first_index, batch_number)

In [None]:
# One-off conversion: split the gzipped Kindle review dump in data_dir into
# batched CSV files under data_dir/kindle.
# No need to run again. Already converted to CSV
convertToCSV(data_dir,'kindle','Kindle_Store_5.json.gz')

In [None]:
# One-off conversion: split the gzipped Books review dump in data_dir into
# batched CSV files under data_dir/books.
# No need to run again. Already converted to CSV
convertToCSV(data_dir,'books','Books_5.json.gz')

## Preview Kindle and Books data

In [None]:
# Load one batch from each dataset for a quick look at the schema.
# os.listdir returns entries in arbitrary order, so sort before taking the
# first name; otherwise the previewed batch can change between runs.
inspect_kindle_file = sorted(os.listdir(kindle_dir))[0]
inspect_books_file = sorted(os.listdir(books_dir))[0]

kindle_df = pd.read_csv(os.path.join(kindle_dir, inspect_kindle_file))
books_df = pd.read_csv(os.path.join(books_dir, inspect_books_file))

In [None]:
# Show the first rows of the Kindle batch held in kindle_df.
kindle_df.head()

Unnamed: 0.1,Unnamed: 0,overall,reviewerID,asin,reviewText,summary
0,0,4.0,A2LSKD2H9U8N0J,B000FA5KK0,"pretty good story, a little exaggerated, but I...",pretty good story
1,1,5.0,A2QP13XTJND1QS,B000FA5KK0,"If you've read other max brand westerns, you k...",A very good book
2,2,5.0,A8WQ7MAG3HFOZ,B000FA5KK0,"Love Max, always a fun twist",Five Stars
3,3,5.0,A1E0MODSRYP7O,B000FA5KK0,"As usual for him, a good book",a good
4,4,5.0,AYUTCGVSM1H7T,B000FA5KK0,MB is one of the original western writers and ...,A Western


In [None]:
# Show the first rows of the Books batch held in books_df.
books_df.head()

Unnamed: 0.1,Unnamed: 0,overall,reviewerID,asin,reviewText,summary
0,0,5.0,A1REUF3A1YCPHM,1713353,"The King, the Mice and the Cheese by Nancy Gur...",A story children will love and learn from
1,1,5.0,AVP0HXC9FG790,1713353,The kids loved it!,Five Stars
2,2,5.0,A324TTUBKTN73A,1713353,My students (3 & 4 year olds) loved this book!...,Five Stars
3,3,5.0,A2RE7WG349NV5D,1713353,LOVE IT,Five Stars
4,4,5.0,A32B7QIUDQCD0E,1713353,Great!,Five Stars


## Get high level view of the data

In [None]:
# Count the distinct reviewers in the Kindle batches, then count how many of
# them also appear in the Books batches.
kindle = set()   # every reviewerID seen in any Kindle batch
both = set()     # reviewerIDs seen in both Kindle and Books batches

print("Processing Kindle data...")
for file in tqdm(os.listdir(kindle_dir)):
  # Only reviewerID is needed here, so skip parsing the other columns.
  kindle_df = pd.read_csv(os.path.join(kindle_dir, file), usecols=['reviewerID'])
  # update() grows the set in place instead of copying it on every batch.
  kindle.update(kindle_df['reviewerID'])

print("Total count of unique kindle data: ", len(kindle))
print("Identifying common users...")

for file in tqdm(os.listdir(books_dir)):
  books_df = pd.read_csv(os.path.join(books_dir, file), usecols=['reviewerID'])
  books = set(books_df['reviewerID'])
  both |= kindle & books

print("Total count of unique users present in both: ", len(both))


Processing Kindle data...


100%|██████████| 3/3 [00:20<00:00,  6.69s/it]


Total count of unique kindle data:  139824
Identifying common users...


100%|██████████| 28/28 [04:53<00:00, 10.49s/it]

Total count of unique users present in both:  128252





In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

# Download the NLTK stop-word corpus and the punkt tokenizer package.
nltk.download('stopwords')
nltk.download('punkt')
# Tokenizer built from the pattern \w+ (runs of word characters); it is the
# tokenizer preprocess_text uses.
tokenizer = RegexpTokenizer(r'\w+')

# Stop-word lookup set, built once on the first preprocess_text call.
_STOPWORD_SET = None

def preprocess_text(text):
  """Tokenize ``text`` and drop stop words.

  Args:
    text: The raw review text. Any non-string value (for example the NaN that
      pandas produces for a missing cell) is treated as empty text.

  Returns:
    A list of the tokens produced by the module-level ``tokenizer`` that are
    not in the stop-word set, in their original order. The membership test is
    an exact, case-sensitive string comparison.
  """
  global _STOPWORD_SET
  if not isinstance(text, str):
    return []
  if _STOPWORD_SET is None:
    # Load the stop words a single time: calling stopwords.words() inside the
    # filter re-read the corpus for every token and searched a list each time.
    # NOTE(review): words() is called without a language argument, which
    # presumably yields the stop words of every language in the corpus;
    # confirm whether only English was intended.
    _STOPWORD_SET = frozenset(stopwords.words())
  return [word for word in tokenizer.tokenize(text) if word not in _STOPWORD_SET]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Preprocess data

In [None]:
# Run preprocess_text over the text columns of every Kindle batch and write
# the result to kindle_preprocessed_dir under the same file name.
if not os.path.exists(kindle_preprocessed_dir):
  os.makedirs(kindle_preprocessed_dir)

for file in tqdm(os.listdir(kindle_dir)):
  in_path = os.path.join(kindle_dir, file)
  out_path = os.path.join(kindle_preprocessed_dir, file)
  kindle_df = pd.read_csv(in_path)
  # Each cell of these columns becomes the token list returned by preprocess_text.
  for text_column in ('reviewText', 'summary'):
    kindle_df[text_column] = kindle_df[text_column].apply(preprocess_text)
  kindle_df.to_csv(out_path)

In [None]:
# Run preprocess_text over the text columns of every Books batch and write
# the result to books_preprocessed_dir under the same file name.
if not os.path.exists(books_preprocessed_dir):
  os.makedirs(books_preprocessed_dir)

for file in tqdm(os.listdir(books_dir)):
  in_path = os.path.join(books_dir, file)
  out_path = os.path.join(books_preprocessed_dir, file)
  books_df = pd.read_csv(in_path)
  # Each cell of these columns becomes the token list returned by preprocess_text.
  for text_column in ('reviewText', 'summary'):
    books_df[text_column] = books_df[text_column].apply(preprocess_text)
  books_df.to_csv(out_path)