Happy Transformer: https://github.com/EricFillion/happy-transformer

In [2]:
pip install happytransformer

In [7]:
from google.colab import drive
from happytransformer import HappyTextClassification
import pandas as pd
import gzip
import json

def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The value returned by json.loads for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the gzip handle once the generator is exhausted
  # or garbage-collected; the handle is no longer leaked.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Tolerate blank / whitespace-only lines (e.g. a trailing newline at EOF).
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path, product_asin):
  """Build a DataFrame from the records in `path` whose 'asin' matches.

  Args:
    path: File handed to parse(), which yields one record (dict) at a time.
    product_asin: Value a record's 'asin' entry must equal to be kept.

  Returns:
    A DataFrame with one row per matching record, indexed 0, 1, 2, ... in the
    order the records were read.
  """
  # Lazily filter the stream so non-matching records are never stored.
  matching = (rec for rec in parse(path) if rec.get('asin') == product_asin)
  # Number the kept records consecutively; those numbers become the row index.
  rows_by_position = dict(enumerate(matching))
  return pd.DataFrame.from_dict(rows_by_position, orient='index')

# Mount Google Drive so the gzipped JSON data file stored there can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
# Derive the data path from the mount point instead of using a relative path,
# so loading does not depend on the kernel's current working directory.
json_data_path = DRIVE_MOUNT_POINT + '/MyDrive/Electronics.json.gz'
# Only records whose 'asin' equals this value are loaded.
product_id = 'B003L1ZYYW'
df = getDF(json_data_path, product_id)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Drop rows missing either text field, coerce both text columns to str, and
# order the rows by ascending unixReviewTime -- expressed as a single chain.
df = (
    df
    .dropna(subset=['reviewText', 'summary'])
    .astype({'reviewText': str, 'summary': str})
    .sort_values(by='unixReviewTime', ascending=True)
)

In [25]:
# Max token length (510 to account for additional special tokens) - limited by happy transformer
MAX_TOKENS = 510

# Two-label DistilBERT text classifier loaded through Happy Transformer.
happy_tc = HappyTextClassification(
    model_type="DISTILBERT",
    model_name="distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=2,
)
# Keep a handle on the classifier's own tokenizer for use outside the classifier.
tokenizer = happy_tc.tokenizer

In [None]:
def truncate_to_max_tokens(text, tokenizer, max_tokens):
    """Return `text` limited to at most `max_tokens` tokens of `tokenizer`.

    Args:
        text: String to measure and, if necessary, shorten.
        tokenizer: Object providing tokenize(), convert_tokens_to_ids() and decode().
        max_tokens: Largest number of tokens to keep.

    Returns:
        `text` unchanged when it tokenizes to `max_tokens` tokens or fewer;
        otherwise the decoded string of its first `max_tokens` tokens.
    """
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= max_tokens:
        return text
    kept_ids = tokenizer.convert_tokens_to_ids(tokens[:max_tokens])
    return tokenizer.decode(kept_ids)


results = []
# Iterate only over the three columns that are used. Unlike iterrows(), this
# builds no per-row Series and leaves each column's values uncoerced.
for review_time, summary, review_text in zip(
    df["unixReviewTime"], df["summary"], df["reviewText"]
):
    # Combining summary and reviewText with a newline character in between
    text = summary + "\n" + review_text
    # Truncate the text if it exceeds the maximum token length
    text = truncate_to_max_tokens(text, tokenizer, MAX_TOKENS)
    sentiment = happy_tc.classify_text(text).label
    results.append({
        "unixReviewTime": review_time,
        # 1 when the predicted label is "POSITIVE", 0 for any other label.
        "sentiment_pred": int(sentiment == "POSITIVE"),
    })

# Explicit columns keep the expected schema even when no rows were classified.
results_df = pd.DataFrame(results, columns=["unixReviewTime", "sentiment_pred"])

In [34]:
# Write the predictions to Google Drive (mounted at /content/drive). The absolute
# path avoids depending on the kernel's current working directory.
results_df.to_csv("/content/drive/MyDrive/review_sentiment_preds.csv", index=False)