Collecting Data 

In [1]:
import requests

def get_data(url: str, timeout: float = 30.0) -> list:
    """Download a newline-delimited text resource and return its lines.

    Args:
        url: Address of the resource to fetch with an HTTP GET.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response body split on newline characters, one list element per
        line. Lines are returned as raw strings (nothing is parsed here); a
        body that ends with a newline yields a final empty string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing back the lines of an
    # error page as if they were data.
    response.raise_for_status()
    return response.text.split("\n")


Data Preparation 

In [2]:
# process each element of the list and assign a label of helpful or not based on the score
import json
from typing import Union
import pandas as pd 
import json

def get_helpful_label(
    helpful_score: Union[int, float], threshold: Union[int, float] = 1
) -> int:
    """Convert a numeric helpfulness score into a binary class label.

    Args:
        helpful_score: Helpfulness score of a single sentence.
        threshold: Smallest score that is labelled as helpful. Defaults to 1,
            the cut-off this notebook has always used.

    Returns:
        0 when ``helpful_score`` is below ``threshold``, otherwise 1.
    """
    # Kept as a "<" test (rather than ">=") so a score for which the
    # comparison is false, such as NaN, still falls into the 1 branch.
    return 0 if helpful_score < threshold else 1


def data_to_df(data: list) -> pd.DataFrame:
    """Build a labelled DataFrame from JSON-lines records.

    Each element of ``data`` is expected to be a JSON document with a
    ``"sentence"`` field and a ``"helpful"`` score. Elements that cannot be
    parsed as JSON (for example the empty strings left over from splitting a
    file on newlines) are skipped.

    Args:
        data: Raw JSON strings, one record per element.

    Returns:
        A DataFrame with a ``text`` column holding each record's sentence and
        a ``label`` column holding ``get_helpful_label`` applied to the
        record's ``"helpful"`` score.

    Raises:
        KeyError: If a parsed record lacks ``"sentence"`` or ``"helpful"``.
    """
    texts = []
    labels = []

    for item in data:
        try:
            record = json.loads(item)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError (blank or malformed
            # text); TypeError covers items that are not str/bytes. Catching
            # only these keeps KeyboardInterrupt and genuine bugs visible
            # instead of silently dropping the record.
            continue

        # Resolve both fields before appending so the two columns can never
        # end up with different lengths.
        sentence = record["sentence"]
        label = get_helpful_label(record["helpful"])
        texts.append(sentence)
        labels.append(label)

    # creating dataframe
    return pd.DataFrame({"text": texts, "label": labels})


In [7]:
import logging


# Source files for the two splits. Kept under their own names so that
# train_data / test_data only ever refer to the downloaded lines.
TRAIN_URL = "https://helpful-sentences-from-reviews.s3.amazonaws.com/train.json"
TEST_URL = "https://helpful-sentences-from-reviews.s3.amazonaws.com/test.json"

# Download each split as a list of raw JSON lines.
train_data = get_data(TRAIN_URL)
test_data = get_data(TEST_URL)

# Parse the lines into text/label DataFrames.
train_df = data_to_df(train_data)
test_df = data_to_df(test_data)

# Persist the prepared splits; index=False keeps the row index out of the CSV.
train_df.to_csv("helpful_sentences_train.csv", index=False)
test_df.to_csv("helpful_sentences_test.csv", index=False)


In [8]:
# Read the prepared splits back from disk (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("helpful_sentences_train.csv", "helpful_sentences_test.csv")
)

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,text,label
0,this flash is a superb value.,1
1,The pictures were not sharp at all.,1
2,A very good resource for parents.,1
3,"We have it in a child's room, and will be swit...",0
4,Again the makers are too lazy to bring in the ...,0


Training Model
Using the Hugging Face DistilBERT base model (`distilbert-base-uncased`) to classify the text.

In [17]:
# Set up the tokenizer (with truncation) and a padding data collator, load the base model, and tokenize both splits

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets.arrow_dataset import Dataset
from datasets.load import load_dataset

# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same pretrained model.
MODEL_CHECKPOINT = "distilbert-base-uncased"

# Create tokenizer, plus a collator that pads batches using that tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load Base Model with a two-class classification head (labels 0 and 1)
# NOTE(review): the OSError recorded in this cell's output points at the
# locally cached pytorch_model.bin; a truncated or corrupt cache entry is a
# likely cause. Try clearing that cache entry, or a one-off
# force_download=True here, and confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=2
)

def preprocess_function(examples):
    """Tokenize the ``"text"`` entries of a batch with truncation enabled.

    ``examples`` is indexed with ``"text"`` and the result is passed to the
    module-level ``tokenizer``; whatever the tokenizer returns is handed back
    unchanged. No padding argument is passed here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Create Dataset: wrap each DataFrame in a Dataset, then tokenize it in batches

# Test split
test_dataset = Dataset.from_pandas(test_data)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Train split
train_dataset = Dataset.from_pandas(train_data)
tokenized_train = train_dataset.map(preprocess_function, batched=True)

OSError: Unable to load weights from pytorch checkpoint file for '/Users/domy/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/pytorch_model.bin' at '/Users/domy/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/pytorch_model.bin'. If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.

In [None]:
from datasets.load import load_metric
import numpy as np


# Evaluation metric used while training the model.

# Accuracy metric object; compute_metrics calls its compute() method.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed
# version before upgrading.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each row is the index of the largest value along the last axis of the
    logits. Returns whatever ``metric.compute`` returns for those predictions
    against the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)