Collecting Data 

In [1]:
import requests

def get_data(url : str)-> list:
    """ get the json data from provided url and return an array of the data"""

    data = requests.get(url)
    raw_data = data.text
    data_array = raw_data.split("\n")
    return data_array


Data Preparation 

In [2]:
# process each element of the list and assign a label of helpful or not based on the score
import json
from typing import Union
import pandas as pd 
import json

def get_helpful_label(helpful_score: Union[int, float]) -> Union[int,float]:
    """ return label of how helpful """

    if helpful_score < 1:
        return 0
    else:
        return 1


def data_to_df(data: list) -> pd.DataFrame:
    """ convert the data array into a pandas dataframe """

    df_dict = {}
    df_dict["text"] = []
    df_dict["label"] = []

    for item in data:
        try:
            item_json = json.loads(item)
        except:
            continue

        df_dict["text"].append(item_json["sentence"])

        # get helpful label from score 
        helpful_score = item_json["helpful"]
        helpful_label = get_helpful_label(helpful_score)
        df_dict["label"].append(helpful_label)

    # creating dataframe 
    df = pd.DataFrame.from_dict(df_dict)
    return df


In [7]:
import logging


train_data = "https://helpful-sentences-from-reviews.s3.amazonaws.com/train.json"
test_data = "https://helpful-sentences-from-reviews.s3.amazonaws.com/test.json"

train_data = get_data(train_data)
test_data = get_data(test_data)

train_df = data_to_df(train_data)
test_df = data_to_df(test_data)

train_df.to_csv("helpful_sentences_train.csv", index=False)
test_df.to_csv("helpful_sentences_test.csv", index=False)


In [18]:
# Load Data
train_data = pd.read_csv("helpful_sentences_train.csv")
test_data = pd.read_csv("helpful_sentences_test.csv")

train_data.head()

Unnamed: 0,text,label
0,this flash is a superb value.,1
1,The pictures were not sharp at all.,1
2,A very good resource for parents.,1
3,"We have it in a child's room, and will be swit...",0
4,Again the makers are too lazy to bring in the ...,0


Training Model
Using HuggingFace DistilBert base model to classify the text.

In [32]:
# tokenizer to process the text and include a padding and truncation strategy

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets import Dataset
from datasets import load_dataset

# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load Base Model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

# Create Dataset
test_dataset = Dataset.from_pandas(test_data)
train_dataset = Dataset.from_pandas(train_data)
tokenized_test = test_dataset.map(preprocess_function, batched=True)
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [35]:
from datasets import load_metric
import numpy as np


# training the model 

# Metrics 
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training args
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch"
)

# Set up Training job
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6250
  Number of trainable parameters = 66955010
  0%|          | 0/6250 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  8%|▊         | 500/6250 [22:01<4:14:30,  2.66s/it]Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'loss': 0.4516, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.4}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
 16%|█▌        | 1000/6250 [45:49<2:44:36,  1.88s/it]Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'loss': 0.4154, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.8}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
 20%|██        | 1250/6250 [59:25<3:16:52,  2.36s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
                                                     
 20%|██        | 1250/6250 [1:00:34<3:16:52,  2.36s/it]

{'eval_loss': 0.34471622109413147, 'eval_accuracy': 0.848, 'eval_runtime': 69.0928, 'eval_samples_per_second': 28.947, 'eval_steps_per_second': 1.809, 'epoch': 1.0}


 24%|██▍       | 1500/6250 [1:09:34<2:40:15,  2.02s/it] Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json


{'loss': 0.3488, 'learning_rate': 1.5200000000000002e-05, 'epoch': 1.2}


Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_tokens_map.json
 32%|███▏      | 2000/6250 [1:28:12<3:01:26,  2.56s/it]Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json


{'loss': 0.3103, 'learning_rate': 1.3600000000000002e-05, 'epoch': 1.6}


Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json
 40%|████      | 2500/6250 [1:47:59<2:56:20,  2.82s/it]Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json


{'loss': 0.3146, 'learning_rate': 1.2e-05, 'epoch': 2.0}


Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
                                                       
 40%|████      | 2500/6250 [1:49:22<2:56:20,  2.82s/it]

{'eval_loss': 0.37368935346603394, 'eval_accuracy': 0.842, 'eval_runtime': 78.6375, 'eval_samples_per_second': 25.433, 'eval_steps_per_second': 1.59, 'epoch': 2.0}


 48%|████▊     | 3000/6250 [6:47:18<11:31:14, 12.76s/it]    Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json


{'loss': 0.2165, 'learning_rate': 1.04e-05, 'epoch': 2.4}


Model weights saved in ./results/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3000/special_tokens_map.json
 56%|█████▌    | 3500/6250 [10:30:57<1:48:29,  2.37s/it]    Saving model checkpoint to ./results/checkpoint-3500
Configuration saved in ./results/checkpoint-3500/config.json


{'loss': 0.2056, 'learning_rate': 8.8e-06, 'epoch': 2.8}


Model weights saved in ./results/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3500/special_tokens_map.json
 60%|██████    | 3750/6250 [10:42:06<1:43:07,  2.47s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
                                                        
 60%|██████    | 3750/6250 [10:43:16<1:43:07,  2.47s/it]

{'eval_loss': 0.5002440214157104, 'eval_accuracy': 0.832, 'eval_runtime': 70.2614, 'eval_samples_per_second': 28.465, 'eval_steps_per_second': 1.779, 'epoch': 3.0}


 64%|██████▍   | 4000/6250 [10:54:56<1:41:14,  2.70s/it] Saving model checkpoint to ./results/checkpoint-4000
Configuration saved in ./results/checkpoint-4000/config.json


{'loss': 0.162, 'learning_rate': 7.2000000000000005e-06, 'epoch': 3.2}


Model weights saved in ./results/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4000/special_tokens_map.json
 72%|███████▏  | 4500/6250 [11:33:28<1:18:52,  2.70s/it]Saving model checkpoint to ./results/checkpoint-4500
Configuration saved in ./results/checkpoint-4500/config.json


{'loss': 0.1402, 'learning_rate': 5.600000000000001e-06, 'epoch': 3.6}


Model weights saved in ./results/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4500/special_tokens_map.json
 80%|████████  | 5000/6250 [11:58:46<1:16:53,  3.69s/it]Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json


{'loss': 0.1371, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


Model weights saved in ./results/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
                                                        
 80%|████████  | 5000/6250 [12:00:26<1:16:53,  3.69s/it]

{'eval_loss': 0.6700233817100525, 'eval_accuracy': 0.8355, 'eval_runtime': 96.1836, 'eval_samples_per_second': 20.794, 'eval_steps_per_second': 1.3, 'epoch': 4.0}


 88%|████████▊ | 5500/6250 [12:23:45<43:38,  3.49s/it]   Saving model checkpoint to ./results/checkpoint-5500
Configuration saved in ./results/checkpoint-5500/config.json


{'loss': 0.0887, 'learning_rate': 2.4000000000000003e-06, 'epoch': 4.4}


Model weights saved in ./results/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5500/special_tokens_map.json
 96%|█████████▌| 6000/6250 [12:50:49<11:48,  2.83s/it]  Saving model checkpoint to ./results/checkpoint-6000
Configuration saved in ./results/checkpoint-6000/config.json


{'loss': 0.0912, 'learning_rate': 8.000000000000001e-07, 'epoch': 4.8}


Model weights saved in ./results/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-6000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-6000/special_tokens_map.json
100%|██████████| 6250/6250 [13:02:56<00:00,  2.88s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
                                                      
100%|██████████| 6250/6250 [13:05:24<00:00,  2.88s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 6250/6250 [13:05:25<00:00,  7.54s/it]

{'eval_loss': 0.80378657579422, 'eval_accuracy': 0.8295, 'eval_runtime': 148.5959, 'eval_samples_per_second': 13.459, 'eval_steps_per_second': 0.841, 'epoch': 5.0}
{'train_runtime': 47124.9625, 'train_samples_per_second': 2.122, 'train_steps_per_second': 0.133, 'train_loss': 0.23455033325195312, 'epoch': 5.0}





TrainOutput(global_step=6250, training_loss=0.23455033325195312, metrics={'train_runtime': 47124.9625, 'train_samples_per_second': 2.122, 'train_steps_per_second': 0.133, 'train_loss': 0.23455033325195312, 'epoch': 5.0})

Testing model

In [39]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TextClassificationPipeline,
)

tokenizer = AutoTokenizer.from_pretrained("results/checkpoint-6000/")
model = AutoModelForSequenceClassification.from_pretrained(
    "results/checkpoint-6000/"
)

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
result = pipe("I absolutely loved this product.")
result2test = pipe("same as advertised.")
print(result)
print(result2test)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file results/checkpoint-6000/config.json
Model config DistilBertConfig {
  "_name_or_path": "results/checkpoint-6000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file results/checkpoint-6000/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSeq

[{'label': 'LABEL_1', 'score': 0.9988353848457336}]
[{'label': 'LABEL_1', 'score': 0.9727368354797363}]


Evaluating the model using Huggingface evaluate model. 


In [40]:
from transformers import (
    pipeline,
)
from evaluate import evaluator
import evaluate


pipe = pipeline(
    "text-classification", model="results/checkpoint-6000/"
)

test_data = pd.read_csv("helpful_sentences_test.csv")
test_dataset = Dataset.from_pandas(test_data)

# evaluator
accuracy = evaluate.load("accuracy")

# evaluating accuracy
eval = evaluator("text-classification")
result = eval.compute(
    model_or_pipeline=pipe,
    data=test_dataset,
    metric=accuracy,
    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
 # type: ignore    strategy="bootstrap",
    n_resamples=200,
)

print(result)

loading configuration file results/checkpoint-6000/config.json
Model config DistilBertConfig {
  "_name_or_path": "results/checkpoint-6000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading configuration file results/checkpoint-6000/config.json
Model config DistilBertConfig {
  "_name_or_path": "results/checkpoint-6000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0

{'accuracy': 0.8295, 'total_time_in_seconds': 108.41167458400014, 'samples_per_second': 18.44819764729627, 'latency_in_seconds': 0.05420583729200007}


Evaluate F1 score 

In [41]:
f1_metric = evaluate.load("f1")

result = eval.compute(
    model_or_pipeline=pipe,
    data=test_dataset,
    metric=f1_metric,
    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
    strategy="bootstrap",
    n_resamples=200,
)

print(result)

Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 200kB/s]


{'f1': {'confidence_interval': (0.8631611716917896, 0.8895750583302473), 'standard_error': 0.006637136238868675, 'score': 0.8764940239043826}, 'total_time_in_seconds': 111.57373449599254, 'samples_per_second': 17.9253657595537, 'latency_in_seconds': 0.055786867247996266}
