In [None]:
# installing transformer library
!pip install transformers datasets evaluate

In [1]:
# installing dataset
import pandas as pd
from sklearn.utils import shuffle
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
data = pd.read_csv("ecomm_data.csv")
# remaove na values
data = data.dropna()
# shuffle data
data = shuffle(data)
# rename column names using col index
data.rename(columns={data.columns[0]: "label", data.columns[1]: "product_description"}, inplace=True)
print(data.head(1)['product_description'])

# replace catecorical with numerical values
data['label'].replace([ "Electronics",  "Household", "Books", "Clothing & Accessories"],
                        [0, 1, 2, 3], inplace=True)

42997    TP-Link TL-UE300 USB 3.0 to RJ45 Gigabit Ether...
Name: product_description, dtype: object


In [2]:
# convert data into dataset
df = {'product_desc': data['product_description'], 'label': data['label']}
from datasets import Dataset
out_df = Dataset.from_dict(df)

print(out_df[:2])

#load a DistilBERT tokenizer, tokeniztation of the words
from transformers import AutoTokenizer
# using the distilbert tokenizer 
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", return_tensors='pt')

#tokenization function
# truncate sequences longer than DistilBERT’s maximum input length

def tokenize(data):
    return tokenizer(data["product_desc"], truncation=True)

#splitting dataset into train and test:
train = out_df[:1000]
test =  out_df[1000:1100]
train = Dataset.from_dict(train)
test = Dataset.from_dict(test)


print(train[:2])
print(test[:2])



{'product_desc': ['TP-Link TL-UE300 USB 3.0 to RJ45 Gigabit Ethernet Network Adapter Style name:UE300   UE300 adds gigabit Ethernet network connectivity to those devices without Ethernet LAN port, such as Ultrabook or MacBook Air, through a USB 3.0 port, also being compatible with USB 2.0 and USB1.1 standard.', 'The Power Of Your Subconscious Mind From the Publisher "I have seen miracles happen to men and women in all walks of life all over the world." -- Dr. Joseph Murphy. At last, a great new scientific discovery brings the incredible force of your subconscious mind under your control. Here are the simple, scientifically proven techniques and the astonishing facts about how your subconscious powers can perform miracles of healing. How lung cancer has been cured and optic nerves made whole again. How you can use the newly discovered Law of Attraction to increase your money-getting powers. How your subconscious mind can win you friends, peace of mind, and even help you to attract the i

In [None]:
print(test['label'])

In [3]:
#tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [5]:
# tokenizing train and test sets using map function, to avoid creating tokenizers.Encoding objects and keep dict or Dataset 
# objects with keys
train_tokens = train.map(tokenize, batched=True)
test_tokens= test.map(tokenize, batched=True)


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
# tokenizing train and test sets using map function, to avoid creating tokenizers.Encoding objects and keep dict or Dataset 
# objects with keys
train_tokens = train.map(tokenize, batched=True)
test_tokens= test.map(tokenize, batched=True)

In [9]:
# defining a function to calculate prediction performance of the model:  'accuracy' metric will be used

import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
def acc_calculate(pred):
    predicted, actual = pred
    predicted = np.argmax(predicted, axis=1)# argmax returns indices of the maximum values along axis 1; 
    # predicted contains the probabilities of all classes for each entry in the dataset,
    # the index 0 means that probabilty of class 0 is the highst and so on..
    return accuracy.compute(predictions=predicted, references=actual)

# create a map of the expected ids to their labels with id2label and label2id:

id2label = {0: "Electronics", 1: "Household", 2: "Books", 3:"Clothing & Accessories"}
label2id = {"Electronics": 0, "Household": 1, "Books":2, "Clothing & Accessories":3}

## stop here if you are running the code on cpu; this will take around 90 Min. Use the pretraind model below..

In [8]:
#model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id, from_tf=True
#)
#model.save_pretrained("distilbert_model_uncased")

All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


In [10]:
# defining the transformer
#from_pretrained(): to load the pretrained Bert model weights
#AutoModelForSequenceClassification: is a generic sequence classification model that will be instantiated using a pretarined Bert model.

transformer = AutoModelForSequenceClassification.from_pretrained(
"distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id, from_tf=True
)

All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


In [None]:
# defining the training parameters
training_pars = TrainingArguments(
    output_dir="my_transformer",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True)

# defining the trainer of the model
#The Trainer class provides an API for feature-complete training in PyTorch
trainer = Trainer(
    model=transformer,
    args=training_pars,
    train_dataset= train_tokens,
    eval_dataset=test_tokens,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=acc_calculate,
)
#training the transformer
trainer.train()

In [None]:
#save trainer to a file
#trainer.save_model("./Distilbert_based_transformer")

# evaluate the model 
result =trainer.evaluate()
result

In [11]:
## resume running the code from here using the saved model..
transformer_trained = AutoModelForSequenceClassification.from_pretrained("./Distilbert_based_transformer") 

In [12]:
from transformers import TextClassificationPipeline
# This pipeline has a return_all_scores parameter on its __call__ method that allows you to get all scores for each label on a prediction.
pipe = TextClassificationPipeline(model=transformer_trained, tokenizer=tokenizer)
prediction = pipe("NetGen 4k Wi-Fi 16 MP Ultra HD Action Camera", return_all_scores=True)
prediction



[[{'label': 'Electronics', 'score': 0.9508970975875854},
  {'label': 'Household', 'score': 0.017261911183595657},
  {'label': 'Books', 'score': 0.017655229195952415},
  {'label': 'Clothing & Accessories', 'score': 0.014185710810124874}]]

In [None]:
# using GPT-2 to generate text
from transformers import pipeline
text_generator = pipeline('text-generation', model='gpt2')
# generate 5 different sentences by sampling from the top 10 hits:
#Temperature is a hyper-parameter used to control the randomness of predictions by scaling the logits before applying softmax.
# when temperature is a large value (e.g. 1), the GPT-2 model produces more diversity and also more mistakes and viseversa

In [None]:
#text_generator.save_pretrained('gpt2_pretrained')

In [None]:
sentences = text_generator("in the last few years, a bunch of changes", do_sample=True, top_k=10, temperature=0.6, max_length=50, num_return_sequences=5)
for sent in sentences:
  print(sent["generated_text"])
  print("#"*50)