<a href="https://colab.research.google.com/github/cmorris2945/DistillBERT_sentiment_analysis_LLM/blob/main/DistillBERT_sentiment_analysis_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## First install the modules that are not preinstalled in the Jupyter/Colab runtime.

!pip install transformers datasets evaluate

!pip install langchain




In [None]:

## These modules provide the classes, methods and functions used to
## get data, tokenize, train and so forth, and also to log in to the Hugging Face Hub.
# Logging in is important because it lets us access models and datasets directly from the Hub.
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from huggingface_hub import login
from langchain import PromptTemplate, LLMChain

# Log in to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that no
# secret is stored in the notebook; when the variable is unset, token=None is
# passed and the library falls back to its own token prompt.
# SECURITY: a token was previously hard-coded on this line and has therefore
# been published with the notebook -- revoke it in the Hugging Face settings.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Now here I am defining a custom sequential chain class.
# This class will help me run the data through a series of processing steps.
class SequentialChain:
    """Minimal pipeline runner that feeds each step's output into the next step."""

    def __init__(self, chains):
        # ``chains`` is an ordered list of dicts; the callable stored under the
        # "function" key of each dict is what ``run`` invokes.
        self.chains = chains

    def run(self):
        """Execute every step in order and return the last step's result.

        The first step is called with ``None``. When there are no steps the
        method returns ``None``.
        """
        result = None
        for step in self.chains:
            stage = step["function"]
            result = stage(result)
        return result



# Now, I'll create a LangChain component for loading our dataset.
class DataLoader:
    """Pipeline step that loads a dataset by name through ``load_dataset``."""

    def __init__(self, dataset_name):
        # Name handed to ``load_dataset`` when the step is called.
        self.dataset_name = dataset_name

    def __call__(self, _):
        """Load and return the dataset; the incoming pipeline value is ignored."""
        name = self.dataset_name
        return load_dataset(name)


# The Yelp review dataset is the one chosen for this project.
data_loader = DataLoader(dataset_name="yelp_review_full")



# Now I need a function to tokenize our text data so the model can understand it.
# This function will convert the text into tokens, which are the basic units the model works with.
def tokenize_function(examples, tokenizer):
    """Run ``tokenizer`` over the ``"text"`` entry of ``examples``.

    The tokenizer is asked for ``"max_length"`` padding with truncation
    enabled, and whatever it returns is passed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded





In [None]:
# Define LangChain components for tokenization
class Tokenizer:
    """Pipeline step that tokenizes a dataset with a pretrained tokenizer."""

    def __init__(self, tokenizer_name):
        # Fetch the pretrained tokenizer identified by ``tokenizer_name``.
        pretrained = AutoTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer = pretrained

    def __call__(self, dataset):
        """Return the result of mapping batched tokenization over ``dataset``."""

        def _encode_batch(examples):
            return tokenize_function(examples, self.tokenizer)

        return dataset.map(_encode_batch, batched=True)


tokenizer = Tokenizer(tokenizer_name="distilbert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:


# Define the model training class
class ModelTrainer:
    """Pipeline step that fine-tunes a sequence-classification model and evaluates it.

    Args:
        model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        output_dir: Directory handed to ``TrainingArguments`` as ``output_dir``.
        num_labels: Number of labels for the classification head.
        save_dir: Directory the trained model is saved to after training.
            Defaults to ``"fine_tuned_model"``.
        num_train_epochs: Number of training epochs.
        train_size: Maximum number of examples taken from the ``"train"`` split.
        eval_size: Maximum number of examples taken from the ``"test"`` split.
        seed: Seed used when shuffling both splits before sub-sampling.
    """

    def __init__(
        self,
        model_name,
        output_dir,
        num_labels,
        *,
        save_dir="fine_tuned_model",
        num_train_epochs=5,
        train_size=500,
        eval_size=500,
        seed=42,
    ):
        # Imported locally: the notebook's import cell does not bring torch
        # into scope, and it is only needed for the CUDA check below.
        import torch

        # Load the pre-trained checkpoint with a classification head of
        # ``num_labels`` outputs.
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        self.save_dir = save_dir
        self.train_size = train_size
        self.eval_size = eval_size
        self.seed = seed

        # Training configuration: evaluate once per epoch, batch size 16 for
        # both training and evaluation.
        # NOTE(review): newer transformers releases rename
        # ``evaluation_strategy`` to ``eval_strategy`` -- confirm against the
        # installed version.
        self.training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            # Mixed precision is only requested when a CUDA device is present;
            # asking for fp16 on a CPU-only runtime makes the setup fail.
            fp16=torch.cuda.is_available(),
        )

        # Accuracy metric used by ``compute_metrics``.
        self.metric = evaluate.load("accuracy")

    @staticmethod
    def _subset(dataset, size, seed):
        """Return a shuffled subset holding at most ``size`` examples of ``dataset``."""
        # Clamp to the dataset length so splits smaller than ``size`` still work.
        count = min(size, len(dataset))
        return dataset.shuffle(seed=seed).select(range(count))

    def compute_metrics(self, eval_pred):
        """Turn ``(logits, labels)`` into the metric's result dict.

        The predicted class is the arg-max over the last axis of the logits.
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return self.metric.compute(predictions=predictions, references=labels)

    def __call__(self, tokenized_datasets):
        """Train on a subset of ``"train"``, save the model, evaluate on a subset of ``"test"``.

        Args:
            tokenized_datasets: Mapping with ``"train"`` and ``"test"`` entries
                that support ``shuffle``, ``select`` and ``len``.

        Returns:
            The evaluation results returned by ``Trainer.evaluate`` (also printed).
        """
        # Small subsets keep training and evaluation quick.
        small_train_dataset = self._subset(tokenized_datasets["train"], self.train_size, self.seed)
        small_eval_dataset = self._subset(tokenized_datasets["test"], self.eval_size, self.seed)

        # The transformers Trainer handles the training loop.
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=small_train_dataset,
            eval_dataset=small_eval_dataset,
            compute_metrics=self.compute_metrics,
        )

        trainer.train()
        # Save the trained model to ``save_dir`` (kept separate from the
        # Trainer's ``output_dir``).
        trainer.save_model(self.save_dir)

        results = trainer.evaluate()
        print(results)
        return results

# Training step: "distilbert-base-cased" with 5 labels, training output in "test_trainer".
model_trainer = ModelTrainer(model_name="distilbert-base-cased", output_dir="test_trainer", num_labels=5)

# Assemble the pipeline: load the data -> tokenize it -> train and evaluate the model.
chain = SequentialChain(
    chains=[
        dict(function=data_loader, name="Data Loader"),
        dict(function=tokenizer, name="Tokenizer"),
        dict(function=model_trainer, name="Model Trainer"),
    ]
)

# Execute every step in order; ``results`` is whatever the last step returns.
results = chain.run()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.358729,0.376
2,No log,1.095432,0.518
3,No log,1.080273,0.554
4,No log,1.136171,0.564
5,No log,1.133183,0.56


{'eval_loss': 1.1331825256347656, 'eval_accuracy': 0.56, 'eval_runtime': 3.2746, 'eval_samples_per_second': 152.692, 'eval_steps_per_second': 9.772, 'epoch': 5.0}


In [None]:
# Importing the necessary classes from transformers and LangChain.
# I am going to use these to load my fine-tuned model, tokenize the input text, and define the inference process.

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from langchain import PromptTemplate

# Tokenizer: the same pretrained "distilbert-base-cased" tokenizer used for preprocessing.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
# Model: the fine-tuned checkpoint stored in the local "fine_tuned_model" directory.
model = AutoModelForSequenceClassification.from_pretrained("fine_tuned_model")

# Define the inference class.
# This class will handle the process of making predictions with our fine-tuned model.
class Inference:
    """Callable that predicts a class index for a piece of text.

    Args:
        model: Callable model; invoked with the tokenizer's output as keyword
            arguments and expected to return an object with a ``logits`` tensor.
        tokenizer: Callable that turns text into a mapping of PyTorch tensors.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, text):
        """Return the arg-max class index of the model's logits for ``text``.

        For a batch of texts, only the prediction for the first item is returned.
        """
        # Imported locally: the notebook's import cells do not bring torch
        # into scope, although the tokenizer is asked for "pt" tensors.
        import torch

        # Tokenize the input text, padding and truncating as necessary.
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        # When the model exposes the device it lives on, move the inputs there
        # so a GPU-resident model can be used as well.
        device = getattr(self.model, "device", None)
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # No gradients are needed for prediction; skipping autograd saves
        # memory and time.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # ``.cpu()`` makes the NumPy conversion work for GPU tensors too.
        logits = outputs.logits.detach().cpu().numpy()
        predictions = np.argmax(logits, axis=-1)
        return predictions[0]



In [None]:
# LangChain prompt template that wraps the user's review text before it is
# handed to the model.
# NOTE(review): training tokenizes the raw "text" field (see tokenize_function),
# so this prefix is text the classifier presumably never saw during
# fine-tuning -- confirm it does not hurt prediction quality.
template = PromptTemplate(
    template="Classify the sentiment of this review: {text}",
    input_variables=["text"],
)

# Prediction step built from the loaded model and tokenizer.
inference = Inference(model, tokenizer)

# Create a custom sequence for inference.
# This class will handle the process of formatting the input and getting predictions from the model.
class CustomChain:
    """Two-step chain: format the input with a template, then call the model on it."""

    def __init__(self, template, llm):
        # ``llm`` is any callable that accepts the formatted prompt;
        # ``template`` must offer ``format(text=...)``.
        self.llm = llm
        self.template = template

    def run(self, input_data):
        """Format ``input_data["text"]`` with the template and return the model's result."""
        review_text = input_data["text"]
        formatted = self.template.format(text=review_text)
        return self.llm(formatted)

# Wire the prompt template and the inference callable together.
custom_chain = CustomChain(template, inference)

# A handful of sample reviews for a quick look at how the model behaves.
texts = [
    "This is a wonderful place to eat!",
    "The food was terrible and the service was worse.",
    "It was okay, nothing special.",
    "I had a great time and the staff was very friendly.",
    "I would not recommend this place to anyone.",
]

# Classify each sample and print the review next to its predicted label.
for text in texts:
    result = custom_chain.run({"text": text})
    print(f"Review: {text}", f"Predicted sentiment: {result}\n", sep="\n")


Review: This is a wonderful place to eat!
Predicted sentiment: 4

Review: The food was terrible and the service was worse.
Predicted sentiment: 1

Review: It was okay, nothing special.
Predicted sentiment: 1

Review: I had a great time and the staff was very friendly.
Predicted sentiment: 4

Review: I would not recommend this place to anyone.
Predicted sentiment: 0

