In [1]:
# ---------------------------------- Libraries ----------------------------------
!pip install -q transformers accelerate torch datasets peft trl openai scikit-learn gradio

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/423.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

# ---------------------------------- Imports ----------------------------------
from datasets import load_dataset, Dataset, DatasetDict
from google.colab import userdata
from huggingface_hub import login, notebook_login
from IPython.display import display, Markdown
import json
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import pandas as pd
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import time
import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
)
from trl import SFTTrainer


In [3]:
# ---------------------------------- HuggingFace Connection ----------------------------------
# Token lookup order: the HF_TOKEN environment variable first, then Colab secrets.
hf_token = os.environ.get('HF_TOKEN')
if not hf_token:
    try:
        hf_token = userdata.get('HF_TOKEN')
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # userdata.get raises instead of returning None when the secret is
        # missing or the notebook has not been granted access to it. Treat
        # both as "no token" so the guidance message below is reachable.
        hf_token = None

# Authenticate with the Hub when a token was found; otherwise tell the user how to supply one.
if hf_token:
    login(token=hf_token)
    print("HuggingFace login successful.")
else:
    print("HuggingFace token not found. Please set the HF_TOKEN environment variable or store it in Colab secrets.")

HuggingFace login successful.


In [4]:
# ---------------------------------- Use GPU ----------------------------------
# Report which device will be used; when CUDA is present, make it the default
# device for newly created tensors.
if not torch.cuda.is_available():
    print("WARNING: No GPU detected. Running on CPU")
else:
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    # From here on, tensors created without an explicit device land on the GPU
    torch.set_default_device("cuda")
    print("PyTorch default device set to CUDA (GPU).")

GPU detected: Tesla T4
PyTorch default device set to CUDA (GPU).


In [5]:
# ---------------------------------- Functions ----------------------------------
# Markdown rendering helper
def print_markdown(text):
    """Render ``text`` as Markdown in the cell output.

    Args:
        text: Markdown source to wrap in an IPython ``Markdown`` object.

    Returns:
        None. The rendered object is passed to ``display``.
    """
    rendered = Markdown(text)
    display(rendered)



In [12]:
# ---------------------------------- Load Dataset ----------------------------------
# Dataset viewer: https://huggingface.co/datasets/Daniel-ML/sentiment-analysis-for-financial-news-v2/viewer
dataset_id = "Daniel-ML/sentiment-analysis-for-financial-news-v2"

# Request the "train" split of the dataset identified above
labeled_dataset = load_dataset(path=dataset_id, split="train")
print("Dataset loaded successfully!")


Dataset loaded successfully!


In [13]:
# ---------------------------------- View Dataset ----------------------------------
# Summary repr of the dataset object
print("\n--- Dataset Information ---", labeled_dataset, sep="\n")

# Column schema
print("\n--- Dataset Features ---", labeled_dataset.features, sep="\n")

# Distinct values found in the 'sentiment' column
labels = labeled_dataset.to_pandas()["sentiment"].unique().tolist()
print("\n--- Unique Labels ---", f"Unique labels in the dataset: {labels}", sep="\n")


--- Dataset Information ---
Dataset({
    features: ['sentiment', 'text'],
    num_rows: 4846
})

--- Dataset Features ---
{'sentiment': Value('string'), 'text': Value('string')}

--- Unique Labels ---
Unique labels in the dataset: ['neutral', 'negative', 'positive']





--- Dataset Information ---  
Dataset({  
    features: ['sentiment', 'text'],  
    num_rows: 4846  
})  
  
--- Dataset Features ---  
{'sentiment': Value('string'), 'text': Value('string')}  
  
--- Unique Labels ---  
Unique labels in the dataset: ['neutral', 'negative', 'positive']  


In [14]:
# Preview the first five rows as a pandas DataFrame, with the text column shown first
display(
    labeled_dataset
    .select(range(5))
    .to_pandas()
    .loc[:, ["text", "sentiment"]]
)

Unnamed: 0,text,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


**text**  **sentiment**  
**0**  According to Gran , the company has no plans t...  neutral  
**1**  Technopolis plans to develop in stages an area...  neutral  
**2**  The international electronic industry company ...  negative  
**3**  With the new production plant the company woul...  positive  
**4**  According to the company 's updated strategy f...  positive  

In [15]:
# ---------------------------------- Split Dataset into Train / Test Dataset ----------------------------------
# Fraction of rows held out for the test set; the remainder is used for training.
train_test_split_ratio = 0.10

# Build the status line from the ratio so the printed percentages cannot drift
# from the split that is actually performed.
print(
    f"\nSplitting data into Train ({1 - train_test_split_ratio:.0%}) "
    f"and Test ({train_test_split_ratio:.0%})..."
)

# Set seed for reproducibility
seed = 42

# Use the datasets built-in splitter. Shuffle before splitting so neither
# partition reflects the original ordering of the samples.
split_dataset = labeled_dataset.train_test_split(
    test_size=train_test_split_ratio, seed=seed, shuffle=True,
)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# Sanity check: the two partitions together have as many rows as the source dataset.
assert len(train_dataset) + len(test_dataset) == len(labeled_dataset), (
    "Train/test row counts do not add up to the full dataset"
)

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")
print("\nTrain/Test Split Complete.")


Splitting data into Train (90%) and Test (10%)...
Training set size: 4361
Test set size: 485

Train/Test Split Complete.





Splitting data into Train (90%) and Test (10%)...  
Training set size: 4361  
Test set size: 485  
  
Train/Test Split Complete.  
