<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/FT_RAG_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers datasets faiss-cpu evaluate -q
!pip install -U rouge_score -q
!pip install -U torch  -q

In [None]:
!pip install -U torchvision -q

## Fine-Tuning

In [2]:
!pip install -U transformers datasets faiss-cpu evaluate -q
!pip install -U rouge_score -q
!pip install -U torch  -q
!pip install -U torchvision -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ..

In [13]:
from datasets import load_dataset
# Fetch the Financial PhraseBank splits from the Hugging Face Hub, then preview
# the first training example (only a cell's final expression is displayed).
dataset = load_dataset("atrost/financial_phrasebank")
dataset["train"][0]

{'sentence': 'EBIT margin was up from 1.4 % to 5.1 % .', 'label': 2}

In [14]:
# Display the available splits together with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3100
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 776
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 970
    })
})

In [47]:
import random
import numpy as np  # Import numpy for numerical calculations
from textblob import TextBlob  # Import TextBlob for sentiment analysis



def get_sentiment_score(text):
    """Return the TextBlob polarity of a piece of text.

    Args:
        text: The input text.

    Returns:
        The polarity component of TextBlob's sentiment for ``text``,
        ranging from -1 to 1.
    """
    return TextBlob(text).sentiment.polarity


def select_diverse_questions(test_dataset, num_questions=2, score_fn=None):
    """Greedily pick rows whose sentiment scores lie far apart.

    The first pick is the row whose score is farthest from 0. Every later pick
    is the not-yet-selected row whose score is farthest from the mean score of
    the rows picked so far. Ties go to the row that appears first.

    Args:
        test_dataset: Iterable of rows; each row must expose its text under
            the 'sentence' key.
        num_questions: Maximum number of rows to select.
        score_fn: Optional callable mapping a sentence to a numeric score.
            Defaults to get_sentiment_score.

    Returns:
        A list of distinct rows (the whole row, not only the sentence) in
        selection order. It holds fewer than ``num_questions`` rows when the
        dataset is smaller than that, and is empty for an empty dataset.
    """
    if score_fn is None:
        score_fn = get_sentiment_score

    rows = list(test_dataset)

    # Score every sentence exactly once; the scores are reused in each round.
    scores = [score_fn(row['sentence']) for row in rows]

    remaining = list(range(len(rows)))
    chosen = []
    for _ in range(min(num_questions, len(rows))):
        # Reference point: mean score of what is already selected (0 at start).
        reference = sum(scores[i] for i in chosen) / len(chosen) if chosen else 0
        pick = max(remaining, key=lambda i: abs(scores[i] - reference))
        # Take the row out of the pool so it can never be selected twice.
        remaining.remove(pick)
        chosen.append(pick)

    return [rows[i] for i in chosen]

In [50]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
from evaluate import load
from sklearn.metrics import f1_score
import torch
import os
import random

import warnings
warnings.filterwarnings("ignore")


# 1. Fetch the Financial PhraseBank splits from the Hugging Face Hub
dataset = load_dataset("atrost/financial_phrasebank")

# 2. Build the tokenizer for the base checkpoint (swap in another model name if preferred)
base_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# 3. Define the tokenization function
def tokenize_function(examples):
    """Tokenize the 'sentence' field of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here. Padding is left to the DataCollatorWithPadding that is handed
    to the Trainer, so each batch is only padded as far as it needs to be
    instead of every row being padded to the model maximum.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; must contain
            the 'sentence' column.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["sentence"], truncation=True)

# 4. Tokenize every split in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Row budgets for the small demo subsets
num_train_rows = 10
num_test_rows = 2

# Keep only the first rows of the train and test splits
train_dataset = tokenized_datasets["train"].select(range(num_train_rows))
test_dataset = tokenized_datasets["test"].select(range(num_test_rows))

# Bucket the demo test sentences by their label
sentiment_groups = {}
for row in test_dataset:
    sentiment_groups.setdefault(row['label'], []).append(row['sentence'])

# Labels that actually occur in the demo test subset
sentiments = list(sentiment_groups)

# Rows picked for having sentiment scores far apart from each other
selected_questions = select_diverse_questions(test_dataset)

print('\n')
# Draw one sentence from each of two different labels; with fewer than two
# distinct labels nothing is drawn.
if len(sentiments) >= 2:
    sentiment1 = random.choice(sentiments)
    sentiments.remove(sentiment1)  # the second draw must use a different label
    sentiment2 = random.choice(sentiments)

    question1 = random.choice(sentiment_groups[sentiment1])
    question2 = random.choice(sentiment_groups[sentiment2])


# --- Rebuild plain-text test sentences from their token ids ---
# Ids of the tokenizer's structural tokens, removed before decoding.
special_token_ids = [
    tokenizer.cls_token_id,
    tokenizer.sep_token_id,
    tokenizer.pad_token_id,
    tokenizer.unk_token_id,
    tokenizer.mask_token_id,
]

# One decoded string per row of the demo test subset.
test_questions = []
for row in test_dataset:
    content_ids = [token_id for token_id in row['input_ids'] if token_id not in special_token_ids]
    test_questions.append(tokenizer.decode(content_ids, skip_special_tokens=True))


# Model and training setup
# Seed Python, NumPy and PyTorch *before* the model is built: the classification
# head is newly initialised by from_pretrained, so its starting weights depend
# on the RNG state at this point rather than on the seed given to
# TrainingArguments further down.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Collator that pads the examples of each batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./financial_sentiment_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    num_train_epochs=1,  # superseded by max_steps below
    weight_decay=0.01,
    report_to="none",  # or "wandb" if you're using Weights & Biases
    use_cpu=True,  # train on CPU even when an accelerator is available
    seed=seed,  # same seed as used for the RNGs above
    logging_strategy="steps",  # Log every 'logging_steps'
    logging_steps=10,          # Log every 10 steps
    max_steps=100,  # hard cap on optimizer steps; takes precedence over num_train_epochs
    save_strategy="epoch",    # Save the model every epoch
)

# Function to compute metrics
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as handed over by the Trainer.

    Returns:
        A dict with the single key "f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=-1)
    return {"f1": f1_score(gold, predicted, average="weighted", zero_division=0)}

# Create the Trainer on the tokenized datasets.
# NOTE(review): the full "train" and "test" splits are passed here, not the
# train_dataset / test_dataset subsets selected earlier in this cell; the
# amount of training is bounded by max_steps instead. Confirm this is intended.
# NOTE(review): recent transformers releases appear to deprecate the
# `tokenizer` argument in favour of `processing_class` - confirm against the
# installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune, then write the final model to the directory that the integration
# cell later loads from.
trainer.train()
trainer.save_model("./financial_sentiment_model")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.






Epoch,Training Loss,Validation Loss,F1
0,0.7722,0.807259,0.553284


In [51]:
# Inspect the small demo test subset, including the columns added by tokenization.
test_dataset

Dataset({
    features: ['sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2
})

## RAG

In [52]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# --- RAG Example (Question Answering on Financial Reports) ---

# Assuming test_questions is already defined and contains the extracted test questions

# RAG setup: the tokenizer and the question-answering model are loaded from the
# same checkpoint and wrapped in a single question-answering pipeline.
qa_checkpoint = "distilbert-base-uncased-distilled-squad"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
qa_pipeline = pipeline("question-answering", tokenizer=qa_tokenizer, model=qa_model)

def rag_answer(question, reports):
    """Answer a question from whichever report yields the highest QA score.

    Args:
        question: The question to ask.
        reports: Mapping of report name to report text; every text is used as
            a context for the question-answering pipeline.

    Returns:
        The answer text of the highest-scoring result (the first one on ties),
        or "No answer found." when there are no reports or no result scores
        above 0.
    """
    fallback = {"answer": "No answer found.", "score": 0}
    # Query the pipeline once per report, in the mapping's iteration order.
    hits = (qa_pipeline(question=question, context=text) for text in reports.values())
    top = max(hits, key=lambda hit: hit["score"], default=fallback)
    if top["score"] > fallback["score"]:
        return top["answer"]
    return fallback["answer"]

# Context passages for the QA model: the two decoded test sentences
financial_reports = {"report1": test_questions[0], "report2": test_questions[1]}

# Question aimed at the content of test_questions[0]
question1 = "What will the works include?"
answer1 = rag_answer(question1, financial_reports)
print(f"Question: {question1}\nAnswer: {answer1}")

# Question aimed at the content of test_questions[1]
question2 = "Where is Teleste listed?"
answer2 = rag_answer(question2, financial_reports)
print(f"Question: {question2}\nAnswer: {answer2}")

Device set to use cuda:0


Question: What will the works include?
Answer: 30 offices worldwide and is listed on the nordic exchange in helsinki.
Question: Where is Teleste listed?
Answer: nordic exchange


## RAG and Fine-Tuning Integration

In [53]:
# ... (other imports and code) ...

# --- 3. Integration Example (Combining Fine-Tuning and RAG) ---

def integrated_analysis(news_text, question, reports, sentiment_model, sentiment_tokenizer):
    """Classify the sentiment of a news text and answer a question from reports.

    Args:
        news_text: Text whose sentiment is classified.
        question: Question forwarded to rag_answer.
        reports: Mapping of report name to report text, forwarded to rag_answer.
        sentiment_model: Sequence-classification model; its output must expose
            ``logits``.
        sentiment_tokenizer: Tokenizer matching ``sentiment_model``.

    Returns:
        A dict with the keys "sentiment" (one of "Negative", "Neutral",
        "Positive") and "rag_answer" (the answer text from rag_answer).
    """
    # Sentiment analysis.
    # Truncate so that a text longer than the model's maximum input length
    # cannot make the forward pass fail.
    inputs = sentiment_tokenizer(news_text, return_tensors="pt", truncation=True)

    # Run on whichever device the model lives on (when it reports one).
    device = getattr(sentiment_model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # Position in this list = class id predicted by the model.
    # NOTE(review): assumes the dataset encodes 0/1/2 as negative/neutral/positive - confirm.
    sentiment_labels = ["Negative", "Neutral", "Positive"]
    sentiment = sentiment_labels[predicted_class]

    # RAG question answering
    rag_answer_text = rag_answer(question, reports)

    return {"sentiment": sentiment, "rag_answer": rag_answer_text}

# Load the fine-tuned model (if trained).
# Only the loading step is guarded: an OSError raised later, during inference,
# must not be reported as a missing model.
try:
    loaded_sentiment_model = AutoModelForSequenceClassification.from_pretrained("./financial_sentiment_model")
    loaded_sentiment_tokenizer = AutoTokenizer.from_pretrained("./financial_sentiment_model")
except OSError:
    print("Fine-tuned model not found. Please train the sentiment model first.")
else:
    # --- Align news_examples with test_questions ---
    news_examples = [
        "Construction firm awarded contract for laying natural stone pavements and installing underground heating systems.",  # Related to test_questions[0]
        "Telecom company Teleste reports strong growth in Nordic markets and expansion of global offices.",  # Related to test_questions[1]
    ]

    # The report contexts are the same for every example, so build them once.
    financial_reports = {
        "report1": test_questions[0],
        "report2": test_questions[1]
    }

    # Pair each news text with its test sentence; zip stops at the shorter
    # list instead of raising when the two lengths differ.
    for news_example, question_example in zip(news_examples, test_questions):
        analysis_result = integrated_analysis(news_example, question_example, financial_reports, loaded_sentiment_model, loaded_sentiment_tokenizer)

        print('\n')
        print(f"News: {news_example}")
        print(f"Sentiment: {analysis_result['sentiment']}")
        print(f"Question: {question_example}")
        print(f"RAG Answer: {analysis_result['rag_answer']}")



News: Construction firm awarded contract for laying natural stone pavements and installing underground heating systems.
Sentiment: Neutral
Question: the works will include the laying of natural stone pavements and the installation of underground heating, and surface water drainage systems.
RAG Answer: underground heating, and surface water drainage systems


News: Telecom company Teleste reports strong growth in Nordic markets and expansion of global offices.
Sentiment: Neutral
Question: teleste has some 30 offices worldwide and is listed on the nordic exchange in helsinki.
RAG Answer: teleste has some 30 offices worldwide
