In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import shutil
import os

# Directory that may have been left behind by a previous run
outputs_dir = "outputs"

# Start from a clean slate: report when there is nothing to delete, otherwise
# remove the directory together with everything inside it.
if not os.path.exists(outputs_dir):
    print(f"Directory {outputs_dir} does not exist.")
else:
    shutil.rmtree(outputs_dir)
    print(f"Deleted directory: {outputs_dir}")


In [None]:
!pip install --no-deps packaging ninja einops trl peft accelerate bitsandbytes
!pip install transformers datasets


In [None]:
!pip install xformers==0.0.23


In [None]:
!pip install datasets transformers 


In [None]:
!pip install triton 

In [None]:
import os
# SECURITY: a Hugging Face access token used to be hardcoded on this line. Anything
# committed in a notebook must be treated as leaked -- revoke that token on the Hub.
# The token has to be supplied from outside the notebook (environment variable or a
# secrets store) so that it never ends up in the saved file or its outputs.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; private/gated Hub downloads and uploads will fail until it is provided.")


# Organized version 

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.24" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass
!pip install triton transformers
!pip install -U datasets
!pip install xformers==0.0.23


In [None]:

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

# Configuration
max_seq_length = 1024  # passed to the model loader here and to the trainer further down
dtype = None  # presumably lets the loader pick the precision for the current GPU -- TODO confirm
load_in_4bit = True  # the checkpoint loaded below is a bitsandbytes 4-bit one ("-bnb-4bit")

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Wrap the base model with LoRA adapters on the attention and MLP projection layers.
# `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# End-of-sequence marker string of the loaded tokenizer
EOS_TOKEN = tokenizer.eos_token 
# Load and prepare the dataset
# The JSON-lines file is read from the Kaggle input directory; the code below reads
# its 'train' split and the 'input' / 'output' fields of each record.
dataset_path = '/kaggle/input/cleanedqaesg/cleaned_esg_dataset.jsonl'
raw_dataset = load_dataset('json', data_files=dataset_path)

def format_dataset(example):
    """Build the single training string for one question/answer record.

    Args:
        example: Mapping with an ``'input'`` (question) and an ``'output'`` (answer) entry.

    Returns:
        dict: ``{'text': prompt}`` where the prompt is the fixed instruction, the
        question and the answer, closed with the tokenizer's end-of-sequence token.
    """
    instruction = "Answer the following question related to ESG:"
    input_text = example['input']
    output_text = example['output']
    # The answer has to be terminated with EOS_TOKEN: without it the model is never
    # shown where an answer ends and keeps generating until the length limit is hit.
    eos = EOS_TOKEN or ""  # tolerate a tokenizer without an EOS token
    formatted_input = f"{instruction}\n\nQuestion: {input_text}\n\nAnswer: {output_text}{eos}"
    return {'text': formatted_input}


# Add the 'text' prompt column, then hold out 10% of the records for evaluation.
dataset = raw_dataset['train'].map(format_dataset, batched=False)
# The split is seeded so the held-out set is identical on every run. Re-use the same
# seed wherever this file is split again; otherwise rows held out here can end up on
# the training side of the other split.
dataset = dataset.train_test_split(test_size=0.1, seed=3407)

# Mixed precision has to follow the GPU generation: compute capability 8+ (Ampere and
# newer) trains in bfloat16, older cards such as the T4 in float16. Hardcoding fp16
# conflicts with a model that was loaded in bfloat16 on newer GPUs.
use_bf16 = torch.cuda.get_device_capability()[0] >= 8

# Training Arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="./outputs_esg_final",
)

# Trainer initialization: supervised fine-tuning on the 'text' column built above
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# GPU memory usage before training
gpu_properties = torch.cuda.get_device_properties(0)
total_memory_gb = gpu_properties.total_memory / (1024 ** 3)
# High-water mark of reserved memory before training starts (weights, adapters, ...)
start_memory = torch.cuda.max_memory_reserved(0)
start_memory_gb = start_memory / (1024 ** 3)
print(f"GPU: {gpu_properties.name}, Total Memory: {total_memory_gb:.2f} GB")
print(f"Initial Memory Reserved: {start_memory_gb:.2f} GB")

# Train the model
trainer_stats = trainer.train()

# GPU memory usage after training.
# max_memory_reserved() is the peak reached so far. memory_reserved() would only
# report what is still reserved once training has finished, which is not what the
# "Peak Memory Reserved" line below claims to show.
end_memory = torch.cuda.max_memory_reserved(0)
end_memory_gb = end_memory / (1024 ** 3)
memory_used_gb = end_memory_gb - start_memory_gb
memory_used_percentage = (end_memory / gpu_properties.total_memory) * 100

# Training stats and memory usage
train_time_seconds = trainer_stats.metrics['train_runtime']
train_time_minutes = train_time_seconds / 60
print(f"Training Time: {train_time_seconds:.2f} seconds ({train_time_minutes:.2f} minutes)")
print(f"Peak Memory Reserved: {end_memory_gb:.2f} GB")
print(f"Memory Used for Training: {memory_used_gb:.2f} GB")
print(f"Memory Used Percentage: {memory_used_percentage:.2f}%")





In [None]:

# Prepare the model for inference
FastLanguageModel.for_inference(model)

# Define the sample input text for testing
# NOTE(review): the bare question is sent as the prompt; it is not wrapped in the
# "Answer the following question related to ESG: ... Question: ... Answer:" layout
# that format_dataset builds for training -- confirm this is intended.
input_text = "What are the key ESG reporting standards companies should follow?"

# Tokenize the input text
# The encoded tensors are moved to the device the model lives on.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate the output from the model
# NOTE(review): early_stopping is presumably only meaningful for beam search, and no
# num_beams is passed here -- confirm whether the flag has any effect.
outputs = model.generate(**inputs, max_length=1024, early_stopping=True)

# Decode the generated output
# Only the first returned sequence is decoded.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated response
print("Generated Response:")
print(response)

In [None]:
import os

# SECURITY: a Hugging Face access token used to be hardcoded here. Anything committed
# in a notebook must be treated as leaked -- revoke that token on the Hub. The token
# has to come from outside the notebook (environment variable or a secrets store).
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; pushing to the Hugging Face Hub will fail until it is provided.")

# Import necessary components
from unsloth import FastLanguageModel
# Save only the LoRA adapters locally
adapter_save_path = "./lora_adapters"
model.save_pretrained(adapter_save_path)
print(f"Adapters saved locally at {adapter_save_path}.")


In [None]:
# Upload the LoRA adapter weights to the Hugging Face Hub.
# A failed upload is reported and does not stop the notebook.
adapter_repo_name = "llama3-esg-8b-lora-adapters_finalchat"
try:
    model.push_to_hub(
        repo_id=f"AchrafGhribi31/{adapter_repo_name}",
        tokenizer=tokenizer,
        save_method="lora",
        token=os.environ.get("HF_TOKEN"),
    )
except Exception as push_error:
    print(f"Failed to push the LoRA adapters to Hugging Face: {push_error}")
else:
    print(f"LoRA adapters successfully pushed to Hugging Face Hub: {adapter_repo_name}")


In [None]:
# Merge the adapters into the base model (forced 4-bit) and upload the result.
# A failed upload is reported and does not stop the notebook.
merged_repo_name = "llama3-esg-8b-merged-4bit_finalchat"
try:
    model.push_to_hub_merged(
        repo_id=f"AchrafGhribi31/{merged_repo_name}",
        tokenizer=tokenizer,
        save_method="merged_4bit_forced",
        token=os.environ.get("HF_TOKEN"),
    )
except Exception as push_error:
    print(f"Failed to push the merged model (4-bit) to Hugging Face: {push_error}")
else:
    print(f"Merged model (4-bit) successfully pushed to Hugging Face Hub: {merged_repo_name}")


## Comparison: fine-tuned model vs. non-fine-tuned (base) model

In [1]:
from unsloth import FastLanguageModel

# Load the non-fine-tuned model
# Same 4-bit checkpoint that the training cell starts from.
base_model_name = "unsloth/llama-3-8b-bnb-4bit"
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=1024,
    load_in_4bit=True,  # Load the model in 4-bit precision
)

# Prepare the base model for inference
# NOTE(review): the return value of for_inference is assigned here, while other cells
# call it without using the result -- confirm it returns the model, otherwise
# base_model is overwritten with None.
base_model = FastLanguageModel.for_inference(base_model)

# Load the fine-tuned model
# Merged 4-bit repository, same id as the one pushed by the upload cell above.
fine_tuned_model_name = "AchrafGhribi31/llama3-esg-8b-merged-4bit_finalchat"
fine_tuned_model, fine_tuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name=fine_tuned_model_name,
    max_seq_length=1024,
    load_in_4bit=True,  # Load the model in 4-bit precision
)

# Prepare the fine-tuned model for inference
# NOTE(review): same question about the return value as for the base model.
fine_tuned_model = FastLanguageModel.for_inference(fine_tuned_model)





🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-08-22 14:15:18.198471: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-22 14:15:18.198583: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-22 14:15:18.302481: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.23. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.23. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

In [None]:
import torch
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from difflib import SequenceMatcher
import numpy as np
import pandas as pd

# Load the original data
dataset_path = '/kaggle/input/cleanedqaesg/cleaned_esg_dataset.jsonl'
raw_dataset = load_dataset('json', data_files=dataset_path)

def format_dataset(example):
    """Return a record's question and answer together with the combined prompt text.

    Args:
        example: Mapping with an ``'input'`` (question) and an ``'output'`` (answer) entry.

    Returns:
        dict: ``'input'`` and ``'output'`` copied from the record, plus ``'text'``
        holding instruction, question and answer in one string.
    """
    question, answer = example['input'], example['output']
    prompt = (
        "Answer the following question related to ESG:"
        f"\n\nQuestion: {question}\n\nAnswer: {answer}"
    )
    return {'input': question, 'output': answer, 'text': prompt}

# Add the prompt column, then hold out 10% of the records for evaluation.
dataset = raw_dataset['train'].map(format_dataset, batched=False)
# Without a seed every run draws a different 10%, so scores are not comparable
# between runs. The seed must also match the one used when this file is split for
# training; otherwise the "test" rows include examples the model was trained on.
dataset = dataset.train_test_split(test_size=0.1, seed=3407)
test_dataset = dataset['test']

# Ensure your model and tokenizer are on the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to generate predictions from the model
def generate_prediction(model, tokenizer, question):
    """Generate an answer to ``question`` and return only the newly generated text.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        question: Prompt text sent to the model as-is.

    Returns:
        str: Decoded continuation (at most 50 new tokens) without the prompt.
    """
    inputs = tokenizer(question, return_tensors='pt').to(device)
    prompt_length = inputs['input_ids'].shape[-1]
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        outputs = model.generate(**inputs, max_new_tokens=50)
    # generate() returns prompt + continuation for a decoder-only model. Decoding the
    # whole sequence would put the question itself into the "prediction" and distort
    # every overlap metric computed against the reference answer.
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer

# Function to calculate perplexity
def calculate_perplexity(model, tokenizer, text):
    """Return the perplexity the model assigns to ``text``.

    The text is tokenized, the token ids double as labels, and the perplexity is
    ``exp(loss)`` of the loss the model reports for that input.

    Args:
        model: Callable language model returning an object with a ``loss`` attribute.
        tokenizer: Tokenizer matching ``model``.
        text: Text to score.

    Returns:
        float: ``exp(loss)``.
    """
    encoded = tokenizer(text, return_tensors='pt').to(device)
    targets = encoded['input_ids'].clone()
    with torch.no_grad():
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            mean_loss = model(**encoded, labels=targets).loss
    return torch.exp(mean_loss).item()

# Function to calculate token-level precision, recall, and F1-score
def token_level_f1(prediction, reference):
    """Compute whitespace-token precision, recall and F1 of ``prediction`` vs ``reference``.

    Overlap is counted as a multiset intersection: a token repeated in the prediction
    is credited at most as many times as it occurs in the reference. This keeps both
    precision and recall within [0, 1].

    Args:
        prediction: Predicted text.
        reference: Reference text.

    Returns:
        tuple: ``(precision, recall, f1)``; all zero when either text has no tokens.
    """
    from collections import Counter

    pred_tokens = prediction.split()
    ref_tokens = reference.split()
    if not pred_tokens or not ref_tokens:
        return 0, 0, 0  # Avoid NaN by returning zero when there are no tokens
    # Counter & Counter keeps the minimum count per token, i.e. the true overlap
    correct = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    precision = correct / len(pred_tokens)
    recall = correct / len(ref_tokens)
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
    return precision, recall, f1

# Lists to store evaluation results
# One list per metric; every test example appends one value to each list.
base_model_results = {"f1_score": [], "bleu": [], "perplexity": []}
fine_tuned_model_results = {"f1_score": [], "bleu": [], "perplexity": []}

# Loop through the test dataset and evaluate both models
for example in test_dataset:
    question = example['input']
    reference_answer = example['output']

    # Base model predictions
    base_pred = generate_prediction(base_model, base_tokenizer, question)
    # Fine-tuned model predictions
    fine_tuned_pred = generate_prediction(fine_tuned_model, fine_tuned_tokenizer, question)

    # Debugging output for predictions
    # Three lines are printed for every test example.
    print(f"Reference: {reference_answer}")
    print(f"Base Model Prediction: {base_pred}")
    print(f"Fine-Tuned Model Prediction: {fine_tuned_pred}")

    # Token-level F1-score (including precision and recall)
    # Precision and recall are discarded; only the F1 value is stored.
    _, _, base_f1 = token_level_f1(base_pred, reference_answer)
    _, _, fine_tuned_f1 = token_level_f1(fine_tuned_pred, reference_answer)

    base_model_results["f1_score"].append(base_f1)
    fine_tuned_model_results["f1_score"].append(fine_tuned_f1)

    # BLEU Score, checking for empty predictions or references
    # Both sides are tokenized on whitespace; the score is 0 when either side is blank.
    if base_pred.strip() and reference_answer.strip():
        base_bleu = sentence_bleu([reference_answer.split()], base_pred.split())
    else:
        base_bleu = 0

    if fine_tuned_pred.strip() and reference_answer.strip():
        fine_tuned_bleu = sentence_bleu([reference_answer.split()], fine_tuned_pred.split())
    else:
        fine_tuned_bleu = 0

    base_model_results["bleu"].append(base_bleu)
    fine_tuned_model_results["bleu"].append(fine_tuned_bleu)

    # Perplexity
    # Scored on the reference answer text alone; the question is not part of the scored text.
    base_model_results["perplexity"].append(calculate_perplexity(base_model, base_tokenizer, reference_answer))
    fine_tuned_model_results["perplexity"].append(calculate_perplexity(fine_tuned_model, fine_tuned_tokenizer, reference_answer))

# Average the results across all examples in the dataset
base_model_metrics = {metric: np.mean(values) for metric, values in base_model_results.items()}
fine_tuned_model_metrics = {metric: np.mean(values) for metric, values in fine_tuned_model_results.items()}

# Display label -> key used in the result dictionaries. Deriving the key with
# label.lower() gives "f1-score" / "bleu score", which never match the stored
# "f1_score" / "bleu" keys, so those rows used to show None.
metric_keys = {"F1-Score": "f1_score", "BLEU Score": "bleu", "Perplexity": "perplexity"}
metrics = list(metric_keys)

# Print the results in a table for better visualization
metrics_comparison = pd.DataFrame({
    "Metric": metrics,
    "Base Model": [base_model_metrics[metric_keys[m]] for m in metrics],
    "Fine-Tuned Model": [fine_tuned_model_metrics[metric_keys[m]] for m in metrics]
})

# Display the table
print(metrics_comparison)


In [None]:
import torch

def generate_predictions_batch(model, tokenizer, dataset, batch_size=8, max_len=512):
    """
    Generates predictions in batches from the given dataset using the specified model and tokenizer.

    Parameters:
    - model: The language model for prediction.
    - tokenizer: The tokenizer associated with the model.
    - dataset: The examples to predict on. Either a sequence of records (each with
      an 'input' entry) or an object whose slices are a dict of columns, as a
      Hugging Face `Dataset` returns.
    - batch_size: The number of examples to process in each batch.
    - max_len: The maximum length of the generated sequence.

    Returns:
    - predictions: A list of generated predictions, one per example, in dataset order.
    """
    model.eval()  # Set the model to evaluation mode
    predictions = []
    # Follow the model's own device instead of assuming "cuda"
    target_device = getattr(model, "device", "cuda")

    with torch.no_grad():  # Disable gradient tracking
        for i in range(0, len(dataset), batch_size):
            batch_inputs = dataset[i:i + batch_size]
            # Slicing a Hugging Face Dataset yields {column: [values]}, not a list of
            # records. Iterating that dict walks over column names, and indexing
            # those with 'input' raises.
            if isinstance(batch_inputs, dict):
                input_texts = list(batch_inputs['input'])
            else:
                input_texts = [example['input'] for example in batch_inputs]

            # Tokenize the batch of input texts
            # NOTE(review): padded batches for a decoder-only model usually need
            # left padding -- confirm the tokenizer's padding_side.
            inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(target_device)

            # Generate predictions
            outputs = model.generate(**inputs, max_length=max_len, early_stopping=True)

            # Decode the generated outputs
            batch_predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            predictions.extend(batch_predictions)

    return predictions


In [None]:
# Assuming `formatted_validation_dataset` is already prepared
# NOTE(review): no assignment to `formatted_validation_dataset` is visible in this part
# of the notebook; on a fresh kernel this cell would fail unless it is defined
# somewhere else -- confirm where it comes from.
batch_size = 8
max_len = 512

# Generate predictions using the base model
base_model_predictions = generate_predictions_batch(base_model, base_tokenizer, formatted_validation_dataset, batch_size, max_len)

# Generate predictions using the fine-tuned model
fine_tuned_model_predictions = generate_predictions_batch(fine_tuned_model, fine_tuned_tokenizer, formatted_validation_dataset, batch_size, max_len)


In [None]:
# NOTE(review): `datasets.load_metric` is presumably deprecated upstream in favour of
# `evaluate.load` -- confirm that the installed `datasets` version still provides it.
from datasets import load_metric

rouge_metric = load_metric("rouge")

# Evaluate base model
# References are the 'output' field of each validation example, in dataset order.
base_model_scores = rouge_metric.compute(
    predictions=base_model_predictions,
    references=[example["output"] for example in formatted_validation_dataset]
)

# Evaluate fine-tuned model
fine_tuned_model_scores = rouge_metric.compute(
    predictions=fine_tuned_model_predictions,
    references=[example["output"] for example in formatted_validation_dataset]
)

# The table helper below reads `.mid.precision` / `.mid.recall` / `.mid.fmeasure`
# from each entry of these score objects.
print("Base Model ROUGE Scores:", base_model_scores)
print("Fine-Tuned Model ROUGE Scores:", fine_tuned_model_scores)


In [None]:
import pandas as pd

# Convert ROUGE scores to a more readable table format
def rouge_scores_to_table(base_scores, fine_tuned_scores):
    """Lay out the ROUGE scores of both models side by side.

    Args:
        base_scores: Mapping from ROUGE variant name to an object whose ``mid``
            attribute carries ``precision``, ``recall`` and ``fmeasure``.
        fine_tuned_scores: Same structure for the fine-tuned model.

    Returns:
        pandas.DataFrame: One row per ROUGE variant with the columns ``Metric``,
        ``Base Precision/Recall/F1`` and ``Fine-Tuned Precision/Recall/F1``.
    """
    stat_columns = (('Precision', 'precision'), ('Recall', 'recall'), ('F1', 'fmeasure'))
    sources = (('Base', base_scores), ('Fine-Tuned', fine_tuned_scores))

    records = []
    for metric in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum'):
        record = {'Metric': metric.upper()}
        for label, scores in sources:
            mid = scores[metric].mid
            for column, attribute in stat_columns:
                record[f'{label} {column}'] = getattr(mid, attribute)
        records.append(record)

    return pd.DataFrame(records)

# Create the table
rouge_table = rouge_scores_to_table(base_model_scores, fine_tuned_model_scores)

# Display the table
print(rouge_table)


## Evaluation of our fine-tuned 4-bit Llama 3 model

In [None]:
import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric
from tqdm import tqdm
import os

# Load the merged 4-bit fine-tuned model from the Hugging Face Hub
# NOTE(review): this repository id ends in "_V0", while the upload cell earlier in the
# notebook pushes "..._finalchat" -- confirm which checkpoint should be evaluated.
model_name = "AchrafGhribi31/llama3-esg-8b-merged-4bit_V0"
# Rebinds the `model` / `tokenizer` names used by the earlier training cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=1024,
    load_in_4bit=True,  # Load the model in 4-bit precision
    token=os.environ.get("HF_TOKEN")  # Add your HF token if the model is private
)

# Set the model to evaluation mode
model.eval()


In [None]:
from datasets import load_dataset

# Load the entire dataset
dataset_path = '/kaggle/input/chatllama/combined_esg_dataset.jsonl'
raw_dataset = load_dataset('json', data_files=dataset_path)

# Split the dataset into train and test sets (e.g., 90% train, 10% test)
dataset = raw_dataset['train'].train_test_split(test_size=0.1, seed=42)

# Use the 'test' split for evaluation
eval_dataset = dataset['test']

# Format the dataset if necessary
def format_dataset(example):
    """Expose a record's question and answer under evaluation-oriented names.

    Args:
        example: Mapping with an ``'input'`` and an ``'output'`` entry.

    Returns:
        dict: ``'input_text'`` (the question) and ``'expected_output'`` (the answer).
    """
    return {
        'input_text': example['input'],
        'expected_output': example['output'],
    }

eval_dataset = eval_dataset.map(format_dataset, batched=False)


In [None]:
!pip install rouge-score evaluate


In [None]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import re
import os 
# Load the merged 4-bit fine-tuned model from the Hugging Face Hub
# NOTE(review): the same repository is already loaded by an earlier cell of this
# section; this load repeats it -- confirm whether both cells are needed.
model_name = "AchrafGhribi31/llama3-esg-8b-merged-4bit_V0"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=1024,
    load_in_4bit=True,  # Load the model in 4-bit precision
    token=os.environ.get("HF_TOKEN")  # Add your HF token if the model is private
)

# Prepare the model for inference
FastLanguageModel.for_inference(model)

# Set the model to evaluation mode
model.eval()



In [None]:
# Load the evaluation metrics
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

# Clean the text function
def clean_text(text):
    """Strip chat/instruction markup from ``text`` and flatten it to a single line.

    Removed, in this order: ``[INST]`` / ``[/INST]`` markers, ``<<SYS>>...<</SYS>>``
    blocks (which may span lines), any remaining ``<...>`` tag, and the literal
    ``<s>``, ``</s>``, ``<pad>`` tokens. The result is stripped of surrounding
    whitespace and every run of newlines becomes one space.

    Args:
        text: Raw generated or reference text.

    Returns:
        str: The cleaned text.
    """
    removals = (
        (r'\[INST\]', 0),
        (r'\[\/INST\]', 0),
        (r'<<SYS>>.*?<</SYS>>', re.DOTALL),
        (r'<.*?>', 0),  # any other tag-like special token
    )
    for pattern, flags in removals:
        text = re.sub(pattern, '', text, flags=flags)

    for marker in ("<s>", "</s>", "<pad>"):
        text = text.replace(marker, "")

    # Strip first, then collapse newline runs into single spaces
    return re.sub(r'\n+', ' ', text.strip())

# Evaluation function
def evaluate_example(input_text, expected_output):
    """Generate an answer for one example and queue it for ROUGE/BLEU scoring.

    Args:
        input_text: Prompt sent to the model.
        expected_output: Reference answer.

    Returns:
        str: The cleaned generated answer, without the prompt.

    Raises:
        Exception: Whatever the metric objects raise while the pair is added; the
        offending texts are printed first and the error is re-raised.
    """
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=1024)
    # generate() returns prompt + continuation. Only the continuation is the model's
    # answer; scoring the echoed question against the reference distorts the metrics.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Clean up the generated and expected outputs
    generated_output_cleaned = clean_text(generated_output)
    expected_output_cleaned = clean_text(expected_output)

    # Progress/debug output. The per-character list dumps were removed because they
    # print one list element per character for every example.
    print(f"Generated: {generated_output_cleaned}")
    print(f"Expected: {expected_output_cleaned}")
    print(f"Length of generated output: {len(generated_output_cleaned)}")
    print(f"Length of expected output: {len(expected_output_cleaned)}")

    # Queue the pair for ROUGE and BLEU. add_batch takes the plural keywords; both
    # metrics receive plain strings (BLEU tokenizes internally) and BLEU takes a list
    # of references per prediction.
    try:
        rouge_metric.add_batch(predictions=[generated_output_cleaned], references=[expected_output_cleaned])
        bleu_metric.add_batch(predictions=[generated_output_cleaned], references=[[expected_output_cleaned]])
    except Exception as e:
        print(f"Error in adding metrics: {e}")
        print(f"Generated Output: {generated_output_cleaned}")
        print(f"Expected Output: {expected_output_cleaned}")
        raise

    return generated_output_cleaned

# Load and evaluate on the dataset
# NOTE(review): every record of the file is scored here (split='train' of a single
# file); if the evaluated model was trained on this file the scores are optimistic --
# confirm, or evaluate on a held-out split instead.
dataset_path = '/kaggle/input/chatllama/combined_esg_dataset.jsonl'
eval_dataset = load_dataset('json', data_files=dataset_path, split='train')

predictions, references = [], []
for example in tqdm(eval_dataset):
    input_text = example['input']
    expected_output = example['output']
    generated_output = evaluate_example(input_text, expected_output)
    
    predictions.append(generated_output)
    references.append(expected_output)

# Compute metrics over everything queued by evaluate_example
rouge_scores = rouge_metric.compute()
bleu_scores = bleu_metric.compute()

# Display results
print("ROUGE Scores:")
for key, value in rouge_scores.items():
    # evaluate's ROUGE returns one aggregated F-measure (a plain float) per variant,
    # so there is no ['f'] entry to index into.
    print(f"  {key}: {value:.4f}")

print(f"BLEU Score: {bleu_scores['bleu']:.4f}")

# Save results
results = {
    "rouge_scores": rouge_scores,
    "bleu_scores": bleu_scores,
    "predictions": predictions,
    "references": references,
}

import json
# model_name has the form "<user>/<repo>"; left as-is, the "/" makes open() look for
# a sub-directory that does not exist.
results_file = f"./{model_name.replace('/', '_')}-evaluation_results.json"
with open(results_file, "w") as f:
    json.dump(results, f, indent=4)

In [None]:
# Prepare the model for inference
FastLanguageModel.for_inference(model)


In [None]:
# Testing the fine-tuned model with a sample question
input_text = "How does a company's ESG performance impact its financial performance?"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# One sampled generation. The cell used to run an additional greedy generate() first
# whose result was never read, which doubled the run time for nothing.
response = model.generate(
    **inputs,
    max_length=1024,
    do_sample=True,   # without sampling enabled, temperature/top_k/top_p are ignored
    temperature=0.8,  # Increase temperature for more variability
    top_k=50,         # Top-k sampling
    top_p=0.9,        # Top-p (nucleus) sampling
    repetition_penalty=1.2,  # Increase repetition penalty to avoid loops
)

# Decode and print the response
decoded_response = tokenizer.decode(response[0], skip_special_tokens=True)
print(decoded_response)



In [None]:
import os

# SECURITY: a Hugging Face access token used to be hardcoded here. Anything committed
# in a notebook must be treated as leaked -- revoke that token on the Hub. The token
# has to come from outside the notebook (environment variable or a secrets store).
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; pushing to the Hugging Face Hub will fail until it is provided.")

# Merge the LoRA adapters with the base model and push directly to the Hugging Face Hub
# A failed upload is reported and does not stop the notebook.
try:
    model.push_to_hub_merged("AchrafGhribi31/llama3-esg-8b-merged", tokenizer, save_method="merged_16bit", token=os.environ.get("HF_TOKEN"))
    print("Merged model pushed to Hugging Face Hub.")
except Exception as e:
    print(f"Failed to push the merged model to Hugging Face: {e}")


In [None]:


# List of questions about ESG reporting standards and KPIs
# NOTE(review): nothing in the visible part of the notebook reads `questions` --
# confirm it is still needed, or feed it to the model in a follow-up cell.
questions = [
    "What are the key ESG reporting standards companies should follow?",
    "How can companies track and report their carbon footprint?",
    "What are the most important ESG KPIs for the financial sector?",
    "How do ESG reporting standards differ between regions?",
    "What are the benefits of following the Global Reporting Initiative (GRI) standards?",
]


## RAG Implementation


In [None]:
pip install torch transformers unsloth langchain faiss-gpu sentence-transformers 


In [None]:
pip install langchain_community

In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter

# Create the text generation pipeline using your fine-tuned model
# Uses whichever `model` / `tokenizer` are currently bound in the kernel.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,  # padding reuses the EOS id
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=300,
    temperature=0.3,
    do_sample=True
)

# Create the HuggingFacePipeline LLM
# Wraps the transformers pipeline so the LangChain chain below can use it as its LLM.
fine_tuned_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Load and process the CSV file
dataset_path = '/kaggle/input/esgdata/CompaniesDataESG.csv'  # Replace with your actual file path
loader = CSVLoader(file_path=dataset_path)
data = loader.load()

# Split the documents into smaller chunks
# Chunks are capped at 1000 characters and do not overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunked_docs = text_splitter.split_documents(data)

# Create embeddings and build the FAISS index
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
db = FAISS.from_documents(chunked_docs, embeddings)

# Connect query to FAISS index using a retriever
# Similarity search with k = 4 (presumably the four closest chunks per query -- confirm).
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4}
)

# Create the Conversational Retrieval Chain
# The chain is invoked later with a dict holding 'question' and 'chat_history'.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=fine_tuned_llm,
    retriever=retriever,
    return_source_documents=True
)


In [None]:
import sys

# (question, answer) pairs of the running conversation, handed to the retrieval chain
chat_history = []

# Console chat: keep prompting until the user types "exit"
while True:
    query = input('Prompt (type "exit" to quit): ')
    query_lower = query.lower()

    if query_lower == "exit":
        print("Exiting the chat. Goodbye!")
        break

    # Words that mark a question as being about the company data set
    company_keywords = ["company", "performance", "ESG risk", "rating", "sector", "employees", 
                        "decarbonization", "target", "turnover", "Name", "Ticker", "Sector"]
    mentions_company_data = any(kw.lower() in query_lower for kw in company_keywords)

    if not mentions_company_data:
        # General ESG question: ask the fine-tuned model directly
        inputs = tokenizer(query, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=512)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    else:
        # Company-specific question: answer through the retrieval chain
        result = qa_chain.invoke({'question': query, 'chat_history': chat_history})
        answer = result['answer']

    # Show the answer and remember the exchange for follow-up questions
    print('Answer: ' + answer + '\n')
    chat_history.append((query, answer))


In [None]:
!pip install GPUtil

from GPUtil import showUtilization as gpu_usage
gpu_usage()       

In [None]:

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Release cached GPU memory held by PyTorch and print usage before and after.

    The CUDA context is deliberately left alive. Closing it through numba
    (``cuda.close()``) tears down the context PyTorch is using in this process,
    after which the already loaded model and its tensors can no longer be used by
    the cells that follow.
    """
    import gc

    print("Initial GPU Usage")
    gpu_usage()

    # Collect unreachable Python objects first so their tensors become free blocks,
    # then hand PyTorch's cached (unused) blocks back to the driver.
    gc.collect()
    torch.cuda.empty_cache()

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()                           


In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


## BUILDING INTERFACE :

In [None]:
!pip install panel 

In [None]:
model

In [None]:
import panel as pn
from panel.chat import ChatInterface
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter

# Initialize Panel extension
pn.extension()

# NOTE(review): everything below repeats the pipeline / index / chain construction of
# the "RAG Implementation" cell statement for statement; re-running it rebuilds the
# embeddings and the FAISS index -- confirm whether the earlier objects can be reused.

# Create the text generation pipeline using your fine-tuned model
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,  # padding reuses the EOS id
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=300,
    temperature=0.3,
    do_sample=True
)

# Create the HuggingFacePipeline LLM
fine_tuned_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Load and process the CSV file for RAG
dataset_path = '/kaggle/input/esgdata/CompaniesDataESG.csv'  # Replace with your actual file path
loader = CSVLoader(file_path=dataset_path)
data = loader.load()

# Split the documents into smaller chunks for better retrieval performance
# Chunks are capped at 1000 characters and do not overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunked_docs = text_splitter.split_documents(data)

# Create embeddings and build the FAISS index
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
db = FAISS.from_documents(chunked_docs, embeddings)

# Connect the query to the FAISS index using a retriever
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4}
)

# Create the Conversational Retrieval Chain with the fine-tuned LLM
# The chat handler invokes this chain with a dict holding 'question' and 'chat_history'.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=fine_tuned_llm,
    retriever=retriever,
    return_source_documents=True
)


In [None]:
# Define the chatbot interaction function

# Conversation memory used whenever a caller does not pass its own history,
# so consecutive questions coming from the UI keep their context.
_DEFAULT_CHAT_HISTORY = []


def interact(query, chat_history=None):
    """Answer ``query`` and record the exchange in ``chat_history``.

    A query that contains any of the company-data keywords (case-insensitive
    substring match) is answered by ``qa_chain``; every other query is sent
    directly to ``model.generate``.

    Args:
        query: The user's question as a string.
        chat_history: List of ``(question, answer)`` tuples. It is passed to
            the retrieval chain and extended in place with the new exchange.
            When omitted, a shared module-level history is used.

    Returns:
        The answer text.
    """
    if chat_history is None:
        chat_history = _DEFAULT_CHAT_HISTORY

    company_keywords = ["company", "performance", "ESG risk", "rating", "sector", "employees",
                        "decarbonization", "target", "turnover", "Name", "Ticker", "Sector"]

    # Lower-case the query once instead of once per keyword.
    lowered_query = query.lower()

    if any(keyword.lower() in lowered_query for keyword in company_keywords):
        result = qa_chain.invoke({'question': query, 'chat_history': chat_history})
        answer = result['answer']
    else:
        inputs = tokenizer(query, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=512)
        # Decode only the tokens after the prompt so the answer does not echo
        # the question (the pipeline path sets return_full_text=False).
        # NOTE(review): assumes a decoder-only model whose generate() output
        # starts with the prompt tokens — confirm for the loaded model.
        prompt_length = inputs["input_ids"].shape[-1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    chat_history.append((query, answer))
    return answer

# Widgets used by the chatbot UI: a text input for the question, a primary
# Submit button, and a Markdown pane (height=300) that shows the conversation.
input_box = pn.widgets.TextInput(
    name="Enter your question:",
    placeholder="Type your question here...",
)
submit_button = pn.widgets.Button(name="Submit", button_type="primary")
output_box = pn.pane.Markdown("", height=300)

# Define the function that handles user input and updates the chat
def on_submit(event):
    """Answer the question in ``input_box`` and append it to the transcript.

    Does nothing when the input box is empty. Otherwise the question is passed
    to ``interact``, the exchange is appended to the Markdown shown in
    ``output_box``, and the input box is cleared.

    Args:
        event: Click event supplied by the button; not used.
    """
    question = input_box.value
    if not question:
        return

    reply = interact(question)
    transcript = output_box.object
    output_box.object = f"{transcript}\n\n**User:** {question}\n**Bot:** {reply}"
    # Reset the input so the next question starts from an empty box.
    input_box.value = ""

# Run on_submit whenever the Submit button is clicked.
submit_button.on_click(on_submit)

# Stack the title, the input, the button and the transcript in one column.
chat_components = [
    pn.pane.Markdown("# ESG Chatbot Interface"),
    input_box,
    submit_button,
    output_box,
]
chat_interface = pn.Column(*chat_components)

# Serve the application (do not specify port or address)
pn.serve(chat_interface)

## Dash


In [None]:
!pip install jupyter-dash


In [None]:
!pip install dash
!pip install dash-bootstrap-components



In [None]:
import os
import dash
from dash import dcc, html, Input, Output, State
import dash_bootstrap_components as dbc
from unsloth import FastLanguageModel
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter

# Dash application styled with the Bootstrap theme from dash-bootstrap-components.
bootstrap_sheets = [dbc.themes.BOOTSTRAP]
app = dash.Dash(__name__, external_stylesheets=bootstrap_sheets)

# Identifier of the fine-tuned model; tokenizer_name mirrors model_name.
model_name = tokenizer_name = "AchrafGhribi31/llama3-esg-8b-merged-4bit_V0"

# Load model and tokenizer through unsloth with 4-bit loading enabled. The
# access token is read from the HF_TOKEN environment variable (None if unset).
hf_token = os.environ.get("HF_TOKEN")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=1024,
    load_in_4bit=True,
    token=hf_token,
)

# Generation settings for the fine-tuned model: sampling enabled at
# temperature 0.3, up to 300 new tokens, repetition penalty 1.1, and the
# EOS token id reused as the padding token id.
generation_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=300,
    temperature=0.3,
    do_sample=True,
)
text_generation_pipeline = pipeline("text-generation", **generation_kwargs)

# Wrap the transformers pipeline so LangChain can use it as an LLM.
fine_tuned_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Load the CSV that backs the retrieval step.
dataset_path = '/kaggle/input/esgdata/CompaniesDataESG.csv'  # Replace with your actual file path
loader = CSVLoader(file_path=dataset_path)
data = loader.load()

# Split the loaded documents (chunk_size=1000, no overlap between chunks).
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = CharacterTextSplitter(**splitter_options)
chunked_docs = text_splitter.split_documents(data)

# Embed the chunks and index them with FAISS.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
db = FAISS.from_documents(chunked_docs, embeddings)

# Similarity retriever that returns k=4 chunks per query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={'k': 4})

# Conversational retrieval chain; results also carry the retrieved source documents.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=fine_tuned_llm,
    retriever=retriever,
    return_source_documents=True,
)

# Page layout, one Bootstrap row per element: heading, question box,
# submit button, and the container that receives the answer.
heading_col = dbc.Col(html.H1("ESG Chatbot"), className="text-center my-4")
question_col = dbc.Col(
    dcc.Textarea(
        id="user_input",
        placeholder="Ask a question...",
        style={"width": "100%", "height": 100},
    ),
    className="mb-3",
)
button_col = dbc.Col(
    dbc.Button("Submit", id="submit_button", color="primary"),
    className="mb-3 text-center",
)
output_col = dbc.Col(html.Div(id="chat_output"), className="mt-3")

app.layout = dbc.Container(
    [dbc.Row(column) for column in (heading_col, question_col, button_col, output_col)]
)

# Callback to handle user input and generate responses
@app.callback(
    Output("chat_output", "children"),
    Input("submit_button", "n_clicks"),
    State("user_input", "value"),
)
def update_chat(n_clicks, user_input):
    """Answer the question in the text area when Submit is clicked.

    Args:
        n_clicks: Click count of the Submit button; falsy (``None`` or 0)
            before the first click.
        user_input: Current text of the ``user_input`` text area; may be
            ``None`` when nothing has been typed.

    Returns:
        An empty string when there is nothing to answer, otherwise an
        ``html.Div`` containing the question and the chain's answer.
    """
    # Guard against a missing value before calling .strip(): the original
    # check raised AttributeError when the button was clicked with no text.
    if not n_clicks or not user_input or not user_input.strip():
        return ""

    # Each question is answered independently: no prior history is passed.
    # .invoke matches how the Panel cell calls the same kind of chain.
    result = qa_chain.invoke({'question': user_input, 'chat_history': []})
    answer = result['answer']

    # Display the answer in the app
    return html.Div([
        html.P(f"User: {user_input}"),
        html.P(f"Bot: {answer}"),
    ])

# Run the app in the notebook.
# `app.run` is the current entry point; `app.run_server` is obsolete in recent
# Dash releases, and dash is installed without a version pin in this notebook.
app.run(host="0.0.0.0", port=8050)
