### data processing


In [2]:
!git clone https://github.com/rajdeep345/ECTSum.git

Cloning into 'ECTSum'...
remote: Enumerating objects: 16502, done.[K
remote: Counting objects: 100% (1268/1268), done.[K
remote: Compressing objects: 100% (824/824), done.[K
remote: Total 16502 (delta 508), reused 981 (delta 426), pack-reused 15234 (from 1)[K
Receiving objects: 100% (16502/16502), 26.46 MiB | 16.50 MiB/s, done.
Resolving deltas: 100% (6925/6925), done.
Updating files: 100% (16593/16593), done.


In [9]:
!ls ECTSum/data/final/train/ects | head
!ls ECTSum/data/final/train/gt_summaries | head

AAN_q1_2021.txt
AAP_q1_2021.txt
AAP_q2_2021.txt
AAP_q3_2021.txt
AAT_q1_2020.txt
AAT_q2_2020.txt
AAT_q2_2021.txt
AAT_q3_2020.txt
AAT_q3_2021.txt
ABC_q1_2021.txt
AAN_q1_2021.txt
AAP_q1_2021.txt
AAP_q2_2021.txt
AAP_q3_2021.txt
AAT_q1_2020.txt
AAT_q2_2020.txt
AAT_q2_2021.txt
AAT_q3_2020.txt
AAT_q3_2021.txt
ABC_q1_2021.txt


In [25]:
import os, json
from tqdm import tqdm

def build_json(split_dir, out_path):
    ect_dir = os.path.join(split_dir, "ects")
    sum_dir = os.path.join(split_dir, "gt_summaries")

    data = []
    files = sorted(os.listdir(ect_dir))

    for f in tqdm(files):
        ect_path = os.path.join(ect_dir, f)
        sum_path = os.path.join(sum_dir, f)

        if not os.path.exists(sum_path):
            continue

        with open(ect_path, "r", encoding="utf-8") as fe:
            transcript = fe.read().strip()

        with open(sum_path, "r", encoding="utf-8") as fs:
            summary = fs.read().strip()

        data.append({
            "transcript": transcript,
            "summary": summary
        })

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

build_json("ECTSum/data/final/test", "workspace/test_raw.json")
# execute with train then val and then test

100%|██████████| 495/495 [00:00<00:00, 19517.19it/s]


In [26]:
import json, random

data = json.load(open("workspace/test_raw.json"))
# execute with train then val and then test

samples = random.sample(data, 2)
for i, r in enumerate(samples, 1):
    print("="*80)
    print(f"SAMPLE {i}")
    print("TRANSCRIPT:", r["transcript"][:500])
    print("SUMMARY:", r["summary"][:500])

SAMPLE 1
TRANSCRIPT: These are important to review and contemplate.
As everyone on the call today is aware, business environment uncertainty remains heightened due to COVID-19.
These items include shutdown impacts for many areas of the economy, changes to consumer purchasing habits potential for a disruptive supply chain and various other economic factors.
This means that results could change at any time and the forecasted impact of risk consideration to the best estimate based on information available as of today's
SUMMARY: compname posts q3 revenue $238.8 million.
q3 revenue $238.8 million versus refinitiv ibes estimate of $231.5 million.
q3 non-gaap earnings per share $0.81.
q3 earnings per share $0.81.
raising full-year fiscal 2021 outlook for revenue and earnings per share.
sees 2021 revenue approximately $935 million.
sees 2021 adjusted e.p.s. approximately $3.22.
SAMPLE 2
TRANSCRIPT: For a further discussion of the risks related to our business, please see our 10-K and subsequen

In [27]:
import re

def looks_like_bad_sample(t, s):
    bad_patterns = [
        r"\bQ:\b", r"\bA:\b",
        r"\bquestion\b", r"\banswer\b",
        r"\binstruction\b", r"\brespond\b",
        r"\bchoices\b", r"\boption\b"
    ]
    txt = (t + " " + s).lower()
    return any(re.search(p, txt) for p in bad_patterns)

clean = []
dropped = 0

for r in data:
    if not r["transcript"] or not r["summary"]:
        dropped += 1; continue
    if looks_like_bad_sample(r["transcript"], r["summary"]):
        dropped += 1; continue
    clean.append(r)

print("Dropped:", dropped)

with open("workspace/test_clean.json", "w") as f:
# execute with train then val and then test
    json.dump(clean, f, ensure_ascii=False, indent=2)

Dropped: 184


In [16]:
!pip install -U transformers --quiet

In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3")

MAX_INPUT_TOKENS = 4096
MAX_SUMMARY_WORDS = 300

def truncate_tokens(text, max_tokens):
    ids = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(ids, skip_special_tokens=True)

def cap_words(text, max_words):
    words = text.split()
    return " ".join(words[:max_words])

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [28]:
import json
from tqdm import tqdm

data = json.load(open("workspace/train_clean.json"))
# execute with train then val and then test

with open("workspace/test.jsonl", "w", encoding="utf-8") as out:
    for r in tqdm(data):
        inp = truncate_tokens(r["transcript"], MAX_INPUT_TOKENS)
        outp = cap_words(r["summary"], MAX_SUMMARY_WORDS)

        final_inp = f"<FINANCIAL_REPORT>\n{inp}\n</FINANCIAL_REPORT>"
        final_out = f"<EXEC_SUMMARY>\n{outp}\n</EXEC_SUMMARY>"

        out.write(json.dumps({
            "input": final_inp,
            "output": final_out
        }, ensure_ascii=False) + "\n")

100%|██████████| 1043/1043 [00:12<00:00, 85.19it/s]


### start


In [1]:
#https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/sft_trl_lora_qlora.ipynb?utm_source=chatgpt.com

In [2]:
#from whatever important details have been mentioned in this chat, create a prompt using those to initialise anther chat because this one has started lagging. i will be using that for testing

#### IMP: this colab excludes the testing process of the trainer loop and some data preprocessing steps -> they are implemented in the other notebook

### step 1: loading model

In [3]:
!pip install -U unsloth transformers trl --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.1/381.1 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.0/557.0 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.7/295.7 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import unsloth

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [7]:
from transformers import (
    TrainingArguments,
    Trainer,
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import pandas as pd
import numpy as np

In [5]:
# # define the bnb config for loading the llm in quantised state
# config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.float16,
# )

# #load the model and pass the bnb config
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3", quantization_config=config)

In [8]:
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit",
    max_seq_length = 4096,
    dtype = torch.float16,   # safe on T4
    load_in_4bit = True,
)

==((====))==  Unsloth 2026.1.2: Fast Mistral patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [7]:
# from peft import prepare_model_for_kbit_training

# model = prepare_model_for_kbit_training(model)

# Casts layer norms to fp32 → numerical stability
# Enables gradient checkpointing → lower VRAM
# Freezes all base model parameters → only adapters can train

#loaded the model in quantised state, q is done

In [8]:
# from peft import LoraConfig

# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=8,
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )
#apply lora config, qlora is done

In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha = 8,
    lora_dropout = 0.05,
    bias="none",
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [10]:
# inject lora layers into the model
# from peft import get_peft_model

# model = get_peft_model(model, lora_config)

# must be 0.1-0.3 % of the models total parameters
model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879


### step 2: data loading

In [12]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={
        "train": "train.jsonl",
        "validation": "val.jsonl"
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

### step 3: training the model

In [14]:
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3")
# tokenizer.pad_token = tokenizer.eos_token

In [13]:

training_args = TrainingArguments(
    output_dir="./qlora-mistral-ect",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=1e-4,               # safer than 2e-4 for longer runs


    report_to="tensorboard",
    # trackio_space_id="qlora-finance", # Commented out as reporting is disabled

    logging_steps=50,                 # less noise, cleaner logs
    save_strategy="steps",
    save_steps=500,                   # periodic adapter checkpoints
    save_total_limit=2,               # keep disk clean
    eval_steps=500,                   # validate periodically
  )


In [14]:
def formatting_func(examples):
    texts = []
    for inp, out in zip(examples["input"], examples["output"]):
        texts.append(f"{inp}\n{out}")
    return texts

In [15]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    formatting_func=formatting_func,
    max_seq_length=4096,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1043 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1043 [00:00<?, ? examples/s]

In [16]:
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,043 | Num Epochs = 2 | Total steps = 262
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 13,631,488 of 7,255,363,584 (0.19% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
50,1.8928
100,1.8686
150,1.8443
200,1.8463
250,1.838


TrainOutput(global_step=262, training_loss=1.8571138454757574, metrics={'train_runtime': 6097.0458, 'train_samples_per_second': 0.342, 'train_steps_per_second': 0.043, 'total_flos': 9.114111922844467e+16, 'train_loss': 1.8571138454757574, 'epoch': 2.0})

In [17]:
model.save_pretrained("./lora-adapters")
tokenizer.save_pretrained("./lora-adapters")

('./lora-adapters/tokenizer_config.json',
 './lora-adapters/special_tokens_map.json',
 './lora-adapters/tokenizer.model',
 './lora-adapters/added_tokens.json',
 './lora-adapters/tokenizer.json')

### step 4: push to HF repo

In [18]:
from transformers import AutoTokenizer

# save tokenizer into adapter dir
tokenizer.save_pretrained("./lora-adapters")

('./lora-adapters/tokenizer_config.json',
 './lora-adapters/special_tokens_map.json',
 './lora-adapters/tokenizer.model',
 './lora-adapters/added_tokens.json',
 './lora-adapters/tokenizer.json')

In [19]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from huggingface_hub import create_repo, upload_folder

repo_id = "devanshpursnani/mistral7b-ectsum"

create_repo(repo_id, exist_ok=True)

upload_folder(
    repo_id=repo_id,
    folder_path="./lora-adapters",
    path_in_repo="."
)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...-adapters/tokenizer.model: 100%|##########|  493kB /  493kB            

  ...adapter_model.safetensors:   1%|1         |  555kB / 54.6MB            

CommitInfo(commit_url='https://huggingface.co/devanshpursnani/mistral7b-ectsum/commit/defefb238620bbd63a1e3600c27a303dd46f9e66', commit_message='Upload folder using huggingface_hub', commit_description='', oid='defefb238620bbd63a1e3600c27a303dd46f9e66', pr_url=None, repo_url=RepoUrl('https://huggingface.co/devanshpursnani/mistral7b-ectsum', endpoint='https://huggingface.co', repo_type='model', repo_id='devanshpursnani/mistral7b-ectsum'), pr_revision=None, pr_num=None)

### step 5: validate

In [25]:
!pip install -U evaluate rouge_score bert_score bleu --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for bleu (setup.py) ... [?25l[?25hdone
