In [1]:
pip install torch transformers datasets sentencepiece pandas scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ------------------- -------------------- 4.2/8.7 MB 26.1 MB/s eta 0:00:01
   ---------------------------------------- 8.7/8.7 MB 28.6 MB/s eta 0:00:00
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn

   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-lea


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd

# Paths
RAW_DIR = "./raw_data"
PROCESSED_DIR = "./processed"

os.makedirs(PROCESSED_DIR, exist_ok=True)

In [6]:
CALLS_FILE = os.path.join(RAW_DIR, r"C:\Users\dhana\Downloads\pro\call_recordings.csv")
df = pd.read_csv(CALLS_FILE)


In [7]:
def make_pseudo_summary(row):
    parts = []
    if "Type" in row and pd.notna(row["Type"]):
        parts.append(f"This was a {row['Type'].lower()} call")
    if "Sentiment" in row and pd.notna(row["Sentiment"]):
        parts.append(f"with a {row['Sentiment'].lower()} sentiment")
    if "Name" in row and pd.notna(row["Name"]):
        parts.append(f"involving {row['Name']}")
    if "Order Number" in row and pd.notna(row["Order Number"]):
        parts.append(f"regarding order {row['Order Number']}")
    return ", ".join(parts) if parts else "Customer service call summary."

In [8]:
records = []
for _, r in df.iterrows():
    src = r.get("Transcript", "")
    tgt = make_pseudo_summary(r)
    records.append({"src": src, "tgt": tgt})

out_df = pd.DataFrame(records)

In [9]:
train_df = out_df.sample(frac=0.8, random_state=42)
val_df = out_df.drop(train_df.index)

train_df.to_csv(os.path.join(PROCESSED_DIR, "train.csv"), index=False)
val_df.to_csv(os.path.join(PROCESSED_DIR, "val.csv"), index=False)

print(f"✅ Saved {len(train_df)} train and {len(val_df)} val samples in {PROCESSED_DIR}")

✅ Saved 16 train and 4 val samples in ./processed


In [20]:
"""
Fine-tune BART on your call transcripts with maximum compatibility
across different transformers versions (old/new).

Requires:
  - processed/train.csv  (columns: src, tgt)
  - processed/val.csv    (columns: src, tgt)
"""

import os
import inspect
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    BartTokenizerFast,
    BartForConditionalGeneration,
    Trainer,
    TrainingArguments,
    __version__ as TRANSFORMERS_VERSION,
)

print(f"[info] transformers version detected: {TRANSFORMERS_VERSION}")

# --------------------------
# Paths
# --------------------------
TRAIN_CSV = "./processed/train.csv"
VAL_CSV   = "./processed/val.csv"
OUT_DIR   = "./bart_finetuned"

assert os.path.exists(TRAIN_CSV), f"Missing {TRAIN_CSV}. Run your Step-1 script first."
assert os.path.exists(VAL_CSV),   f"Missing {VAL_CSV}. Run your Step-1 script first."

# --------------------------
# Load data
# --------------------------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)

# Ensure required columns
for dfname, df in [("train", train_df), ("val", val_df)]:
    for col in ("src", "tgt"):
        if col not in df.columns:
            raise ValueError(f"[error] {dfname}.csv must contain column '{col}'")

train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)

# --------------------------
# Load tokenizer & model
# --------------------------
MODEL_NAME = "facebook/bart-base"
tokenizer = BartTokenizerFast.from_pretrained(MODEL_NAME)
model     = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

# --------------------------
# Tokenization (compatible)
# --------------------------
MAX_IN, MAX_OUT = 512, 128

def preprocess(batch):
    # Encode inputs
    model_inputs = tokenizer(
        batch["src"],
        max_length=MAX_IN,
        truncation=True,
        padding="max_length",
    )

    # Encode targets; try the new API first, fallback to old
    try:
        labels = tokenizer(
            text_target=batch["tgt"],
            max_length=MAX_OUT,
            truncation=True,
            padding="max_length",
        )
    except TypeError:
        # Older transformers: use as_target_tokenizer()
        with tokenizer.as_target_tokenizer():  # type: ignore[attr-defined]
            labels = tokenizer(
                batch["tgt"],
                max_length=MAX_OUT,
                truncation=True,
                padding="max_length",
            )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_ds = train_ds.map(preprocess, batched=True, remove_columns=["src", "tgt"])
val_ds   = val_ds.map(preprocess,   batched=True, remove_columns=["src", "tgt"])

# --------------------------
# Build TrainingArguments dynamically
# --------------------------
sig = inspect.signature(TrainingArguments.__init__)
allowed = set(sig.parameters.keys()) - {"self"}

def add_if_supported(d, key, value):
    if key in allowed:
        d[key] = value

args_dict = {}
# Required
add_if_supported(args_dict, "output_dir", OUT_DIR)

# Batch size: prefer per_device*, fallback to per_gpu*
if "per_device_train_batch_size" in allowed:
    args_dict["per_device_train_batch_size"] = 2
    args_dict["per_device_eval_batch_size"]  = 2 if "per_device_eval_batch_size" in allowed else None
else:
    # Very old versions
    add_if_supported(args_dict, "per_gpu_train_batch_size", 2)
    add_if_supported(args_dict, "per_gpu_eval_batch_size", 2)

# Common safe args
add_if_supported(args_dict, "learning_rate", 5e-5)
add_if_supported(args_dict, "num_train_epochs", 3)
add_if_supported(args_dict, "weight_decay", 0.01)
add_if_supported(args_dict, "logging_dir", "./logs")
add_if_supported(args_dict, "logging_steps", 10)
add_if_supported(args_dict, "save_total_limit", 2)

# fp16 only if GPU and arg exists
if torch.cuda.is_available():
    add_if_supported(args_dict, "fp16", True)

# Evaluation/save strategies (newer versions only)
# If your version is old, these simply won't be set.
add_if_supported(args_dict, "evaluation_strategy", "epoch")
add_if_supported(args_dict, "save_strategy", "epoch")
add_if_supported(args_dict, "load_best_model_at_end", False)  # optional; keep False to avoid extra requirements

# Remove any None values (in case per_device_eval_batch_size was missing)
args_dict = {k: v for k, v in args_dict.items() if v is not None}

print("[info] TrainingArguments being used:")
for k, v in args_dict.items():
    print(f"  - {k} = {v}")

training_args = TrainingArguments(**args_dict)

# --------------------------
# Trainer
# --------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds if "evaluation_strategy" in args_dict or "evaluate_during_training" in args_dict else val_ds,  # safe to pass anyway
    tokenizer=tokenizer,
)

# --------------------------
# Train & Save
# --------------------------
trainer.train()
os.makedirs(OUT_DIR, exist_ok=True)
trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)
print(f"✅ Fine-tuned model saved to {OUT_DIR}")


[info] transformers version detected: 4.55.2


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

[info] TrainingArguments being used:
  - output_dir = ./bart_finetuned
  - per_device_train_batch_size = 2
  - per_device_eval_batch_size = 2
  - learning_rate = 5e-05
  - num_train_epochs = 3
  - weight_decay = 0.01
  - logging_dir = ./logs
  - logging_steps = 10
  - save_total_limit = 2
  - save_strategy = epoch
  - load_best_model_at_end = False


  trainer = Trainer(


Step,Training Loss
10,11.041
20,7.4367


  if is_main_process:


✅ Fine-tuned model saved to ./bart_finetuned


In [22]:
pip install streamlit pandas sqlite3 pytesseract pillow moviepy SpeechRecognition docx2txt pypdf2


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement sqlite3 (from versions: none)

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for sqlite3
