In [2]:
from pathlib import Path
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling


# === Config ===
# Source filing; resolved to an absolute path so the log line below shows
# exactly which file is about to be read.
file_path = Path("../data/domain_corpus/sec_filings/Apple_10-K_2023.txt").resolve()
# Checkpoint name passed to the tokenizer loader further down.
model_checkpoint = "bert-base-uncased"  # or try "distilbert-base-uncased"

print(f"🔍 Looking for file at: {file_path}")

# === Load and read text ===
# Read the whole filing into memory as a single UTF-8 string.
raw_text = file_path.read_text(encoding="utf-8")

print(f"✅ Loaded {len(raw_text):,} characters")

# === Split into chunks ===
# Fixed-width, non-overlapping slices of the raw string; the last slice may be
# shorter than chunk_size.
# NOTE(review): chunk_size counts *characters*, whereas tokenize() pads every
# example to 512 *tokens*. A 512-character slice likely encodes to far fewer
# tokens, so most positions would be padding — confirm and consider chunking
# by token count instead.
chunk_size = 512
chunks = [
    raw_text[start:start + chunk_size]
    for start in range(0, len(raw_text), chunk_size)
]

print(f"📦 Total chunks: {len(chunks)}")

# === Wrap chunks in dicts ===
# One record per chunk, keyed by "text" (the column tokenize() reads).
data = [dict(text=piece) for piece in chunks]

# === Convert to HuggingFace dataset ===
dataset = Dataset.from_list(data)

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# === Tokenize chunks ===
def tokenize(example, max_length=512):
    """Tokenize the "text" entry of a dataset example (or batch of examples).

    Relies on the module-level ``tokenizer``. Every sequence is truncated and
    padded to exactly ``max_length`` tokens.

    Args:
        example: Mapping with a "text" entry. When called through
            ``dataset.map(..., batched=True)`` this is a batch, so "text"
            holds a list of strings.
        max_length: Target sequence length in tokens. Defaults to 512, the
            value that was previously hard-coded.

    Returns:
        The tokenizer's encoding for the text, including the
        ``special_tokens_mask`` requested below.
    """
    # No masking and no labels are produced here: this only encodes the text.
    # MLM masking/labels are created later, at batch time, by the data collator;
    # the special-tokens mask is returned so special/padding positions can be
    # excluded from masking.
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_special_tokens_mask=True,
    )


# Collator for masked-language-model training: masking is applied when batches
# are assembled, with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Encode every chunk; batched=True hands tokenize() lists of texts at a time.
tokenized_dataset = dataset.map(tokenize, batched=True)

# Return tensors (instead of Python lists) when rows are accessed.
tokenized_dataset.set_format("torch")

# Save under the project-level ../data directory, consistent with where the
# source corpus is read from. The previous bare "data/tokenized/..." path was
# relative to the notebook's working directory and therefore wrote to a
# separate data/ folder next to the notebook.
tokenized_output_dir = Path("../data/tokenized/sec_apple_10k").resolve()
tokenized_dataset.save_to_disk(str(tokenized_output_dir))
print("✅ Tokenized dataset saved!")


🔍 Looking for file at: C:\Users\Darryl Carpenter\Coding\VScodeProjects\DomainLLMChatbot\data\domain_corpus\sec_filings\Apple_10-K_2023.txt
✅ Loaded 5,704 characters
📦 Total chunks: 12


Map: 100%|██████████| 12/12 [00:00<00:00, 139.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12/12 [00:00<00:00, 660.31 examples/s]

✅ Tokenized dataset saved!





In [6]:
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM

# === Load Fine-tuned Model ===
# Directory holding the fine-tuned weights and tokenizer files.
model_path = "../models/finetuned_model"

# local_files_only=True: load strictly from model_path, never from the Hub.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
)
model = AutoModelForMaskedLM.from_pretrained(
    model_path,
    local_files_only=True,
)

# === Load Pipeline ===
nlp_fill = pipeline(task="fill-mask", model=model, tokenizer=tokenizer)

# === Test Sentences ===
# Each sentence contains exactly one [MASK] for the model to fill in.
sentences = [
    "Apple Inc. reported a net income of [MASK] million dollars in Q4.",
    "The company expects [MASK] growth in the next fiscal year.",
    "Operating expenses were reduced by [MASK] percent compared to last quarter.",
]

# === Run Prediction ===
# Print every candidate completion with its score, one sentence at a time.
for text in sentences:
    print(f"\n🔍 Input: {text}")
    for candidate in nlp_fill(text):
        print(f"👉 {candidate['sequence']} (score: {candidate['score']:.4f})")


Device set to use cpu



🔍 Input: Apple Inc. reported a net income of [MASK] million dollars in Q4.
👉 apple inc. reported a net income of 10 million dollars in q4. (score: 0.0303)
👉 apple inc. reported a net income of 100 million dollars in q4. (score: 0.0292)
👉 apple inc. reported a net income of one million dollars in q4. (score: 0.0279)
👉 apple inc. reported a net income of 1 million dollars in q4. (score: 0.0224)
👉 apple inc. reported a net income of 3 million dollars in q4. (score: 0.0181)

🔍 Input: The company expects [MASK] growth in the next fiscal year.
👉 the company expects significant growth in the next fiscal year. (score: 0.1884)
👉 the company expects further growth in the next fiscal year. (score: 0.1548)
👉 the company expects continued growth in the next fiscal year. (score: 0.1155)
👉 the company expects rapid growth in the next fiscal year. (score: 0.0747)
👉 the company expects additional growth in the next fiscal year. (score: 0.0424)

🔍 Input: Operating expenses were reduced by [MASK] percen

In [7]:
# === Batch Evaluation ===
# Additional single-[MASK] prompts run through the fill-mask pipeline.
examples = [
    "Revenue increased by [MASK] percent compared to last year.",
    "The company reported a [MASK] net income in Q3.",
    "Apple’s earnings per share were [MASK].",
    "Cash flows from operations totaled [MASK] billion dollars.",
    "Total liabilities decreased by [MASK] percent.",
]

# Echo each prompt, then list every candidate completion with its score.
for text in examples:
    print(f"\n🔍 Input: {text}")
    for candidate in nlp_fill(text):
        print(f"👉 {candidate['sequence']} (score: {candidate['score']:.4f})")



🔍 Input: Revenue increased by [MASK] percent compared to last year.
👉 revenue increased by 20 percent compared to last year. (score: 0.0429)
👉 revenue increased by 10 percent compared to last year. (score: 0.0328)
👉 revenue increased by 25 percent compared to last year. (score: 0.0301)
👉 revenue increased by 30 percent compared to last year. (score: 0.0283)
👉 revenue increased by 15 percent compared to last year. (score: 0.0266)

🔍 Input: The company reported a [MASK] net income in Q3.
👉 the company reported a low net income in q3. (score: 0.1501)
👉 the company reported a stable net income in q3. (score: 0.0987)
👉 the company reported a net net income in q3. (score: 0.0526)
👉 the company reported a high net income in q3. (score: 0.0382)
👉 the company reported a quarterly net income in q3. (score: 0.0365)

🔍 Input: Apple’s earnings per share were [MASK].
👉 apple ’ s earnings per share were zero. (score: 0.0451)
👉 apple ’ s earnings per share were low. (score: 0.0231)
👉 apple ’ s earnin

In [8]:
# Interactive fill-mask loop: keeps asking for prompts until the user types
# 'exit' (any case, surrounding whitespace ignored) or the input is closed.

# Ask the pipeline's own tokenizer for the mask token instead of hard-coding it,
# so the check below stays correct for the loaded model.
mask_token = nlp_fill.tokenizer.mask_token

while True:
    try:
        prompt = input("\n💬 Ask your model (type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input box closed or kernel interrupted: leave the loop cleanly.
        break
    if prompt.lower() == "exit":
        break
    if mask_token not in prompt:
        print(f"⚠️ Please include {mask_token} in your prompt.")
        continue
    results = nlp_fill(prompt)
    # A prompt with one mask yields a flat list of candidate dicts; a prompt
    # with several masks yields one such list per mask. Normalise to the
    # nested form so indexing a candidate never hits a list by mistake.
    if results and isinstance(results[0], list):
        per_mask = results
    else:
        per_mask = [results]
    for position, candidates in enumerate(per_mask, start=1):
        if len(per_mask) > 1:
            # Label each group so the user can tell which mask it belongs to.
            print(f"{mask_token} #{position}:")
        for res in candidates:
            print(f"🧠 {res['sequence']} (confidence: {res['score']:.4f})")


🧠 cash flow was two billion dollars. (confidence: 0.1247)
🧠 cash flow was a billion dollars. (confidence: 0.1210)
🧠 cash flow was one billion dollars. (confidence: 0.1073)
🧠 cash flow was three billion dollars. (confidence: 0.0859)
🧠 cash flow was five billion dollars. (confidence: 0.0716)
🧠 cash flow past 10 years was 10 billion dollars. (confidence: 0.0534)
🧠 cash flow past 10 years was 1 billion dollars. (confidence: 0.0340)
🧠 cash flow past 10 years was one billion dollars. (confidence: 0.0331)
🧠 cash flow past 10 years was 2 billion dollars. (confidence: 0.0317)
🧠 cash flow past 10 years was 3 billion dollars. (confidence: 0.0294)
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in your prompt.
⚠️ Please include [MASK] in 