In [2]:
# Dataset Loader
from langchain_community.document_loaders import HuggingFaceDatasetLoader

# Hugging Face dataset repository and the configuration (subset) to load from it.
hf_dataset_name = "jlh-ibm/earnings_call"
subset_name =  "transcripts"

# Build the loader: `path` is the dataset repository, `name` the subset, and
# `page_content_column` names the dataset column that becomes each Document's
# page_content. Later cells read 'company' and 'date' from Document.metadata,
# so the remaining columns presumably end up there — TODO confirm.
loader = HuggingFaceDatasetLoader(path=hf_dataset_name, name=subset_name, page_content_column="transcript")

In [3]:
documents = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
len(documents)

188

In [5]:
documents[187]



In [6]:
# Tally transcripts per company, skipping documents whose 'company' metadata
# is missing or empty.
company_count = {}
for transcript in documents:
    ticker = transcript.metadata.get('company')
    if not ticker:
        continue
    company_count[ticker] = 1 + company_count.get(ticker, 0)

# One "<company>:<count>" line per company, in first-seen order.
for ticker in company_count:
    print(f"{ticker}:{company_count[ticker]}")

AMD:19
AAPL:19
INTC:19
MU:17
GOOGL:19
ASML:19
CSCO:19
NVDA:19
AMZN:19
MSFT:19


In [7]:
print(documents[0])
print(documents[0].page_content[:200])



In [8]:
first = documents[0]
num_words = len(first.page_content.split())
print(num_words)

8463


In [9]:
total = sum(len(doc.page_content.split()) for doc in documents)
avg = total/len(documents) if documents else 0
print(avg)

8797.186170212766


In [10]:
# Quarter Period Extraction
import re
import datetime

def find_quarter(text:str) -> str | None:
    ''' Extract Quarter and Year like Q1 2024'''
    search_results = re.findall(r"[Q]\d\s\d{4}", text)
    if search_results:
         return str(search_results[0])
    return None
quarter = find_quarter(documents[0].page_content)
print(quarter)

Q2 2016


In [12]:
documents[0]



In [11]:
import os
from langchain_nebius import NebiusEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.documents import Document
from tqdm import tqdm


# os.environ["NEBIUS_API_KEY"] = os.getenv("NEBIUS_API_KEY")

# Initialise the embedding model that the semantic chunker uses.
# NOTE(review): presumably the Nebius API key is read from the environment — confirm.
embeddings = NebiusEmbeddings(model="Qwen/Qwen3-Embedding-8B")

# Instantiate LangChain's SemanticChunker on top of those embeddings.
langchain_semantic_chunker = SemanticChunker(embeddings=embeddings, breakpoint_threshold_type="percentile")

# Accumulator for every chunk produced below; reset on each run of this cell.
chunked_docs = []
target_companies = ['AMD', 'NVDA']
# Keep only the transcripts whose 'company' metadata is one of the targets.
doc_to_process = [doc for doc in documents if doc.metadata.get('company') in target_companies]
# NOTE(review): this prints every selected transcript in full; consider
# printing len(doc_to_process) instead to keep the notebook output small.
print(doc_to_process)

# Keep only the first selected transcript. The recorded run took about
# 4.5 minutes to chunk this single document.
doc_to_process = doc_to_process[:1]
print(f"Processing {len(doc_to_process)} documents using Langchain SemanticChunker")

# Chunk each transcript in turn, with a progress bar.
for doc in tqdm(doc_to_process, desc="Chunking Transcripts"):
    # Extract the quarter label (e.g. "Q2 2016") from the transcript text;
    # find_quarter returns None when no label is found.
    quarter = find_quarter(doc.page_content)
    # Copy so the original document's metadata dict is not mutated.
    parent_metadata = doc.metadata.copy()
    parent_metadata["quarter"] = quarter

    # Split the transcript into chunks; parent_metadata is passed so that each
    # chunk carries the company/date/quarter of its transcript.
    chunks = langchain_semantic_chunker.create_documents([doc.page_content], metadatas=[parent_metadata])

    chunked_docs.extend(chunks)

print("Chunking Complete")
# NOTE(review): prints the full text of every chunk; len(chunked_docs) or a
# short slice would be easier to read.
print(chunked_docs)

Processing 1 documents using Langchain SemanticChunker


Chunking Transcripts: 100%|██████████| 1/1 [04:34<00:00, 274.67s/it]

Chunking Complete





In [21]:
doc_to_process



In [22]:
chunked_docs
len(chunked_docs)

19

In [23]:
chunked_docs

 Document(metadata={'company': 'AMD', 'date': datetime.date(2016, 7, 21), 'quarter': 'Q2 2016'}, page_content="A question-and-answer session will follow the formal presentation. (Operator Instructions) As a reminder, this conference is being recorded.\\nI would now like to turn the conference over to your host, Ruth Cotter, Chief Human Resources Officer, and Senior Vice President of Corporate Communications and Investor Relations. Thank you. You may begin.\\n\\n--------------------------------------------------------------------------------\\nRuth Cotter,  Advanced Micro Devices, Inc. - Chief Human Resources Officer and SVP of Corporate Communications and IR    [2]\\n--------------------------------------------------------------------------------\\nThank you and welcome to AMD's second-quarter conference call. By now you should have had the opportunity to review a copy of our earnings release and the CFO commentary and slides. If you've not reviewed these documents, they can be found o

In [33]:
# Analysis of Chunks
# Compare how many transcripts went in with how many chunks came out.
original_doc_count = len(doc_to_process)
chunked_doc_count = len(chunked_docs)
print("Original Number of docs (transcripts)", original_doc_count)
print("Number of new docs (chunks)", chunked_doc_count)
# Guard the division the same way as the word-count average below, so an
# empty selection reports 0.00 instead of raising ZeroDivisionError.
average_chunks_per_doc = chunked_doc_count / original_doc_count if original_doc_count else 0
print(f"Average chunks per transcript: {average_chunks_per_doc:.2f}")

print("\n--- Inspecting a sample chunk ---")
if chunked_docs:
    # Prefer chunk 10, but clamp to the last chunk so that a transcript which
    # yields fewer than 11 chunks does not raise IndexError.
    sample_chunk = chunked_docs[min(10, chunked_doc_count - 1)]
    print("Sample Chunk Content (first 300 chars):")
    print(sample_chunk.page_content[:300]+"...")
    print("\nSample Chunk Metadata:")
    print(sample_chunk.metadata)
else:
    print("No chunks available to inspect.")

# Average number of whitespace-separated words per chunk.
total_chunk_words = sum(len(doc.page_content.split()) for doc in chunked_docs)
average_chunk_words = total_chunk_words/chunked_doc_count if chunked_doc_count else 0

print(f"Average : {average_chunk_words:.2f}")

# Preprocessing Complete

Original Number of docs (transcripts) 1
Number of new docs (chunks) 19
Average chunks per transcript: 19.00

--- Inspecting a sample chunk ---
Sample Chunk Content (first 300 chars):
No, that's a fair question, Matt. So we have been very focused on the server launch for first half of 2017. Desktop should launch before that. In terms of true volume availability, I believe it will be in the first quarter of 2017. We may ship some limited volume towards the end of the fourth quarte...

Sample Chunk Metadata:
{'company': 'AMD', 'date': datetime.date(2016, 7, 21), 'quarter': 'Q2 2016'}
Average : 445.42


In [38]:
# Extracting atomic facts (splitting data into its smallest form)
from enum import Enum
from pydantic import BaseModel, field_validator
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAI
from tqdm import tqdm

class TemporalType(str, Enum):
    """Enumeration of Temporal Types of Statements"""
    # Each member's value equals its name, and the str mixin makes members
    # compare equal to those plain strings. The glosses below paraphrase the
    # "temporal_labelling" entries of LABEL_DEFINITIONS.
    ATEMPORAL = "ATEMPORAL"  # holds true regardless of time
    STATIC = "STATIC"  # describes a single point in time
    DYNAMIC = "DYNAMIC"  # describes a period of time

class StatementType(str, Enum):
    """Enumeration of Statement Types for Statements"""
    # Each member's value equals its name, and the str mixin makes members
    # compare equal to those plain strings. The glosses below paraphrase the
    # "episode_labelling" entries of LABEL_DEFINITIONS.
    FACT = "FACT"  # objective, independently verifiable or falsifiable
    OPINION = "OPINION"  # personal judgment, not independently verifiable
    PREDICTION = "PREDICTION"  # uncertain statement about the future

class RawStatement(BaseModel):
    """Model Representing Raw Statement with Type and Temporal Information"""
    # The class docstring above is left untouched because a pydantic model's
    # docstring can be emitted in the schema handed to the LLM.
    # statement: the extracted sentence itself.
    # statement_type / temporal_type: labels, normalised by the validators below.
    statement: str
    statement_type: StatementType
    temporal_type: TemporalType

    @field_validator("temporal_type", mode="before")
    @classmethod
    def _parse_temporal_label(cls, value: str | None) -> TemporalType:
        """Normalise a raw temporal label before pydantic validates the field.

        Args:
            value: Raw label; surrounding whitespace and letter case are ignored.

        Returns:
            The matching ``TemporalType``; ``ATEMPORAL`` when ``value`` is None.

        Raises:
            ValueError: If the cleaned label is not a ``TemporalType`` value.
        """
        if value is None:
            return TemporalType.ATEMPORAL
        cleaned_value = value.strip().upper()
        try:
            return TemporalType(cleaned_value)
        except ValueError:
            # Must be raised, not returned: a returned exception object would
            # be handed to pydantic as the field value and this message lost.
            raise ValueError(f"Invalid Temporal Type {value}")

    @field_validator("statement_type", mode="before")
    @classmethod
    def _parse_statement_label(cls, value: str | None) -> StatementType:
        """Normalise a raw statement label before pydantic validates the field.

        Args:
            value: Raw label; surrounding whitespace and letter case are ignored.

        Returns:
            The matching ``StatementType``; ``FACT`` when ``value`` is None.

        Raises:
            ValueError: If the cleaned label is not a ``StatementType`` value.
        """
        if value is None:
            return StatementType.FACT
        cleaned_value = value.strip().upper()
        try:
            return StatementType(cleaned_value)
        except ValueError:
            raise ValueError(f"Invalid statement type: {value}")
        
class RawStatementList(BaseModel):
    """A Container for list of statements extracted from single chunk."""
    # Passed to llm.with_structured_output(...) as the target schema of the
    # statement-extraction chain; each entry is validated as a RawStatement.
    statements: list[RawStatement]

In [39]:
# Definitions for LLM to understand
# Label glossary rendered into the extraction prompt.
# Layout: section name -> label -> {"definition": text}.
LABEL_DEFINITIONS: dict[str, dict[str, dict[str, str]]] = {
    "episode_labelling": {
        "FACT": {
            "definition": "Statements that are objective and can be independently verified or falsified through evidence.",
        },
        "OPINION": {
            "definition": "Statements that contain personal opinions, feelings, values, or judgments that are not independently verifiable.",
        },
        "PREDICTION": {
            "definition": "Uncertain statements about the future on something that might happen, a hypothetical outcome, unverified claims.",
        },
    },
    "temporal_labelling": {
        "STATIC": {
            "definition": "Often past tense, think -ed verbs, describing single points-in-time.",
        },
        "DYNAMIC": {
            "definition": "Often present tense, think -ing verbs, describing a period of time.",
        },
        "ATEMPORAL": {
            "definition": "Statements that will always hold true regardless of time.",
        },
    },
}

# Creating Prompt
statement_extraction_prompt_template_fixed = """
You are an expert finance professional and information-extraction assistant.

===Inputs===
- main_entity: {main_entity}
- publication_date: {publication_date}
- document_chunk: {document_chunk}

===Tasks===
1. Identify and extract atomic declarative statements from the document_chunk.
2. For each statement, label it as FACT, OPINION, or PREDICTION.
3. For each statement, label it temporally as STATIC, DYNAMIC, or ATEMPORAL.

===Extraction Guidelines===
- Each statement should express a single, complete subject-predicate-object relationship.
- Resolve co-references (e.g., "the company" -> "{main_entity}").
- Include any explicit dates or quantitative qualifiers.

===Label Definitions===
{definitions}

===Example===
Chunk: "On April 1st, 2024, John Smith was appointed CFO of TechNova Inc. He is currently overseeing the company’s global restructuring initiative."
Output: {{
  "statements": [
    {{
      "statement": "John Smith was appointed CFO of TechNova Inc on April 1st, 2024.",
      "statement_type": "FACT",
      "temporal_type": "STATIC"
    }},
    {{
      "statement": "John Smith is currently overseeing TechNova Inc's global restructuring initiative.",
      "statement_type": "FACT",
      "temporal_type": "DYNAMIC"
    }}
  ]
}}
===End of Example===

**Output format**
Return ONLY a valid JSON object matching the schema for `RawStatementList`.
"""

prompt = ChatPromptTemplate.from_template(statement_extraction_prompt_template_fixed)

# Render LABEL_DEFINITIONS as plain text for the prompt's {definitions} slot:
# one "==== <SECTION> DEFINITIONS ====" heading per section, followed by one
# "- <label>: <definition>" line per label.
definition_parts = []
for section_name, labels in LABEL_DEFINITIONS.items():
    heading = section_name.replace('_', ' ').upper()
    definition_parts.append(f"==== {heading} DEFINITIONS ====\n")
    definition_parts.extend(
        f"- {label}: {info.get('definition', '')}\n"
        for label, info in labels.items()
    )
definition_text = "".join(definition_parts)

# Prompt Template for Statement Extraction Created

In [40]:
import os
from langchain_nebius import ChatNebius
import json

# Chat model shared by the extraction chains in this notebook.
llm = ChatNebius(model="deepseek-ai/DeepSeek-V3")
# Prompt -> LLM, with the reply parsed into a RawStatementList.
statement_extraction_chain = prompt | llm.with_structured_output(RawStatementList)

# Use the first chunk as the extraction sample.
sample_chunk_for_extrac = chunked_docs[0]
print("--- Running statement extraction on a sample chunk (with fixed prompt) ---")
print(f"Chunk Content:\n{sample_chunk_for_extrac.page_content}")
print("\nInvoking LLM for extraction...")

# Run the chain, supplying every variable the prompt template expects.
# metadata["date"] is a datetime.date in the recorded output, hence .isoformat().
extracted_statement_list = statement_extraction_chain.invoke({
    "main_entity": sample_chunk_for_extrac.metadata["company"],
    "publication_date": sample_chunk_for_extrac.metadata["date"].isoformat(),
    "document_chunk": sample_chunk_for_extrac.page_content,
    "definitions": definition_text
})

print("\n--- Extraction Result ---")
# Pretty-print the validated statements as JSON.
print(extracted_statement_list.model_dump_json(indent = 2))

--- Running statement extraction on a sample chunk (with fixed prompt) ---
Chunk Content:

Invoking LLM for extraction...

--- Extraction Result ---
{
  "statements": [
    {
      "statement": "The Q2 2016 Advanced Micro Devices Inc Earnings Call took place on July 21, 2016 at 9:00PM GMT.",
      "statement_type": "FACT",
      "temporal_type": "STATIC"
    },
    {
      "statement": "Lisa Su is the President and CEO of Advanced Micro Devices, Inc.",
      "statement_type": "FACT",
      "temporal_type": "ATEMPORAL"
    },
    {
      "statement": "Devinder Kumar is the SVP, CFO, and Treasurer of Advanced Micro Devices, Inc.",
      "statement_type": "FACT",
      "temporal_type": "ATEMPORAL"
    },
    {
      "statement": "Ruth Cotter is the Chief Human Resources Officer and SVP of Corporate Communications and IR of Advanced Micro Devices, Inc.",
      "statement_type": "FACT",
      "temporal_type": "ATEMPORAL"
    },
    {
      "statement": "Matt Ramsay is an Analyst at Canaccor

In [42]:
from datetime import datetime, timezone
from pydantic import Field
from dateutil.parser import parse

def parse_date_str(value: str | datetime | None) -> datetime | None :
    if not value:
        return None
    if isinstance(value, datetime):
        return value if value.tzinfo else value.replace(tzinfo=timezone.utc)
    
    try:
        if re.fullmatch(r"\\d{4}", value.strip()):
            year = int(value.strip())
            return datetime(year, 1, 1, tzinfo=timezone.utc)
        dt: datetime = parse(value)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt
    except Exception:
        return None

# Pydantic Models for Structured Outputs
class RawTemporalRange(BaseModel):
    """Model Representing the raw temporal validity range as strings from LLM"""
    # Both bounds stay as raw strings here; TemporalValidityRange turns them
    # into datetimes. The Field descriptions are part of the schema given to
    # the LLM, so each bound has to be described as what it actually is.
    valid_at: str | None = Field(None, description="The start date/time of the event's validity in ISO 8601 format.")
    # Was a copy of the valid_at text ("start"); this field is the END bound.
    invalid_at: str | None = Field(None, description="The end date/time of the event's validity in ISO 8601 format.")

class TemporalValidityRange(BaseModel):
    """Validity window whose bounds are datetime objects (or None).

    Both bounds go through ``parse_date_str`` before pydantic's own
    validation, so strings and datetimes are accepted on input.
    """
    valid_at: datetime | None = None
    invalid_at: datetime | None = None

    @field_validator("valid_at", "invalid_at", mode="before")
    @classmethod
    def _coerce_bound(cls, raw: str | datetime | None) -> datetime | None:
        """Delegate conversion of a raw bound to ``parse_date_str``."""
        parsed = parse_date_str(raw)
        return parsed
    
# Pydantic models and helper function for temporal range extraction are defined.

In [44]:
date_extraction_prompt_template = """
You are a temporal information extraction specialist.

INPUTS:
- statement: "{statement}"
- statement_type: "{statement_type}"
- temporal_type: "{temporal_type}"
- publication_date: "{publication_date}"
- quarter: "{quarter}"

TASK:
- Analyze the statement and determine the temporal validity range (valid_at, invalid_at).
- Use the publication date as the reference point for relative expressions (e.g., "last month", "currently").
- If a relationship is ongoing or its end is not specified, `invalid_at` should be null.

GUIDANCE:
- For STATIC statements, `invalid_at` is usually null. The `valid_at` is the date the event occurred.
- For DYNAMIC statements, `valid_at` is when the state began, and `invalid_at` is when it ended.
- Return all dates in ISO 8601 format (e.g., YYYY-MM-DDTHH:MM:SSZ).
- If only a year is mentioned, use the first day of that year (YYYY-01-01T00:00:00Z).

**Output format**
Return ONLY a valid JSON object matching the schema for `RawTemporalRange`.
"""

#Langchain Chain
date_extraction_prompt = ChatPromptTemplate.from_template(date_extraction_prompt_template)
date_extract_chain = date_extraction_prompt | llm.with_structured_output(RawTemporalRange)

# Date Extraction Chain Created Successfully

In [45]:
# Take the second statement from the previous extraction result. In the recorded run this is
# "Lisa Su is the President and CEO of Advanced Micro Devices, Inc.", labelled FACT / ATEMPORAL.
# LLM output can differ between runs, so the statement at index 1 may change; the prints below show what was used.
sample_statement = extracted_statement_list.statements[1]
chunk_metadata = sample_chunk_for_extrac.metadata

print(f"--- Running date extraction for statement ---")
print(f'Statement: "{sample_statement.statement}"')
print(f"Statement Type: {sample_statement.statement_type.value}")
print(f"Temporal Type: {sample_statement.temporal_type.value}")
print(f"Reference Publication Date: {chunk_metadata['date'].isoformat()}")

# Invoke the date-extraction chain with every variable its prompt template expects.
raw_temporal_range = date_extract_chain.invoke({
    "statement": sample_statement.statement,
    "statement_type": sample_statement.statement_type.value,
    "temporal_type": sample_statement.temporal_type.value,
    "publication_date": chunk_metadata["date"].isoformat(),
    "quarter": chunk_metadata["quarter"] 
})

# Convert the LLM's string bounds into timezone-aware datetimes via TemporalValidityRange.
final_temporal_range = TemporalValidityRange.model_validate(raw_temporal_range.model_dump())

print("\n--- Parsed & Validated Result (with fix) ---")
print(f"Valid At: {final_temporal_range.valid_at}")
print(f"Invalid At: {final_temporal_range.invalid_at}")

--- Running date extraction for statement ---
Statement: "Lisa Su is the President and CEO of Advanced Micro Devices, Inc."
Statement Type: FACT
Temporal Type: ATEMPORAL
Reference Publication Date: 2016-07-21

--- Parsed & Validated Result (with fix) ---
Valid At: 2016-07-21 00:00:00+00:00
Invalid At: None
