In [64]:
import os
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq

# Read the Groq API key from the GROQ_API_KEY environment variable rather than
# hardcoding it, so the secret is never saved in the notebook or committed.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


# <span style="color: aquamarine;">I. Text Retrieval</span>

### <span style="color: yellow;">Load Documents</span>

In [65]:
# Choose the company whose chunk file should be loaded.
company_name = "delta"
chunks_path = "data/{}/chunks.csv".format(company_name)

# Read the chunk table from CSV and show its first rows as the cell output.
document_store = pd.read_csv(filepath_or_buffer=chunks_path)
document_store.head()

Unnamed: 0,chunk_id,company,source_file,chunk_text
0,chunk_001,delta,annual_report_2024,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...
1,chunk_002,delta,annual_report_2024,included in the filing reflect the correction ...
2,chunk_003,delta,annual_report_2024,the Private Securities Litigation Reform Act o...
3,chunk_004,delta,annual_report_2024,"the end of 2024, we offered up to 5,000 peak-d..."
4,chunk_005,delta,annual_report_2024,domestic net promoter scores and increased cus...


In [66]:
# Print column dtypes and non-null counts, then preview only the metadata
# columns (the chunk text itself is omitted from this view).
document_store.info()
document_store.head()[['chunk_id', 'company', 'source_file']]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   chunk_id     161 non-null    object
 1   company      161 non-null    object
 2   source_file  161 non-null    object
 3   chunk_text   161 non-null    object
dtypes: object(4)
memory usage: 5.2+ KB


Unnamed: 0,chunk_id,company,source_file
0,chunk_001,delta,annual_report_2024
1,chunk_002,delta,annual_report_2024
2,chunk_003,delta,annual_report_2024
3,chunk_004,delta,annual_report_2024
4,chunk_005,delta,annual_report_2024


### <span style="color: yellow;">Fit TF-IDF</span>

In [67]:
# Fit a TF-IDF vectorizer (English stop words removed) on every chunk's text
# and keep the resulting matrix; the cell output is its shape.
vectorizer = TfidfVectorizer(stop_words="english")
chunk_tfidf = vectorizer.fit_transform(document_store.loc[:, "chunk_text"])
chunk_tfidf.shape

(161, 5983)

### <span style="color: yellow;">Retrieval</span>

In [68]:
def retrieve_chunks(question, top_k=7):
    """Return the chunks most similar to ``question`` by TF-IDF cosine similarity.

    Uses the notebook-level ``vectorizer``, ``chunk_tfidf`` and
    ``document_store`` objects.

    Parameters
    ----------
    question : str
        User question; transformed with the already-fitted ``vectorizer``.
    top_k : int or None, default 7
        Maximum number of chunks to return. ``None`` returns every chunk.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected ``document_store`` rows, ordered from highest
        to lowest similarity, with an added ``score`` column.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    # A negative top_k would be interpreted as a slice bound and silently drop
    # the |top_k| lowest-ranked chunks instead of limiting the result size.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    q_vec = vectorizer.transform([question])
    sims = cosine_similarity(q_vec, chunk_tfidf).flatten()

    # argsort is ascending, so reverse it to rank the highest similarity first.
    top_idx = np.argsort(sims)[::-1][:top_k]

    results = document_store.iloc[top_idx].copy()
    results["score"] = sims[top_idx]
    return results


### <span style="color: yellow;">Prompt Builder</span>

In [69]:
def build_prompt(question, retrieved):
    """Assemble the LLM prompt from a question and its retrieved chunks.

    ``retrieved`` is iterated row by row and must provide the ``chunk_id``,
    ``source_file`` and ``chunk_text`` columns. Each row becomes one delimited
    context section. The returned string has surrounding whitespace stripped.
    """
    sections = [
        f"\n\n---\nChunk ID: {record['chunk_id']}\n"
        f"Source: {record['source_file']}\n"
        f"Text:\n{record['chunk_text']}\n"
        for _, record in retrieved.iterrows()
    ]
    context = "".join(sections)

    instructions = (
        "You are an AI assistant answering ONLY from provided chunks.\n"
        "If not in chunks, say: 'The documents do not contain this information.'\n"
        "Cite chunks using (chunk: chunk_XXXX)."
    )
    prompt = f"{instructions}\n\nQUESTION:\n{question}\n\nCONTEXT:\n{context}\n"

    # Strip so the prompt has no leading/trailing blank lines, including the
    # case where no chunks were retrieved.
    return prompt.strip()


### <span style="color: yellow;">LLM QA Function</span>

In [70]:
def answer_question(question, top_k=5, model="llama-3.3-70b-versatile", temperature=0.1):
    """Answer ``question`` with the LLM, grounded in retrieved chunks.

    Steps:
    1. Retrieve the ``top_k`` most relevant chunks via ``retrieve_chunks``.
    2. Build a prompt with citation instructions via ``build_prompt``.
    3. Send the prompt as a single user message to the chat-completions API
       of the notebook-level ``client``.
    4. Return the answer together with the retrieved chunks.

    Parameters
    ----------
    question : str
        The user question.
    top_k : int, default 5
        Number of chunks to retrieve and include as context.
    model : str, default "llama-3.3-70b-versatile"
        Model identifier passed to the chat-completions call.
    temperature : float, default 0.1
        Sampling temperature passed to the chat-completions call. The default
        keeps the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(answer, retrieved)``: the content of the first returned choice and
        the DataFrame produced by ``retrieve_chunks``.
    """
    retrieved = retrieve_chunks(question, top_k)
    prompt = build_prompt(question, retrieved)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )

    # Only the first choice is used.
    answer = response.choices[0].message.content
    return answer, retrieved


# <span style="color: aquamarine;">II. Testing</span>

In [84]:
# Question sets used to exercise the QA pipeline, grouped by theme.
company_snap_questions = [
    "What is Delta Airlines?",
    # "What are the main business segments?",
    # "How does Delta generate revenue?"
]

risk_factors_questions = ["What are the main risks?"]

esg_questions = [
    "What environmental goals does the Delta describe?",
    # "What sustainability targets does it mention?",
]

custom_questions = [
    "How is Delta's DEI strategy?",
    "What is the major holder breakdown?",
    # The comma after this item matters: without it Python concatenates the
    # two adjacent string literals into a single merged question.
    "What are some awards that Delta achieves?",
    "What is Delta known for",
]



In [83]:
# Ask each company-snapshot question and print the answer followed by the
# id, source and score of the chunks that were retrieved for it.
for q in company_snap_questions:
    print("=" * 80)
    print(f"QUESTION: {q}")
    answer, chunks = answer_question(q)
    print(f"\nANSWER:\n {answer}")
    print("\nTop Chunks:\n", chunks.loc[:, ["chunk_id", "source_file", "score"]])

QUESTION: What is Delta Airlines

ANSWER:
 Delta Airlines is described in the provided chunks as a major airline company that operates in a highly competitive industry, with significant competition from other carriers, including American Airlines and United Airlines (chunk_032). The company has a strong focus on customer service, with a goal of increasing customer loyalty through its award-winning SkyMiles program (chunk_005). Delta also has a diversified business with high-margin revenue streams, including premium products, partnerships, and complementary businesses such as cargo operations and maintenance services (chunk_008). The company has made significant investments in its people, product, and reliability, and has achieved differentiated performance through margin expansion, durable earnings, and free cash flow (chunk_005). (chunk_032, chunk_014, chunk_013, chunk_008, chunk_005)

Top Chunks:
      chunk_id         source_file     score
31  chunk_032  annual_report_2024  0.224142

In [77]:
# Ask each risk-factor question and print the answer followed by the
# id, source and score of the chunks that were retrieved for it.
for q in risk_factors_questions:
    print("=" * 80)
    print(f"QUESTION: {q}")
    answer, chunks = answer_question(q)
    print(f"\nANSWER:\n {answer}")
    print("\nTop Chunks:\n", chunks.loc[:, ["chunk_id", "source_file", "score"]])

QUESTION: What are the main risks?

ANSWER:
 The main risks are climate-related risks, including physical risks and transition risks, which are assessed and managed through Delta's Enterprise Risk Management (ERM) program (chunk_150). Additionally, the company faces risks related to high temperatures, which can impact the safety of passengers, flight and cabin crew, and ground team members (chunk_121). Other risks mentioned include heat-related injuries or illnesses, aircraft deicing, and airport ramp environment risks (chunk_121).

Top Chunks:
       chunk_id         source_file     score
148  chunk_149     esg_report_2024  0.136556
119  chunk_120     esg_report_2024  0.088340
149  chunk_150     esg_report_2024  0.085457
44   chunk_045  annual_report_2024  0.075872
120  chunk_121     esg_report_2024  0.069453


In [74]:
# Ask each ESG question and print the answer followed by the
# id, source and score of the chunks that were retrieved for it.
for q in esg_questions:
    print("=" * 80)
    print(f"QUESTION: {q}")
    answer, chunks = answer_question(q)
    print(f"\nANSWER:\n {answer}")
    print("\nTop Chunks:\n", chunks.loc[:, ["chunk_id", "source_file", "score"]])

QUESTION: What environmental goals does the Delta describe?

ANSWER:
 The Delta describes environmental goals such as reducing greenhouse gas emissions and achieving net-zero emissions by 2050 (chunk: chunk_139). Delta's Sustainability Strategy focuses on addressing GHG and waste-related impacts through short- and long-term strategies, including working to improve what they fly, how they fly, and the fuel they use (chunk: chunk_139).

Top Chunks:
       chunk_id         source_file     score
29   chunk_030  annual_report_2024  0.133163
158  chunk_159     esg_report_2024  0.104651
28   chunk_029  annual_report_2024  0.095741
138  chunk_139     esg_report_2024  0.087367
159  chunk_160     esg_report_2024  0.081589


In [75]:
# Ask each custom question and print the answer followed by the
# id, source and score of the chunks that were retrieved for it.
for q in custom_questions:
    print("=" * 80)
    print(f"QUESTION: {q}")
    answer, chunks = answer_question(q)
    print(f"\nANSWER:\n {answer}")
    print("\nTop Chunks:\n", chunks.loc[:, ["chunk_id", "source_file", "score"]])

QUESTION: How is Delta's DEI strategy?

ANSWER:
 Delta's DEI strategy is focused on achieving equitable outcomes for all employees, with key areas including hiring and development, veteran-specific resources, and support for the growth of the Veterans Business Resource Group (chunk_127). The company actively values diversity by hiring highly qualified candidates with diverse backgrounds and skills, and strives to have its senior leadership team reflect the diversity of its workforce (chunk_128). Delta also aims to create merit-based access to career opportunities by considering skills as equivalencies to education, and has implemented various initiatives to promote inclusion and equity, such as partnering with organizations to inspire future aviators from underrepresented groups (chunk_129).

Top Chunks:
       chunk_id      source_file     score
126  chunk_127  esg_report_2024  0.110583
153  chunk_154  esg_report_2024  0.105954
140  chunk_141  esg_report_2024  0.083319
127  chunk_128 