# Abschlussprojekt

## Project Setup

### Importing Dependencies

In [1]:
import os
import json
import os
import json
import re
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
import chromadb

### Loading Environment Variables

In [2]:
# Populate the process environment via python-dotenv so that the
# os.getenv('GEMINI_API_KEY_1') lookups in later cells can find the API key.
load_dotenv()

True

## Data Preparation

### Loading PDF-Extracted Texts

In [3]:
# Filename keyword -> canonical company name, checked in this order (first hit wins).
COMPANY_KEYWORDS = (
    ('META', 'Meta'),
    ('MICROSOFT', 'Microsoft'),
    ('NVIDIA', 'Nvidia'),
    ('APPLE', 'Apple'),
    ('GOOGLE', 'Google'),
    ('ALPHABET', 'Google'),
)


def extract_report_text(elements):
    """Flatten the parsed elements of one report into a single string.

    Args:
        elements: list of dicts loaded from an extracted JSON file. Each dict
            is expected to carry a 'type' and a 'text'; 'Table' elements are
            expected to also carry metadata['text_as_html'].

    Returns:
        The stripped pieces joined by single spaces. Tables contribute their
        HTML rendering; if a table has no HTML rendering, its plain text is
        used instead of raising a KeyError.
    """
    parts = []
    for element in elements:
        html = None
        if element.get('type') == 'Table':
            html = (element.get('metadata') or {}).get('text_as_html')
        parts.append((html or element.get('text') or '').strip())
    return ' '.join(parts).strip()


def parse_report_metadata(filename):
    """Derive year, company, report type and quarter from a report filename.

    Matching is case-insensitive. Anything that cannot be recognised is
    reported as 'Unknown'; reports that are not 10-Q filings get quarter 'All'.

    Args:
        filename: name of the extracted JSON file, e.g. '1q-10q_microsoft_2021.json'.

    Returns:
        dict with the keys 'year', 'company', 'type' and 'quarter' (all strings).
    """
    fname_upper = filename.upper()

    year_match = re.search(r'20\d\d', fname_upper)
    year = year_match.group(0) if year_match else 'Unknown'

    company = next(
        (name for keyword, name in COMPANY_KEYWORDS if keyword in fname_upper),
        'Unknown',
    )

    # Accept both the compact and the hyphenated spelling for either form type;
    # previously only '10-K' was recognised, so '10-Q' files ended up 'Unknown'.
    if '10Q' in fname_upper or '10-Q' in fname_upper:
        doc_type = '10Q'
    elif '10K' in fname_upper or '10-K' in fname_upper:
        doc_type = '10K'
    elif 'ANNUAL' in fname_upper:
        doc_type = 'Annual Report'
    else:
        doc_type = 'Unknown'

    if doc_type == '10Q':
        # '[1-4]Q' cannot match inside '10Q' itself because '0' is outside 1-4.
        quarter_match = re.search(r'[1-4]Q', fname_upper)
        quarter = quarter_match.group(0) if quarter_match else 'Unknown'
    else:
        quarter = 'All'

    return {'year': year, 'company': company, 'type': doc_type, 'quarter': quarter}


dataset = {}

directory = 'dataset/extracted'

# sorted() keeps the report order (and therefore the chunk order downstream)
# identical between runs; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(directory, filename)
    with open(filepath, encoding='utf-8') as file:
        data = json.load(file)

    # Key is the filename without its '.json' suffix.
    dataset[filename[:-5]] = {
        'text': extract_report_text(data),
        **parse_report_metadata(filename),
    }


In [4]:
# Spot-check: display the flattened text of one report. Keys of `dataset`
# are the JSON filenames without the '.json' suffix.
dataset['10k_microsoft_2021']['text']

'UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Fiscal Year Ended June 30, 2021 OR oO ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From to Commission File Number 001-37845 MICROSOFT CORPORATION WASHINGTON 91-1144442 (STATE OF INCORPORATION) (I.R.S. ID) ONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor Securities registered pursuant to Section 12(b) of the Act: Title of each class Trading Symbol Name of exchange on which registered Common stock, $0.00000625 par value per share MSFT NASDAQ 2.125% Notes due 2021 MSFT NASDAQ 3.125% Notes due 2028 MSFT NASDAQ 2.625% Notes due 2033 MSFT NASDAQ Securities registered pursuant to Section 12(g) of the Act: NONE Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405

### Storing Text & Metadata in Langchain Document Objects

In [5]:
# Wrap every report in a single LangChain Document: the full text becomes the
# page content, and the filename-derived fields travel along as metadata.
docs = [
    Document(
        page_content=record['text'],
        metadata={
            'source': name,
            'year': record['year'],
            'company': record['company'],
            'type': record['type'],
            'quarter': record['quarter'],
        },
    )
    for name, record in dataset.items()
]

### Semantic Text Splitting

In [None]:
# Split every report into semantically coherent chunks. The splitter embeds the
# text with the Gemini embedding model to decide where to break.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=os.getenv('GEMINI_API_KEY_1'))
text_splitter = SemanticChunker(embeddings=embeddings, breakpoint_threshold_type="gradient", breakpoint_threshold_amount=85.0)

# Rebuilds `docs` from scratch: one Document per chunk instead of one per report.
docs = []
for document, record in dataset.items():
    # Metadata is identical for all chunks of a report, so assemble it once.
    chunk_metadata = {
        'source': document,
        'year': record['year'],
        'company': record['company'],
        'type': record['type'],
        'quarter': record['quarter'],
    }
    text_chunks = text_splitter.split_text(record['text'])
    # Each Document gets its own copy so the metadata dicts are not shared.
    docs.extend(
        Document(page_content=text_chunk, metadata=dict(chunk_metadata))
        for text_chunk in text_chunks
    )
    # One summary line per report instead of printing every chunk, which
    # floods the notebook output.
    print(f"{document}: {len(text_chunks)} chunks")


## Storing Documents As Embeddings In Chroma Vector DB

In [5]:
# Name of the Chroma collection that stores the embedded report chunks; used
# both for the raw chromadb handle and for the LangChain wrapper.
collection_name = "big_tech_financial_reports"

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=os.getenv('GEMINI_API_KEY_1'),
)

# No path argument is passed, so chromadb's default storage location is used.
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection(collection_name)

# LangChain wrapper over the same client and collection; queries are embedded
# with the same model as above.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name=collection_name,
    client=persistent_client,
)

In [None]:
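# NOTE(review): one-off ingestion loop, kept commented out, presumably so that
# re-running the notebook does not embed the chunks into the persistent
# collection a second time. The counter output below is from an earlier run.
# from itertools import islice
# import time
# count = 0
# for doc in docs:
#     vector_store.add_documents([doc])
#     time.sleep(0.1)
#     count+=1
#     print(str(count) + " documents embedded")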

18243 documents embedded
18244 documents embedded
18245 documents embedded
18246 documents embedded
18247 documents embedded
18248 documents embedded
18249 documents embedded
18250 documents embedded
18251 documents embedded
18252 documents embedded
18253 documents embedded
18254 documents embedded
18255 documents embedded
18256 documents embedded
18257 documents embedded
18258 documents embedded
18259 documents embedded
18260 documents embedded
18261 documents embedded
18262 documents embedded
18263 documents embedded
18264 documents embedded
18265 documents embedded
18266 documents embedded
18267 documents embedded
18268 documents embedded
18269 documents embedded
18270 documents embedded
18271 documents embedded
18272 documents embedded
18273 documents embedded
18274 documents embedded
18275 documents embedded
18276 documents embedded
18277 documents embedded
18278 documents embedded
18279 documents embedded
18280 documents embedded
18281 documents embedded
18282 documents embedded


In [6]:
# Spot-check: fetch one stored chunk by id to inspect its text and metadata.
# Goes through the wrapper's private `_collection` attribute.
vector_store._collection.get(['50f92b43-2b50-4d9f-9dab-22fa2cf2c61d'])

{'ids': ['50f92b43-2b50-4d9f-9dab-22fa2cf2c61d'],
 'embeddings': None,
 'documents': ['The timing of product introductions can also impact the Company’s net sales to its indirect distribution channels as these channels are filled with new inventory following a product launch, and channel inventory of an older product often declines as the launch of a newer product approaches. Net sales can also be affected when consumers and distributors anticipate a product introduction. Employees As of September 28, 2019, the Company had approximately 137,000 full-time equivalent employees.'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'year': '2019',
   'type': '10K',
   'source': '10-K-2019-Apple',
   'quarter': 'All',
   'company': 'Apple'}]}

### Creating Self-Query Retriever

In [7]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_google_genai import ChatGoogleGenerativeAI


# Filterable metadata fields and the description handed to the LLM for each.
# All four are declared as string-typed attributes.
# NOTE(review): the accepted years listed for "year" skip '2023'; confirm
# against the years actually present in the ingested filenames.
field_descriptions = {
    "quarter": "Indicates the financial quarter covered by the report. Accepted values: ['1Q', '2Q', '3Q', '4Q', 'All'] ('All' is for reports spanning the entire fiscal year)",
    "year": "The year the financial report was published. Accepted values: ['2019', '2020', '2021', '2022', '2024']",
    "company": "The company that issued the financial report. Accepted values: ['Meta', 'Apple', 'Google', 'Microsoft', 'Nvidia']",
    "type": "The type of financial report issued. Accepted values: ['10K', '10Q', 'Annual Report']",
}

metadata_field_info = [
    AttributeInfo(name=field, description=text, type="string")
    for field, text in field_descriptions.items()
]

# Chat model that turns a natural-language question into a query plus filter.
llm = ChatGoogleGenerativeAI(
    model='gemini-2.0-flash',
    api_key=os.getenv('GEMINI_API_KEY_1'),
)

# enable_limit=True is passed so the question may also specify how many
# documents to fetch.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vector_store,
    document_contents='financial reports and investor relations for top tech giants from the last five years',
    metadata_field_info=metadata_field_info,
    enable_limit=True,
)

In [9]:
# Example query through the self-query retriever.
# NOTE(review): the saved output below lists Microsoft chunks from 2019-2021
# for a question about 2024, so the year filter does not appear to have been
# applied. Verify the structured query the LLM generates.
retriever.invoke('How did Microsoft describe AI integration in their business in 2024?')

[Document(id='2e382f41-074a-4f10-9d93-3b973b43d75c', metadata={'type': '10Q', 'source': '1q-10q_microsoft_2021', 'year': '2021', 'company': 'Microsoft', 'quarter': '1Q'}, page_content='As with many disruptive innovations, AI presents risks and challenges that could affect its adoption, and therefore our business.'),
 Document(id='367b9a9b-ef00-4c95-a513-77e35b152b59', metadata={'company': 'Microsoft', 'quarter': '2Q', 'type': '10Q', 'year': '2020', 'source': '2q-10q_microsoft_2020'}, page_content='As with many disruptive innovations, AI presents risks and challenges that could affect its adoption, and therefore our business.'),
 Document(id='884c5ac1-2002-4d3e-8d73-4ba5e51fed48', metadata={'year': '2019', 'company': 'Microsoft', 'type': '10Q', 'source': '3q-10q_microsoft_2019', 'quarter': '3Q'}, page_content='As with many disruptive innovations, AI presents risks and challenges that could affect its adoption, and therefore our business.'),
 Document(id='eb3b25e7-7615-406c-bdd7-a72fb255