# Abschlussprojekt

## Project Setup

### Importing Dependencies

In [3]:
import os
import json
import os
import json
import re
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
import chromadb

### Loading Environment Variables

In [4]:
# Load environment variables via python-dotenv. Later cells read
# GEMINI_API_KEY_1 with os.getenv — presumably it is defined in the .env file
# picked up here (TODO confirm).
load_dotenv()

True

## Data Preparation

### Loading PDF-Extracted Texts

In [3]:
# Build `dataset`: one entry per extracted-report JSON file, keyed by the
# filename without its extension. Each entry holds the concatenated report
# text plus metadata (year, company, filing type, quarter) parsed from the
# filename.
dataset = {}

directory = 'dataset/extracted'

# (keyword, company) pairs checked in order against the upper-cased filename;
# the first keyword found wins.
company_keywords = [
    ('META', 'Meta'),
    ('MICROSOFT', 'Microsoft'),
    ('NVIDIA', 'Nvidia'),
    ('APPLE', 'Apple'),
    ('GOOGLE', 'Google'),
    ('ALPHABET', 'Google'),
]

# sorted() makes the insertion order of `dataset` independent of the arbitrary
# order in which os.listdir returns directory entries.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(directory, filename)
    with open(filepath, encoding='utf-8') as file:
        data = json.load(file)

    # Collect the text of every element; tables contribute their HTML
    # rendering when it is available.
    parts = []
    for element in data:
        if element['type'] == 'Table':
            # Fall back to the plain text when the table carries no
            # 'text_as_html' metadata instead of raising KeyError.
            html = (element.get('metadata') or {}).get('text_as_html')
            parts.append(html if html is not None else element['text'])
        else:
            parts.append(element['text'])
    # Each piece is stripped and followed by a single space; the trailing
    # space is removed when the entry is stored below.
    text = ''.join(part.strip() + ' ' for part in parts)

    fname_upper = filename.upper()

    # Extract metadata from the filename.
    year_match = re.search(r'20\d\d', fname_upper)
    year = year_match.group(0) if year_match else 'Unknown'

    company = 'Unknown'
    for keyword, company_name in company_keywords:
        if keyword in fname_upper:
            company = company_name
            break

    # Accept both the hyphenated and the compact spelling for quarterly and
    # annual filings alike ('10-Q' was previously not recognised).
    if '10Q' in fname_upper or '10-Q' in fname_upper:
        doc_type = '10Q'
    elif '10K' in fname_upper or '10-K' in fname_upper:
        doc_type = '10K'
    elif 'ANNUAL' in fname_upper:
        doc_type = 'Annual Report'
    else:
        doc_type = 'Unknown'

    if doc_type == '10Q':
        # NOTE(review): '[1-4]Q' also matches a year immediately followed by
        # 'Q' (e.g. '2021Q3' yields '1Q'); confirm against the actual
        # filename scheme.
        quarter_match = re.search(r'[1-4]Q', fname_upper)
        quarter = quarter_match.group(0) if quarter_match else 'Unknown'
    else:
        # Non-quarterly documents cover the whole year.
        quarter = 'All'

    dataset[os.path.splitext(filename)[0]] = {
        'text': text.strip(),
        'year': year,
        'company': company,
        'type': doc_type,
        'quarter': quarter
    }


### Storing Text & Metadata in LangChain Document Objects

In [5]:
# Wrap every report in `dataset` in one unsplit Document: the full text becomes
# the page content, and the dataset key plus the parsed fields become metadata.
docs = [
    Document(
        page_content=entry['text'],
        metadata={
            'source': source_name,
            'year': entry['year'],
            'company': entry['company'],
            'type': entry['type'],
            'quarter': entry['quarter'],
        },
    )
    for source_name, entry in dataset.items()
]

### Semantic Text Splitting

In [6]:
# Embedding model used by the semantic splitter; the API key is read from the
# GEMINI_API_KEY_1 environment variable.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=os.getenv('GEMINI_API_KEY_1'),
)
text_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=85.0,
)

# Split each report into chunks and emit one Document per chunk. Every chunk
# gets its own copy of the parent report's metadata.
docs = []
for source_name, entry in dataset.items():
    docs.extend(
        Document(
            page_content=chunk,
            metadata={
                'source': source_name,
                'year': entry['year'],
                'company': entry['company'],
                'type': entry['type'],
                'quarter': entry['quarter'],
            },
        )
        for chunk in text_splitter.split_text(entry['text'])
    )


In [None]:
# Disabled one-off step: uncommenting these lines writes the current `docs`
# list to docs.pkl. The following cell reads that file back — presumably so the
# chunking cell does not have to be re-run in every session (TODO confirm).
# import pickle
# with open('docs.pkl', 'wb') as f:
#     pickle.dump(docs, f)

In [5]:
import pickle

# Restore the `docs` list from docs.pkl in the working directory.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load a docs.pkl that you produced yourself.
with open('docs.pkl', mode='rb') as pickle_file:
    docs = pickle.load(pickle_file)

In [6]:
# Sanity check: number of entries currently held in `docs`.
print(len(docs))

28377


## Storing Documents As Embeddings In Chroma Vector DB

In [7]:
# Embedding function handed to the vector store; the API key is read from the
# GEMINI_API_KEY_1 environment variable.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=os.getenv('GEMINI_API_KEY_1'),
)

# Chroma client created with its default arguments; the collection is created
# on first use and reused afterwards.
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("big_tech_financial_reports")

# LangChain wrapper around the same client and collection name.
# The spelling `vectore_store` is kept because later cells refer to it.
vectore_store = Chroma(
    collection_name="big_tech_financial_reports",
    embedding_function=embeddings,
    client=persistent_client,
)

In [None]:
import time

# Add the documents to the vector store one at a time, pausing 0.1 s after
# each call and printing the running total.
count = 0
for position, chunk_doc in enumerate(docs, start=1):
    vectore_store.add_documents([chunk_doc])
    time.sleep(0.1)
    # Only advance the counter once the document has been added.
    count = position
    print(f"{count} documents embedded")

In [12]:
# Imported here as well so this resume cell runs on a fresh kernel without
# first executing the full embedding cell.
import time
from itertools import islice

# Number of leading entries of `docs` to skip, i.e. the point at which the
# previous embedding run stopped. Update this single value when resuming
# from a different cutoff.
skip_count = 18242
# The running total continues from the resume offset.
count = skip_count

# Skip the first `skip_count` documents and embed the rest one at a time,
# pausing 0.1 s after each call.
for doc in islice(docs, skip_count, None):
    vectore_store.add_documents([doc])
    time.sleep(0.1)
    count += 1
    print(str(count) + " documents embedded")

18243 documents embedded
18244 documents embedded
18245 documents embedded
18246 documents embedded
18247 documents embedded
18248 documents embedded
18249 documents embedded
18250 documents embedded
18251 documents embedded
18252 documents embedded
18253 documents embedded
18254 documents embedded
18255 documents embedded
18256 documents embedded
18257 documents embedded
18258 documents embedded
18259 documents embedded
18260 documents embedded
18261 documents embedded
18262 documents embedded
18263 documents embedded
18264 documents embedded
18265 documents embedded
18266 documents embedded
18267 documents embedded
18268 documents embedded
18269 documents embedded
18270 documents embedded
18271 documents embedded
18272 documents embedded
18273 documents embedded
18274 documents embedded
18275 documents embedded
18276 documents embedded
18277 documents embedded
18278 documents embedded
18279 documents embedded
18280 documents embedded
18281 documents embedded
18282 documents embedded
