# Company Extraction Demo

This notebook demonstrates the company extraction functionality with chunking and async processing.

In [None]:
import pandas as pd
import asyncio
import json
from pathlib import Path
import logging

# Configure logging: request INFO-level output and create the logger that
# later cells (for example load_articles) use to report errors.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Import the extraction functions
from extract_companies_optimized import (
    process_article_async,
    get_azure_client,
    chunk_text,
    process_chunk_async,
    merge_chunk_results
)

In [None]:
# Load your company database from a CSV file in the working directory.
# The resulting DataFrame is passed to every process_article_async call below.
company_database = pd.read_csv('company_database.csv')
print(f"Loaded {len(company_database)} companies from database")

## Process a Single Article

Let's process a single article to see how it works:

In [None]:
# Example article text: a short sample that names five companies with tickers
article_text = """
Apple Inc. (AAPL) reported strong earnings today. Microsoft (MSFT) and Google (GOOGL) also showed positive results. 
The tech sector was led by these companies, with Amazon (AMZN) and Meta (META) following closely behind.
"""

# Initialize Azure OpenAI client; the same `llm` object is reused by the
# batch-processing cell further down.
llm = get_azure_client()

# Process the article. Top-level `await` is valid here because the notebook
# kernel runs cells inside an event loop; in a plain script this call would
# have to be wrapped in a coroutine. The call returns a (companies, metrics) pair.
results, metrics = await process_article_async(article_text, company_database, llm)

# Fields are read with .get(), so a missing 'CompanyName' or 'RIC' key prints
# as None instead of raising.
print("\nExtracted Companies:")
for company in results:
    print(f"- {company.get('CompanyName')} ({company.get('RIC')})")

# NOTE(review): json.dumps assumes `metrics` is JSON-serializable — confirm
# against process_article_async.
print("\nProcessing Metrics:")
print(json.dumps(metrics, indent=2))

## Process Multiple Articles

Now let's process multiple articles concurrently, one batch at a time:

In [None]:
# Load articles from a directory
def load_articles(articles_dir: str) -> list:
    """Read every ``*.txt`` file directly inside ``articles_dir``.

    Files are visited in sorted path order so that the returned list (and
    therefore batch composition and saved results downstream) is the same
    on every run and every machine.

    Args:
        articles_dir: Path of the directory that holds the article files.

    Returns:
        A list of dicts, one per readable file, with the keys ``"id"``
        (file name without extension), ``"content"`` (the UTF-8 decoded
        text) and ``"file_path"`` (the path as a string).

    Raises:
        FileNotFoundError: If ``articles_dir`` does not exist.
    """
    articles_path = Path(articles_dir)

    if not articles_path.exists():
        raise FileNotFoundError(f"Articles directory not found: {articles_dir}")

    articles = []
    # Path.glob yields entries in filesystem order, which is not stable
    # across platforms; sorting makes the load order reproducible.
    for file_path in sorted(articles_path.glob("*.txt")):
        try:
            content = file_path.read_text(encoding='utf-8')
        except Exception as e:
            # Best effort: an unreadable or badly encoded file is logged and
            # skipped so the remaining articles still load.
            logger.error(f"Error loading article {file_path}: {e}")
            continue

        articles.append({
            "id": file_path.stem,
            "content": content,
            "file_path": str(file_path)
        })

    return articles

# Load articles from the 'articles' folder, resolved relative to the
# notebook's working directory; load_articles raises FileNotFoundError if it is missing.
articles = load_articles('articles')
print(f"Loaded {len(articles)} articles")

In [None]:
# Process articles in parallel
async def process_articles_batch(articles, batch_size=5):
    """Run company extraction over ``articles``, ``batch_size`` at a time.

    Every batch is awaited with ``asyncio.gather(..., return_exceptions=True)``,
    so an article whose extraction fails is reported and skipped instead of
    aborting the whole run.

    Args:
        articles: Sequence of dicts with at least the keys ``"id"`` and
            ``"content"`` (the shape produced by ``load_articles``).
        batch_size: Maximum number of articles processed concurrently.
            Must be at least 1.

    Returns:
        A list with one dict per successfully processed article, holding
        ``"article_id"``, ``"companies"`` and ``"metrics"``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently skip every article.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    results = []
    total_batches = (len(articles) + batch_size - 1) // batch_size

    for batch_number, start in enumerate(range(0, len(articles), batch_size), start=1):
        batch = articles[start:start + batch_size]
        print(f"\nProcessing batch {batch_number}/{total_batches}")

        # Process batch concurrently
        tasks = [
            process_article_async(
                article["content"],
                company_database,
                llm,
                use_cache=True
            )
            for article in batch
        ]

        batch_results = await asyncio.gather(*tasks, return_exceptions=True)

        for article, outcome in zip(batch, batch_results):
            # With return_exceptions=True a failed task yields the exception
            # object itself, so it has to be detected BEFORE unpacking the
            # (companies, metrics) pair.
            if isinstance(outcome, BaseException):
                print(f"Error processing article {article['id']}: {outcome}")
                continue

            article_results, article_metrics = outcome

            results.append({
                "article_id": article["id"],
                "companies": article_results,
                "metrics": article_metrics
            })

            print(f"Processed article {article['id']}: {len(article_results)} companies found")
            print(f"Processing time: {article_metrics['execution_time']:.2f}s")

    return results

# Run the processing over every loaded article. This rebinds `results`,
# which previously held the company list from the single-article demo.
# Top-level `await` relies on the notebook kernel's event loop.
results = await process_articles_batch(articles)

# Save results as indented JSON in the working directory; opening with 'w'
# overwrites any file left by a previous run.
with open('extraction_results.json', 'w') as f:
    json.dump(results, f, indent=2)

# `results` only holds articles that were processed without error.
print(f"\nProcessed {len(results)} articles successfully")

## Analyze Results

Let's analyze the results and create some visualizations:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pull out the two per-article numbers the histograms will show
processing_times = [entry['metrics']['execution_time'] for entry in results]
companies_found = [len(entry['companies']) for entry in results]

# One row, two panels: timings on the left, company counts on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: how long each article took
sns.histplot(processing_times, ax=ax1)
ax1.set(
    title='Processing Time Distribution',
    xlabel='Time (seconds)',
    ylabel='Count',
)

# Right panel: how many companies each article yielded
sns.histplot(companies_found, ax=ax2)
ax2.set(
    title='Companies Found Distribution',
    xlabel='Number of Companies',
    ylabel='Count',
)

plt.tight_layout()
plt.show()