In [1]:
# Setup and imports
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from tqdm.notebook import tqdm
import time

# Add project directory to path
# The notebook is expected to live one level below the project root, so the
# parent of the current working directory is what makes `src` and `config`
# importable.
project_root = Path('.').resolve().parent
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import project modules
from src.data_ingestion.news_scraper import NewsCollector
from src.data_ingestion.sec_filings import SECCollector
from src.models.lora_trainer import FinancialModelTrainer
from src.analysis.investment_analyzer import InvestmentAnalyzer
from src.vector_db.database import VectorDatabase
import config.settings as config

# Set up visualization
# IPython magic (not plain Python): selects the inline Matplotlib backend for this kernel.
%matplotlib inline
# Apply seaborn's "whitegrid" theme, then set the default figure size to 12x8 inches.
# Cells that pass their own figsize to plt.figure(...) override this default.
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Demo basket of tickers; later cells reuse `sample_tickers`
sample_tickers = ["AAPL", "GOOGL", "MSFT", "TSLA", "AMZN"]

# Collector that gathers recent news for the tickers above
news_collector = NewsCollector()

print(f"Collecting financial news for {len(sample_tickers)} tickers...")
news_collector.collect_recent_news(sample_tickers)

# Load the news CSV from the raw-data directory
# (presumably written by collect_recent_news -- confirm against NewsCollector)
news_csv_path = config.RAW_DATA_DIR / "financial_news.csv"
news_df = pd.read_csv(news_csv_path)
print(f"Collected {len(news_df)} news articles")

# Preview the first rows as the cell output
news_df.head()


In [None]:
# Visualize how the collected articles are distributed by sentiment and by ticker
def _plot_value_counts(counts, title, xlabel, ylabel, figsize):
    """Draw one bar chart for a ``value_counts()`` result on its own figure.

    Args:
        counts: pandas Series whose index holds the categories and whose
            values hold the number of rows per category.
        title: Figure title.
        xlabel: Label for the category axis.
        ylabel: Label for the count axis.
        figsize: ``(width, height)`` of the figure in inches.
    """
    # Explicit Axes interface: everything is drawn on this figure only,
    # independent of whatever the "current" pyplot figure happens to be.
    _, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


# Articles per sentiment label
sentiment_counts = news_df['sentiment'].value_counts()
_plot_value_counts(
    sentiment_counts,
    title='Distribution of News Sentiment',
    xlabel='Sentiment',
    ylabel='Count',
    figsize=(10, 6),
)

# Articles per ticker
ticker_counts = news_df['ticker'].value_counts()
_plot_value_counts(
    ticker_counts,
    title='News Articles by Ticker',
    xlabel='Ticker',
    ylabel='Number of Articles',
    figsize=(12, 6),
)


In [None]:
# Collector used to extract SEC filings
sec_collector = SECCollector()

# Request 10-K filings for every ticker in the sample basket
print("Collecting SEC filings...")
filing_form = '10-K'
filings = sec_collector.batch_extract_filings(sample_tickers, filing_form)

# Tabulate whatever the collector returned and report the row count
filings_df = pd.DataFrame(filings)
print(f"Collected {len(filings_df)} SEC filings")

# Preview the first rows as the cell output
filings_df.head()


In [None]:
# Bar chart of filings per type; nothing is drawn when the column is absent
has_filing_type = 'filing_type' in filings_df.columns
if has_filing_type:
    filing_counts = filings_df['filing_type'].value_counts()

    _, filing_ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=filing_counts.index, y=filing_counts.values, ax=filing_ax)
    filing_ax.set_title('SEC Filings by Type')
    filing_ax.set_xlabel('Filing Type')
    filing_ax.set_ylabel('Count')
    # Tilt the category labels by 45 degrees
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Vector store that receives the collected news articles
vector_db = VectorDatabase()

print("Storing financial data in vector database...")

# Columns copied from each news row into the stored document, in this order
news_fields = ("ticker", "title", "content", "published_date", "sentiment")

# Store one document per news row, counting successful calls
stored_count = 0
for _, article in news_df.iterrows():
    document = {field: article[field] for field in news_fields}
    vector_db.store_document(collection="financial_news", document=document)
    stored_count += 1

print(f"Stored {stored_count} documents in vector database")

# Sample query against the news collection, limited to three results
query_results = vector_db.query("financial_news", "earnings report", limit=3)
print("\nSample vector database query results:")
for rank, hit in enumerate(query_results, start=1):
    print(f"Result {rank}:")
    print(f"  Ticker: {hit['ticker']}")
    print(f"  Title: {hit['title']}")
    print(f"  Similarity: {hit['similarity']:.4f}")


In [None]:
# Initialize model trainer
# FinancialModelTrainer comes from src.models.lora_trainer (imported in the setup cell).
# The `trainer` instance is reused by the fine-tuning setup cell further down.
trainer = FinancialModelTrainer()

# Tiny hand-written instruction/response set, for demonstration purposes only
print("Creating small financial dataset for demonstration...")

# (instruction, response) pairs; converted to dicts below
qa_pairs = [
    (
        "What is the P/E ratio?",
        "The P/E ratio (Price-to-Earnings ratio) is calculated by dividing a company's share price by its earnings per share. It indicates how much investors are willing to pay for each dollar of earnings.",
    ),
    (
        "Explain what a 10-K filing is.",
        "A 10-K is an annual report required by the SEC that provides a comprehensive overview of a company's financial performance. It includes audited financial statements, management discussion and analysis, and disclosures about risks and operations.",
    ),
    (
        "What does 'bullish' mean in stock market terms?",
        "In stock market terms, 'bullish' refers to an optimistic outlook that expects prices to rise. Investors who are bullish believe that a stock or the overall market will increase in value.",
    ),
    (
        "How do you calculate market capitalization?",
        "Market capitalization is calculated by multiplying a company's share price by its total number of outstanding shares. It represents the total market value of a company's shares.",
    ),
    (
        "What is a stock dividend?",
        "A stock dividend is a payment to shareholders made in additional shares rather than cash. It's typically expressed as a percentage of existing holdings (e.g., a 5% stock dividend means 5 new shares for every 100 owned).",
    ),
]
sample_data = [
    {"instruction": question, "response": answer}
    for question, answer in qa_pairs
]

# Preview the examples as a table (cell output)
pd.DataFrame(sample_data).head()


In [None]:
# Fine-tuning walkthrough: set the model up and build the LoRA config.
# This cell calls no training method; the loss curve below is mock data.
print("Setting up model for fine-tuning demonstration...")
trainer.setup_model()

print("\nModel architecture:")
print(f"Base model: {trainer.model_name}")
print(f"Creating LoRA configuration with r={config.LORA_R}, alpha={config.LORA_ALPHA}")

# Build the LoRA configuration only
lora_config = trainer.create_lora_config()

# Hyperparameters as defined in config.settings
print("\nTraining parameters:")
print(f"Learning rate: {config.LEARNING_RATE}")
print(f"Batch size: {config.BATCH_SIZE}")
print(f"Max sequence length: {config.MAX_LENGTH}")

# Hard-coded loss values standing in for a real training run
print("\nMock training results (for demonstration):")
mock_losses = [1.2, 0.9, 0.7, 0.5, 0.4, 0.35, 0.32, 0.31, 0.30, 0.29]
mock_steps = list(range(len(mock_losses)))

_, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(mock_steps, mock_losses)
loss_ax.set_title('Training Loss Over Time')
loss_ax.set_xlabel('Training Steps')
loss_ax.set_ylabel('Loss')
loss_ax.grid(True)
plt.show()

print("\nFine-tuning process would save the model to:", config.FINE_TUNED_MODEL_PATH)


In [None]:
# Run the investment analyzer once per demo ticker and tabulate the results
analyzer = InvestmentAnalyzer()
tickers = ["AAPL", "MSFT", "GOOGL", "TSLA", "AMZN"]

print("Performing investment analysis on sample stocks...")
# tqdm renders a progress bar while each ticker is analyzed in turn
analysis_results = [analyzer.analyze_stock(symbol) for symbol in tqdm(tickers)]

# One row per analyzed ticker; shown as the cell output
results_df = pd.DataFrame(analysis_results)
results_df


In [None]:
# Visualize investment recommendations
# One colour per recommendation label, shared by both charts in this cell.
colors = {'BUY': 'green', 'HOLD': 'blue', 'SELL': 'red'}
# Labels outside BUY/HOLD/SELL get this colour instead of raising KeyError.
fallback_color = 'gray'

# --- Chart 1: number of stocks per recommendation ---
recommendation_counts = results_df['recommendation'].value_counts()
_, rec_ax = plt.subplots(figsize=(12, 6))
# Plain Matplotlib bars with explicit per-bar colours: this avoids seaborn's
# deprecated "palette without hue" usage.
rec_ax.bar(
    [str(label) for label in recommendation_counts.index],
    recommendation_counts.values,
    color=[colors.get(label, fallback_color) for label in recommendation_counts.index],
)
rec_ax.set(title='Investment Recommendations', xlabel='Recommendation', ylabel='Count')
plt.show()

# --- Chart 2: sentiment vs. financial health, coloured by recommendation ---
_, health_ax = plt.subplots(figsize=(10, 6))

# One scatter call per recommendation so each group gets its own legend entry.
for label, group in results_df.groupby('recommendation', sort=False):
    health_ax.scatter(
        group['sentiment_score'],
        group['financial_health'],
        color=colors.get(label, fallback_color),
        s=100,
        alpha=0.7,
        label=str(label),
    )

# Annotate every point with its ticker. Iterating the columns together is
# positional, so it also works when results_df has a non-default index.
for ticker, sentiment, health in zip(
    results_df['ticker'],
    results_df['sentiment_score'],
    results_df['financial_health'],
):
    health_ax.annotate(ticker, (sentiment, health),
                       xytext=(5, 5), textcoords='offset points')

health_ax.set(
    title='Sentiment vs Financial Health by Stock',
    xlabel='Sentiment Score',
    ylabel='Financial Health Score',
)
health_ax.grid(True)
# A colorbar needs a colormapped artist; the points use explicit named
# colours, so a legend is the meaningful key for them.
health_ax.legend(title='Recommendation')
plt.tight_layout()
plt.show()


In [None]:
# Define API URL (assuming the API is running locally)
# Base URL without a trailing slash: the request helpers in this notebook
# append paths such as "/health" directly to it.
API_URL = "http://localhost:8000"

# Function to check if API is running
def is_api_running(timeout=3):
    """Report whether the API answers its health check.

    Sends ``GET {API_URL}/health`` and treats an HTTP 200 answer as "running".

    Args:
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the notebook.

    Returns:
        bool: True if the request succeeded with status code 200; False if the
        status differs or the request failed (connection error, timeout, ...).
    """
    try:
        response = requests.get(f"{API_URL}/health", timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-level failures mean "not running"; anything else
        # (KeyboardInterrupt, programming errors) is allowed to surface.
        return False
    return response.status_code == 200

# Tell the reader whether the local API answered; the notebook continues either way
if not is_api_running():
    print("API is not running. Please start the API with: python main.py --mode api")
    print("For demonstration purposes, we'll show example code that would work if the API were running.")
else:
    print("API is running and accessible ✅")


In [None]:
# Example 1: Health Check
def api_health_check(timeout=10):
    """Call the API's ``/health`` endpoint and return its decoded JSON body.

    Args:
        timeout: Seconds to wait for the server; without a timeout the
            request could block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(f"{API_URL}/health", timeout=timeout)
    return response.json()

# Example 2: Analyze a stock
def api_analyze_stock(ticker, timeout=10):
    """Request the analysis for one ticker from ``/analyze/{ticker}``.

    Args:
        ticker: Ticker symbol inserted into the request path, e.g. "AAPL".
        timeout: Seconds to wait for the server; without a timeout the
            request could block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(f"{API_URL}/analyze/{ticker}", timeout=timeout)
    return response.json()

# Example 3: Get news for a ticker
def api_get_news(ticker, timeout=10):
    """Request the news for one ticker from ``/news/{ticker}``.

    Args:
        ticker: Ticker symbol inserted into the request path, e.g. "AAPL".
        timeout: Seconds to wait for the server; without a timeout the
            request could block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(f"{API_URL}/news/{ticker}", timeout=timeout)
    return response.json()

# Show one response per endpoint: live when the API answers, canned otherwise
print("\nExample API Responses:")

if is_api_running():
    # API reachable: query each endpoint and print what it returns
    print("\nHealth Check Response:")
    print(api_health_check())

    print("\nStock Analysis Response:")
    print(api_analyze_stock("AAPL"))

    print("\nNews Articles Response:")
    print(api_get_news("AAPL"))
else:
    # API not reachable: hard-coded stand-ins for the three responses
    mock_health_response = {"status": "healthy"}
    mock_analyze_response = {
        "ticker": "AAPL",
        "sentiment_score": 0.78,
        "financial_health": 0.82,
        "recommendation": "BUY",
        "confidence": 0.55
    }
    mock_news_response = {
        "ticker": "AAPL",
        "articles": [
            {"title": "Apple Reports Strong Q4 Earnings", "sentiment": "positive"},
            {"title": "Apple's New Product Launch Event Scheduled", "sentiment": "neutral"}
        ]
    }

    mock_sections = (
        ("\nHealth Check Response:", mock_health_response),
        ("\nStock Analysis Response:", mock_analyze_response),
        ("\nNews Articles Response:", mock_news_response),
    )
    for heading, payload in mock_sections:
        print(heading)
        print(payload)


In [None]:
# Create Python code examples for using the API
# NOTE: the snippet below is a plain string that is only printed, never executed
# in this notebook, so the names it defines (response, results, analysis_df, ...)
# do not enter the notebook namespace.
python_code = '''
import requests

# API base URL
API_URL = "http://localhost:8000"

# Example 1: Get health status
response = requests.get(f"{API_URL}/health")
health_status = response.json()
print(f"API Health: {health_status['status']}")

# Example 2: Analyze a stock
ticker = "AAPL"
response = requests.get(f"{API_URL}/analyze/{ticker}")
analysis = response.json()
print(f"Recommendation for {ticker}: {analysis['recommendation']}")
print(f"Confidence: {analysis['confidence']:.2f}")

# Example 3: Process multiple stocks
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]
results = []

for ticker in tickers:
    response = requests.get(f"{API_URL}/analyze/{ticker}")
    results.append(response.json())
    
# Create DataFrame from results
import pandas as pd
analysis_df = pd.DataFrame(results)
print(analysis_df[['ticker', 'recommendation', 'sentiment_score', 'financial_health']])
'''

# Print the snippet verbatim under a heading
print("Python Code Example for API Usage:")
print(python_code)


In [None]:
# Closing recap: the components shown in this notebook, then suggested follow-ups.
# Entries that begin with "\n" start a new section with a blank line above it.
summary_lines = [
    "Financial Market Analyst - Project Summary",
    "=" * 50,
    "\nComponents Demonstrated:",
    "1. Data Collection",
    "   - Financial news gathering",
    "   - SEC filings integration",
    "   - Vector database storage",
    "\n2. Model Training",
    "   - LoRA fine-tuning setup",
    "   - Financial dataset preparation",
    "   - Training visualization",
    "\n3. Investment Analysis",
    "   - Sentiment analysis",
    "   - Financial health assessment",
    "   - Investment recommendations",
    "\n4. API Integration",
    "   - REST API endpoints",
    "   - Programmatic access",
    "   - Data visualization",
]

next_step_lines = [
    "\nNext Steps:",
    "1. Expand the financial dataset with more examples",
    "2. Improve the sentiment analysis with financial-specific models",
    "3. Add technical indicators to the investment analysis",
    "4. Create a simple web dashboard for visualizing results",
    "5. Implement historical backtesting of recommendations",
]

# Emit every entry on its own line, in order
for line in summary_lines + next_step_lines:
    print(line)
