In [4]:
# Notebook 1: Setup, Configuration, and Database Initialization
# This notebook sets up the project configuration variables and initializes the SQLite database and its tables.

In [8]:
# === Project configuration ===
# Every tunable value used by the pipeline is declared in this cell.

# Central theme that the news and social-media analysis revolves around.
ANALYSIS_TOPIC = "Advancements in Renewable Energy Technologies"

# RSS sources to scrape for news articles, as {display name: feed URL}.
# Keep these URLs live and relevant to ANALYSIS_TOPIC; add further verified feeds as new entries.
NEWS_RSS_FEEDS = {
    "Guardian Environment": "https://www.theguardian.com/environment/rss",
    "Ars Technica": "http://feeds.arstechnica.com/arstechnica/index/",  # Example working feed
}

# Reddit sources, as {internal key: subreddit name}.
REDDIT_SUBREDDITS = {
    "RenewableEnergySub": "RenewableEnergy",
}

# How many recent posts to pull from each Reddit source per scraping cycle.
# Kept small for quick test runs; raise it for fuller coverage.
REDDIT_POST_LIMIT = 10

# File name of the SQLite database.
DATABASE_NAME = "trend_analyzer.db"

# Seconds between automated scheduler runs (one hour).
# Mainly consumed by the standalone app_scheduler.py script.
SCHEDULER_INTERVAL_SECONDS = 60 * 60

# Echo the loaded settings, one per line, so the cell output documents the run.
print(
    "Project configuration variables loaded.",
    f"Database will be: {DATABASE_NAME}",
    f"Analysis Topic set to: {ANALYSIS_TOPIC}",
    f"RSS Feeds configured: {list(NEWS_RSS_FEEDS.keys())}",
    f"Subreddits configured: {list(REDDIT_SUBREDDITS.values())}",
    sep="\n",
)

Project configuration variables loaded.
Database will be: trend_analyzer.db
Analysis Topic set to: Advancements in Renewable Energy Technologies
RSS Feeds configured: ['Guardian Environment', 'Ars Technica']
Subreddits configured: ['RenewableEnergy']


In [9]:
import sqlite3
from datetime import datetime # Not strictly needed in this cell if not used by these functions

# Assumes DATABASE_NAME and ANALYSIS_TOPIC are defined in the previous cell (Cell 2).

def create_connection(db_path=None):
    """Open a connection to the SQLite database with foreign keys enforced.

    SQLite ignores FOREIGN KEY constraints (including ON DELETE CASCADE)
    unless ``PRAGMA foreign_keys = ON`` is issued on each connection, so the
    pragma is applied here immediately after connecting.

    Args:
        db_path: Optional database path (or ":memory:"). When omitted, the
            module-level DATABASE_NAME is used, matching the previous behavior.

    Returns:
        An open ``sqlite3.Connection``, or ``None`` if the database could not
        be opened or configured (the error is printed).
    """
    # Resolve the target lazily so DATABASE_NAME is only needed for the default case.
    target = DATABASE_NAME if db_path is None else db_path
    conn = None
    try:
        conn = sqlite3.connect(target)
        # Must run outside a transaction; a fresh connection has none open.
        conn.execute("PRAGMA foreign_keys = ON")
    except sqlite3.Error as e:
        print(f"SQLite Error: Could not connect to database '{target}'. Reason: {e}")
        if conn is not None:
            # Connected but could not be configured: do not leak the handle.
            conn.close()
            conn = None
    return conn

def create_tables():
    """Ensure the project schema exists in the SQLite database.

    Opens a connection via create_connection() and issues
    CREATE TABLE IF NOT EXISTS for: articles, sentiments, keywords,
    entities and daily_trends. Commits on success; any sqlite3.Error is
    reported with print rather than raised. The connection is always closed.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    db = create_connection()
    if db is None:
        print("SQLite Error: Database connection could not be established for table creation.")
        return

    try:
        cur = db.cursor()

        # Tables whose DDL is fixed text, created in dependency order
        # (articles first, then the tables that reference it).
        static_ddl = (
            # articles: one row per scraped article or post.
            """
                CREATE TABLE IF NOT EXISTS articles (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source_url TEXT UNIQUE NOT NULL, -- Ensures each article is stored only once based on its URL
                    source_name TEXT,                 -- e.g., 'Guardian Environment', 'Reddit r/RenewableEnergy'
                    title TEXT,
                    raw_content TEXT,                 -- The initially scraped, unprocessed content
                    processed_content TEXT,           -- (Optional) For cleaned or main extracted text
                    publication_date TIMESTAMP,       -- Standardized to UTC
                    scraped_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP -- When the article was added to our DB
                );
            """,
            # sentiments: sentiment analysis results per article.
            """
                CREATE TABLE IF NOT EXISTS sentiments (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    article_id INTEGER NOT NULL,      -- Foreign key linking to the articles table
                    sentiment_score REAL,             -- e.g., VADER compound score (-1 to 1)
                    sentiment_label TEXT,             -- e.g., 'positive', 'negative', 'neutral'
                    FOREIGN KEY (article_id) REFERENCES articles (id) ON DELETE CASCADE
                );
            """,
            # keywords: extracted keywords and their scores per article.
            """
                CREATE TABLE IF NOT EXISTS keywords (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    article_id INTEGER NOT NULL,      -- Foreign key
                    keyword TEXT NOT NULL,
                    score REAL,                       -- e.g., TF-IDF score
                    FOREIGN KEY (article_id) REFERENCES articles (id) ON DELETE CASCADE,
                    UNIQUE (article_id, keyword)      -- Prevents duplicate keywords for the same article
                );
            """,
            # entities: named entities extracted from each article.
            """
                CREATE TABLE IF NOT EXISTS entities (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    article_id INTEGER NOT NULL,      -- Foreign key
                    entity_text TEXT NOT NULL,        -- The text of the entity (e.g., "SolarCorp")
                    entity_label TEXT,                -- The entity type (e.g., "ORG", "PERSON")
                    FOREIGN KEY (article_id) REFERENCES articles (id) ON DELETE CASCADE
                );
            """,
        )
        for statement in static_ddl:
            cur.execute(statement)

        # daily_trends: aggregated daily analytics for the main topic.
        # DDL cannot take bound parameters, so the default topic is embedded as
        # an SQL string literal; doubling single quotes keeps that literal valid.
        topic_literal = ANALYSIS_TOPIC.replace("'", "''")
        cur.execute(f"""
                CREATE TABLE IF NOT EXISTS daily_trends (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    trend_date DATE UNIQUE NOT NULL,    -- The specific date for which trends are calculated
                    topic TEXT DEFAULT '{topic_literal}', 
                    average_sentiment_score REAL,
                    top_keywords TEXT,                  -- JSON string of top keywords and their frequencies
                    emerging_keywords TEXT              -- JSON string of newly prominent keywords
                );
            """)

        db.commit()
        print("Database tables checked/created successfully.")
    except sqlite3.Error as err:
        print(f"SQLite Error: Could not create tables. Reason: {err}")
    finally:
        # Release the handle whether or not the schema was created.
        db.close()

print("Database management functions (create_connection, create_tables) are defined.")

Database management functions (create_connection, create_tables) are defined.


In [10]:
# Initialize the database by running create_tables(), so the schema exists before it is used.
# create_tables() prints its own success or failure message and does not raise on sqlite3.Error;
# the final message below is therefore printed either way and only marks the end of this cell.

print("Attempting to initialize database and create tables...")
create_tables() # Defined in the previous cell; creates any missing tables (IF NOT EXISTS)
print("Database initialization sequence finished.")

Attempting to initialize database and create tables...
Database tables checked/created successfully.
Database initialization sequence finished.
