In [None]:
import anthropic
import pandas as pd
import sqlite3
import os
from datetime import datetime

In [None]:
# Database setup - Single Company table architecture
# Path of the SQLite file used by every database helper in this notebook.
# It is relative, so it resolves against the kernel's current working directory.
DB_PATH = "./jobly.db"

def init_database():
    """Create the SQLite database at ``DB_PATH`` and ensure the ``companies`` table exists.

    Switches the database to WAL journaling, applies a few performance pragmas
    to the setup connection, and creates the single ``companies`` table (one
    row per company, holding both the ``is_good`` and ``is_local`` analysis
    columns) when it is missing.  Safe to call repeatedly.

    Raises:
        sqlite3.Error: if the file cannot be opened or a statement fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Enable WAL mode for better concurrency and performance
        for pragma in (
            "PRAGMA journal_mode=WAL",
            "PRAGMA synchronous=NORMAL",
            "PRAGMA cache_size=10000",
            "PRAGMA temp_store=MEMORY",
        ):
            cursor.execute(pragma)

        # Create single companies table with all analysis columns
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS companies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT UNIQUE NOT NULL,
                is_good BOOLEAN,
                is_good_msg TEXT,
                is_good_err BOOLEAN,
                is_local BOOLEAN,
                is_local_msg TEXT,
                is_local_err BOOLEAN,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        conn.commit()
    finally:
        # Close even when a statement fails so no handle is left open on the file.
        conn.close()
    print("Database initialized with single Company table and WAL mode!")

# Initialize the database
# Runs every time this cell is executed; the CREATE TABLE IF NOT EXISTS inside
# init_database() leaves an already-existing table and its rows untouched.
init_database()


In [None]:
# Database helper functions for single table architecture (with better connection management)
def get_or_create_company(company_name):
    """Return the ``companies.id`` for ``company_name``, inserting the row if needed.

    Args:
        company_name: Value stored in the unique ``name`` column.

    Returns:
        The integer primary key of the existing or newly inserted row.

    Raises:
        sqlite3.OperationalError: re-raised (after logging) on database errors
            such as a locked file or a missing table.
        ValueError: if no row exists after the insert attempt (for example the
            name was ``None`` and the NOT NULL constraint made SQLite ignore it).
    """
    lookup_sql = "SELECT id FROM companies WHERE name = ?"
    try:
        conn = sqlite3.connect(DB_PATH, timeout=30)
        try:
            # Enable WAL mode for better concurrency
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
            conn.execute("PRAGMA cache_size=10000")

            row = conn.execute(lookup_sql, (company_name,)).fetchone()
            if row is None:
                # INSERT OR IGNORE tolerates a concurrent insert of the same name;
                # re-select afterwards instead of trusting lastrowid, which is not
                # meaningful when the insert was ignored.
                conn.execute("INSERT OR IGNORE INTO companies (name) VALUES (?)", (company_name,))
                conn.commit()
                row = conn.execute(lookup_sql, (company_name,)).fetchone()

            if row is None:
                raise ValueError(f"Could not create company {company_name!r}")
            return row[0]
        finally:
            # The sqlite3 context manager only commits/rolls back; close explicitly.
            conn.close()
    except sqlite3.OperationalError as e:
        print(f"Database error in get_or_create_company: {e}")
        raise

def update_company_analysis(company_name, analysis_type, is_positive, message, is_error):
    """Store one analysis result on the row of ``company_name``.

    Args:
        company_name: Value of the ``name`` column identifying the row.
        analysis_type: ``'is_good'`` or ``'is_local'``; selects which trio of
            columns (``<type>``, ``<type>_msg``, ``<type>_err``) is written.
        is_positive: Verdict stored in the ``<type>`` column.
        message: Text stored in ``<type>_msg``.
        is_error: Flag stored in ``<type>_err``.

    Raises:
        ValueError: if ``analysis_type`` is not one of the two supported values
            (previously such a call silently did nothing).
        sqlite3.OperationalError: re-raised (after logging) on database errors.
    """
    if analysis_type not in ('is_good', 'is_local'):
        raise ValueError(f"Unknown analysis_type: {analysis_type!r}")

    # Column names come from the fixed whitelist above, so interpolating them is
    # safe; all values are still bound as parameters.
    sql = f'''
        UPDATE companies
        SET {analysis_type} = ?, {analysis_type}_msg = ?, {analysis_type}_err = ?,
            updated_at = CURRENT_TIMESTAMP
        WHERE name = ?
    '''
    try:
        conn = sqlite3.connect(DB_PATH, timeout=30)
        try:
            # Enable WAL mode for better concurrency
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
            conn.execute("PRAGMA cache_size=10000")

            conn.execute(sql, (is_positive, message, is_error, company_name))
            conn.commit()
        finally:
            # The sqlite3 context manager does not close the connection; do it here.
            conn.close()
    except sqlite3.OperationalError as e:
        print(f"Database error in update_company_analysis: {e}")
        raise

def get_companies_needing_analysis(analysis_type):
    """List the names of companies that still lack the given analysis.

    Args:
        analysis_type: ``'is_good'`` selects rows whose ``is_good`` is NULL;
            ``'is_local'`` selects rows with ``is_good = 1`` whose ``is_local``
            is NULL.

    Returns:
        A list of company names; an empty list when the query fails.

    Raises:
        ValueError: if ``analysis_type`` is not one of the two supported values
            (previously this surfaced as an UnboundLocalError).
    """
    queries = {
        'is_good': "SELECT name FROM companies WHERE is_good IS NULL",
        'is_local': "SELECT name FROM companies WHERE is_good = 1 AND is_local IS NULL",
    }
    if analysis_type not in queries:
        raise ValueError(f"Unknown analysis_type: {analysis_type!r}")

    try:
        conn = sqlite3.connect(DB_PATH, timeout=10)
        try:
            df = pd.read_sql_query(queries[analysis_type], conn)
        finally:
            conn.close()
        return df['name'].tolist()
    # pandas wraps driver errors raised while executing the query in its own
    # DatabaseError, so catching only sqlite3.OperationalError misses them.
    except (sqlite3.OperationalError, pd.errors.DatabaseError) as e:
        print(f"Database error in get_companies_needing_analysis: {e}")
        return []

def get_all_companies():
    """Return every row of ``companies`` ordered by name.

    Returns:
        A DataFrame with the columns ``id, name, is_good, is_good_msg,
        is_good_err, is_local, is_local_msg, is_local_err, created_at,
        updated_at``.  When the query fails the frame is empty but still
        carries those columns, so callers can index them without a KeyError.
    """
    columns = [
        'id', 'name', 'is_good', 'is_good_msg', 'is_good_err',
        'is_local', 'is_local_msg', 'is_local_err', 'created_at', 'updated_at',
    ]
    query = f'''
        SELECT {", ".join(columns)}
        FROM companies
        ORDER BY name
    '''
    try:
        conn = sqlite3.connect(DB_PATH, timeout=10)
        try:
            return pd.read_sql_query(query, conn)
        finally:
            # The sqlite3 context manager would not close the connection.
            conn.close()
    # pandas re-raises driver errors from the query as pandas.errors.DatabaseError.
    except (sqlite3.OperationalError, pd.errors.DatabaseError) as e:
        print(f"Database error in get_all_companies: {e}")
        return pd.DataFrame(columns=columns)

def get_good_companies():
    """Return the names of all companies whose ``is_good`` column equals 1.

    Returns:
        A list of names; an empty list when the query fails.
    """
    try:
        conn = sqlite3.connect(DB_PATH, timeout=10)
        try:
            df = pd.read_sql_query("SELECT name FROM companies WHERE is_good = 1", conn)
        finally:
            # Explicit close: `with sqlite3.connect(...)` only ends the transaction.
            conn.close()
        return df['name'].tolist()
    # pandas re-raises driver errors from the query as pandas.errors.DatabaseError.
    except (sqlite3.OperationalError, pd.errors.DatabaseError) as e:
        print(f"Database error in get_good_companies: {e}")
        return []

def get_final_candidates():
    """Return the companies flagged both ``is_good = 1`` and ``is_local = 1``.

    Returns:
        A DataFrame with the columns ``name``, ``is_good_msg`` and
        ``is_local_msg``, ordered by name.  When the query fails the frame is
        empty but keeps those columns.
    """
    query = '''
        SELECT name, is_good_msg, is_local_msg
        FROM companies
        WHERE is_good = 1 AND is_local = 1
        ORDER BY name
    '''
    try:
        conn = sqlite3.connect(DB_PATH, timeout=10)
        try:
            return pd.read_sql_query(query, conn)
        finally:
            # Explicit close: `with sqlite3.connect(...)` only ends the transaction.
            conn.close()
    # pandas re-raises driver errors from the query as pandas.errors.DatabaseError.
    except (sqlite3.OperationalError, pd.errors.DatabaseError) as e:
        print(f"Database error in get_final_candidates: {e}")
        return pd.DataFrame(columns=['name', 'is_good_msg', 'is_local_msg'])

# Confirmation shown once the helper definitions in this cell have been executed.
print("Database helper functions loaded with improved connection management!")


In [None]:
# Workflow functions for the complete process
def import_companies_from_csv(csv_path, company_column='Company'):
    """Read company names from a CSV and make sure each has a row in the database.

    The first two lines of the file are skipped before the header row is read.
    Blank company cells are dropped up front so they are neither counted nor
    returned.

    Args:
        csv_path: Path of the CSV file to read.
        company_column: Name of the column holding the company names.

    Returns:
        Array of the unique, non-missing company names found in the CSV.

    Raises:
        KeyError: if ``company_column`` is not a column of the CSV.
    """
    print(f"Reading companies from {csv_path}...")

    # Read CSV
    df = pd.read_csv(csv_path, skiprows=2)  # Assuming same format as Connections.csv
    companies = df[company_column].dropna().unique()

    print(f"Found {len(companies)} unique companies in CSV")

    # Add companies to database.  The counter reflects successful
    # get_or_create_company calls, which includes names that already existed.
    added_count = 0
    for company_name in companies:
        try:
            get_or_create_company(company_name)
            added_count += 1
        except Exception as e:
            # Best effort: one bad name must not stop the rest of the import.
            print(f"Error adding company '{company_name}': {e}")

    print(f"Added {added_count} companies to database")
    return companies

def run_analysis_batch(companies, prompt_template, analysis_type):
    """Ask the model about each company and record the verdict in the database.

    For every name the prompt template is formatted with ``company_name``, sent
    through ``query_claude``, and the text of the last content block is
    classified: ending in ``TRUE`` -> positive, ending in ``FALSE`` -> negative,
    anything else -> flagged as an error.  Surrounding whitespace and a trailing
    period or quote are ignored when classifying, but the reply is stored
    unchanged.

    Args:
        companies: Sequence of company names to analyse.
        prompt_template: ``str.format`` template containing ``{company_name}``.
        analysis_type: Passed through to ``update_company_analysis``.
    """
    total_input_tokens = 0
    total_output_tokens = 0
    total = len(companies)

    # enumerate keeps the progress number aligned with the position in the
    # batch even when earlier companies failed.
    for position, company_name in enumerate(companies, start=1):
        try:
            formatted_prompt = prompt_template.format(company_name=company_name)
            message = query_claude(formatted_prompt)

            final_text = message.content[-1].text
            # Tolerate replies such as 'TRUE\n' or '"FALSE".'.
            verdict = final_text.strip().rstrip('."\'')
            is_positive = verdict.endswith('TRUE')
            is_error = not (is_positive or verdict.endswith('FALSE'))

            # Update database
            update_company_analysis(company_name, analysis_type, is_positive, final_text, is_error)

            total_input_tokens += message.usage.input_tokens
            total_output_tokens += message.usage.output_tokens

            print(f"{position:3d}/{total}: {company_name} - {analysis_type}: {is_positive} (Error: {is_error})")

        except Exception as e:
            print(f"Error processing {company_name}: {e}")
            try:
                update_company_analysis(company_name, analysis_type, False, str(e), True)
            except Exception as db_error:
                # Recording the failure is best effort; keep going with the batch.
                print(f"Could not record error for {company_name}: {db_error}")

    print(f"\nAnalysis complete! Total tokens - Input: {total_input_tokens}, Output: {total_output_tokens}")

def run_complete_workflow(csv_path, good_prompt, local_prompt):
    """Import the CSV, run both analysis passes, and return the final candidates.

    The ``is_good`` pass runs first; the ``is_local`` pass then covers whatever
    ``get_companies_needing_analysis('is_local')`` reports afterwards.

    Args:
        csv_path: CSV handed to ``import_companies_from_csv``.
        good_prompt: Prompt template for the ``is_good`` pass.
        local_prompt: Prompt template for the ``is_local`` pass.

    Returns:
        Whatever ``get_final_candidates()`` returns after both passes.
    """
    # Step 1: make sure every company from the CSV has a database row.
    import_companies_from_csv(csv_path)

    # Steps 2 and 3: the two passes differ only in type, prompt and wording.
    passes = (
        ('is_good', good_prompt, "companies"),
        ('is_local', local_prompt, "good companies"),
    )
    for analysis_type, prompt, noun in passes:
        print(f"\n=== Running {analysis_type} analysis ===")
        pending = get_companies_needing_analysis(analysis_type)
        print(f"Found {len(pending)} {noun} needing {analysis_type} analysis")

        if not pending:
            print(f"All {noun} already have {analysis_type} analysis")
            continue
        run_analysis_batch(pending, prompt, analysis_type)

    # Step 4: report and hand back the companies that passed both checks.
    print("\n=== Final Results ===")
    winners = get_final_candidates()
    print(f"Final candidates (good AND local/remote): {len(winners)}")

    return winners

# Confirmation shown once the workflow definitions in this cell have been executed.
print("Workflow functions loaded!")


In [None]:
# Define the analysis prompts
# Each template contains one {company_name} placeholder, which run_analysis_batch
# fills in with str.format before sending the prompt.  run_analysis_batch decides
# the verdict by checking whether the reply ends with TRUE or FALSE, which is why
# both prompts ask for exactly one of those words.

# First pass: does the company work on a pro-social mission?
good_prompt = """
I am looking for jobs. I am a data scientist looking for a company or non-profit working on a pro-social mission. 
Some example cause areas: climate change, healthcare, preserving democracy, wealth inequality, education. 
But I am interested in any others that are for the benefit of the greater good.
Can you please let me know if this company fits that above description: {company_name}. 
If it does meet this conditions, just reply "TRUE". If it does not, just reply "FALSE". Do not respond with the reasoning for this decision, 
simply respond "TRUE" or "FALSE".
"""

# Second pass: is the company fully remote or based in Colorado?
local_prompt = """
Can you please let me know if the company {company_name} is either fully remote or based in Colorado?
I live in Colorado and can only work for a company that is based in Colorado or fully remote.
If it is remote or in Colorado, just reply "TRUE". If it is neither, just reply "FALSE". 
Do not respond with the reasoning for this decision, simply respond "TRUE" or "FALSE".
"""

print("Analysis prompts defined!")


In [None]:
# Example usage and utility functions
def show_database_status():
    """Print headline counts for the ``companies`` table.

    Reports the total number of rows, how many are flagged good, how many are
    flagged local/remote, how many are both, and how many still await each
    analysis pass.  If ``get_all_companies`` returns a frame without the flag
    columns (its fallback when the database cannot be read), the flag counts
    are reported as 0 instead of raising a KeyError.
    """
    all_companies = get_all_companies()

    total = len(all_companies)
    if {'is_good', 'is_local'}.issubset(all_companies.columns):
        good_mask = all_companies['is_good'] == True
        local_mask = all_companies['is_local'] == True
        good_count = int(good_mask.sum())
        local_count = int(local_mask.sum())
        final_count = int((good_mask & local_mask).sum())
    else:
        good_count = local_count = final_count = 0

    print("=== Database Status ===")
    print(f"Total companies: {total}")
    print(f"Good companies: {good_count}")
    print(f"Local/remote companies: {local_count}")
    print(f"Final candidates (good AND local): {final_count}")
    print(f"Companies needing is_good analysis: {len(get_companies_needing_analysis('is_good'))}")
    print(f"Good companies needing is_local analysis: {len(get_companies_needing_analysis('is_local'))}")

def export_results_to_csv():
    """Write the final candidates to a timestamped CSV under ``./output``.

    The output directory is created when missing.  Nothing is written when
    ``get_final_candidates()`` returns no rows.
    """
    final_candidates = get_final_candidates()
    if len(final_candidates) > 0:
        # to_csv does not create parent directories, so make sure it exists.
        os.makedirs("./output", exist_ok=True)
        filename = f"./output/final_candidates_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        final_candidates.to_csv(filename, index=False)
        print(f"Exported {len(final_candidates)} final candidates to {filename}")
    else:
        print("No final candidates to export")

# Example: Run the complete workflow
# Uncomment the line below to run the complete process
# NOTE: run_analysis_batch calls query_claude, which is defined in a later cell
# of this notebook; that cell (and the one creating `client`) must have been
# executed before the workflow is started.
# final_candidates = run_complete_workflow("./input/Connections.csv", good_prompt, local_prompt)

print("Ready to run! Uncomment the line above to start the complete workflow.")


In [None]:
# # Individual step examples (uncomment to run specific steps)

# # Step 1: Import companies from CSV
# companies = import_companies_from_csv("./input/Connections.csv")

# # Step 2: Run only is_good analysis
# companies_needing_good = get_companies_needing_analysis('is_good')
# if companies_needing_good:
#     run_analysis_batch(companies_needing_good, good_prompt, 'is_good')

# # Step 3: Run only is_local analysis (on good companies)
# companies_needing_local = get_companies_needing_analysis('is_local')
# if companies_needing_local:
#     run_analysis_batch(companies_needing_local, local_prompt, 'is_local')

# # Check database status
# show_database_status()

# # View all companies
# all_companies = get_all_companies()
# print(all_companies.head(10))

# # View final candidates
# final_candidates = get_final_candidates()
# print(final_candidates)

# # Export results
# export_results_to_csv()

# print("Individual step examples ready! Uncomment the lines above to run specific steps.")


In [None]:
# Fix database lock issues and add better connection management
import os
import time

def force_unlock_database():
    """Check whether a write lock on the database can be obtained right now.

    Opening and closing a connection cannot release locks held by other
    connections, and ``sqlite3.connect`` itself takes no lock, so simply
    connecting never detected a locked database.  This probe starts an
    immediate (write) transaction and rolls it back: if another connection
    holds the lock for longer than the 1-second timeout, SQLite raises
    ``OperationalError`` and the "still locked" advice is printed.
    """
    try:
        conn = sqlite3.connect(DB_PATH, timeout=1)
        try:
            # BEGIN IMMEDIATE requests the write lock without changing any data.
            conn.execute("BEGIN IMMEDIATE")
            conn.rollback()
        finally:
            conn.close()
        print("Database unlocked successfully!")
    except sqlite3.OperationalError as e:
        print(f"Database still locked: {e}")
        print("Try restarting your kernel (Kernel -> Restart) and run the cells again")

def check_database_status():
    """Report whether the ``companies`` table can be queried.

    Returns:
        True when ``SELECT COUNT(*) FROM companies`` succeeds (the row count is
        printed), False when SQLite raises ``OperationalError`` (for example a
        locked database or a missing table).
    """
    try:
        conn = sqlite3.connect(DB_PATH, timeout=5)
        try:
            count = conn.execute("SELECT COUNT(*) FROM companies").fetchone()[0]
        finally:
            # Close on the failure path too, so the probe never leaves a
            # connection open on the file it is diagnosing.
            conn.close()
        print(f"Database is accessible. Contains {count} companies.")
        return True
    except sqlite3.OperationalError as e:
        print(f"Database error: {e}")
        return False

# Try to unlock the database
# Both calls run every time this cell is executed; check_database_status()
# also returns True/False, which is ignored here.
force_unlock_database()
check_database_status()


In [None]:
# Troubleshooting database lock issues

def troubleshoot_database():
    """Print a short diagnosis of the database file at ``DB_PATH``.

    Reports whether the file exists and its size, then uses
    ``check_database_status()`` to see whether it can be queried; when it
    cannot, a list of suggested remedies is printed.
    """
    print("=== Database Troubleshooting ===")

    # Without a file there is nothing further to inspect.
    if not os.path.exists(DB_PATH):
        print(f"✗ Database file not found: {DB_PATH}")
        return

    print(f"✓ Database file exists: {DB_PATH}")
    size_in_bytes = os.path.getsize(DB_PATH)
    print(f"  File size: {size_in_bytes} bytes")

    # Try to access database
    if check_database_status():
        print("✓ Database is accessible")
        return

    advice = (
        "✗ Database is locked or corrupted",
        "\nTry these solutions:",
        "1. Restart your Jupyter kernel (Kernel -> Restart)",
        "2. Close any other programs that might be using the database",
        "3. Delete the database file and reinitialize:",
        f"   os.remove('{DB_PATH}')",
        "   init_database()",
    )
    for line in advice:
        print(line)

def reset_database():
    """Delete the database file and its sidecar files, then recreate it (use with caution!).

    SQLite keeps ``-wal``, ``-shm`` and ``-journal`` files next to the main
    database file.  They are removed together with it so that a write-ahead
    log left over from the old database is not found beside the new one.
    Any error is printed rather than raised.
    """
    try:
        if os.path.exists(DB_PATH):
            os.remove(DB_PATH)
            print(f"Removed database file: {DB_PATH}")

        for suffix in ("-wal", "-shm", "-journal"):
            sidecar = DB_PATH + suffix
            if os.path.exists(sidecar):
                os.remove(sidecar)
                print(f"Removed sidecar file: {sidecar}")

        init_database()
        print("Database reset successfully!")
    except Exception as e:
        print(f"Error resetting database: {e}")

# Run troubleshooting
# Only prints a diagnosis; nothing in troubleshoot_database() modifies the database.
troubleshoot_database()


In [None]:
# Aggressive database unlock methods
import subprocess
import signal
import psutil

def force_kill_database_connections():
    """Report (but do not kill) processes that have the database files open.

    Despite its name this function only prints the name and PID of each
    process whose open files include the database or one of its ``-wal``,
    ``-shm`` or ``-journal`` sidecars.  Paths are compared after resolving
    them to absolute form, because psutil reports absolute paths while
    ``DB_PATH`` is relative.
    """
    db_file = os.path.realpath(DB_PATH)
    watched = {db_file, db_file + "-wal", db_file + "-shm", db_file + "-journal"}

    try:
        # Find processes using the database file
        for proc in psutil.process_iter(['pid', 'name', 'open_files']):
            try:
                # open_files may be None or empty for this process.
                open_files = proc.info['open_files'] or []
                if any(os.path.realpath(file_info.path) in watched for file_info in open_files):
                    print(f"Found process {proc.info['name']} (PID: {proc.info['pid']}) using database")
                    # Don't actually kill it, just report
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                pass
    except Exception as e:
        print(f"Error checking processes: {e}")

def aggressive_unlock():
    """Try several ways of getting the database back into a usable state.

    1. Switch to WAL mode and run a TRUNCATE checkpoint.
    2. Open a fresh connection and run a trivial query.
    3. Delete any ``-wal`` / ``-shm`` / ``-journal`` files next to the database.

    Each step prints a ✓ or ✗ line.  Connections are closed after leaving the
    guarded section: closing inside a ``with sqlite3.connect(...)`` block makes
    the context manager commit on a closed connection, which raised and made
    steps 1 and 2 report failure even when they had succeeded.
    """
    print("=== Aggressive Database Unlock ===")

    # Method 1: Try with WAL mode
    try:
        conn = sqlite3.connect(DB_PATH, timeout=1)
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            outcome = conn.execute("PRAGMA wal_checkpoint(TRUNCATE)").fetchone()
        finally:
            conn.close()
        # The first column of the pragma result is 1 when the checkpoint was blocked.
        if outcome and outcome[0]:
            raise sqlite3.OperationalError("checkpoint blocked by another connection")
        print("✓ WAL checkpoint completed")
    except Exception as e:
        print(f"✗ WAL checkpoint failed: {e}")

    # Method 2: Try to force close all connections
    try:
        conn = sqlite3.connect(DB_PATH, timeout=1)
        try:
            conn.execute("PRAGMA busy_timeout=1000")
            conn.execute("SELECT 1")
        finally:
            conn.close()
        print("✓ Connection test passed")
    except Exception as e:
        print(f"✗ Connection test failed: {e}")

    # Method 3: Check for lock files
    # NOTE(review): a -wal file can hold committed data that has not been
    # checkpointed yet; deleting it while another process uses the database
    # can lose that data.  Behavior kept as in the original.
    lock_files = [DB_PATH + "-wal", DB_PATH + "-shm", DB_PATH + "-journal"]
    for lock_file in lock_files:
        if os.path.exists(lock_file):
            try:
                os.remove(lock_file)
                print(f"✓ Removed lock file: {lock_file}")
            except Exception as e:
                print(f"✗ Could not remove {lock_file}: {e}")

def nuclear_option():
    """Back up the database, delete it with its sidecar files, and recreate it.

    When no database file exists a new one is simply created.  Otherwise a
    timestamped copy (``<DB_PATH>.backup_<YYYYmmdd_HHMMSS>``) is attempted
    through SQLite's backup API; the recreation proceeds even when the backup
    fails, so existing data may be lost.
    """
    print("=== Nuclear Option: Backup and Recreate ===")

    if not os.path.exists(DB_PATH):
        print("Database doesn't exist, creating new one...")
        init_database()
        return

    # Create backup
    backup_path = f"{DB_PATH}.backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    try:
        # Try to backup using SQLite's backup command.  Both connections are
        # closed explicitly: the sqlite3 context manager would leave them
        # open, which can prevent the file from being removed below.
        source = sqlite3.connect(DB_PATH, timeout=1)
        try:
            backup = sqlite3.connect(backup_path)
            try:
                source.backup(backup)
            finally:
                backup.close()
        finally:
            source.close()
        print(f"✓ Database backed up to: {backup_path}")
    except Exception as e:
        print(f"✗ Backup failed: {e}")
        print("Proceeding with recreation anyway...")

    # Remove old database
    try:
        os.remove(DB_PATH)
        print("✓ Old database removed")
    except Exception as e:
        print(f"✗ Could not remove old database: {e}")

    # Remove any lock files
    lock_files = [DB_PATH + "-wal", DB_PATH + "-shm", DB_PATH + "-journal"]
    for lock_file in lock_files:
        if os.path.exists(lock_file):
            try:
                os.remove(lock_file)
                print(f"✓ Removed {lock_file}")
            except OSError as e:
                # Report instead of silently swallowing the failure.
                print(f"✗ Could not remove {lock_file}: {e}")

    # Create new database
    init_database()
    print("✓ New database created")

# # Run aggressive unlock
# force_kill_database_connections()
# aggressive_unlock()

# print("\nIf still locked, try the nuclear option:")
# print("nuclear_option()")


In [None]:
# Simple manual unlock (no extra packages needed)
def simple_manual_unlock():
    """Remove SQLite sidecar files next to the database and test a connection.

    Lists every file in the database directory whose name starts with the
    database file name, deletes the ``-wal``, ``-shm`` and ``-journal``
    sidecars if present, then runs ``SELECT 1`` on a fresh connection.

    Returns:
        True when the test query succeeds, False otherwise.
    """
    print("=== Simple Manual Unlock ===")

    # List all database-related files
    db_dir = os.path.dirname(DB_PATH) or "."
    db_name = os.path.basename(DB_PATH)

    print(f"Looking for database files in: {db_dir}")
    print(f"Database name: {db_name}")

    # Find all related files
    related_files = [
        os.path.join(db_dir, file)
        for file in os.listdir(db_dir)
        if file.startswith(db_name)
    ]

    print(f"Found database files: {related_files}")

    # Try to remove lock files manually.  SQLite names its sidecars with a
    # hyphen ("jobly.db-wal"), not a dot, so the dotted names used before
    # never matched anything.
    # NOTE(review): deleting a -wal file that another process is still using
    # can discard committed data; only run this when nothing else has the
    # database open.
    lock_suffixes = ['-wal', '-shm', '-journal']
    for suffix in lock_suffixes:
        lock_file = DB_PATH + suffix
        if os.path.exists(lock_file):
            try:
                os.remove(lock_file)
                print(f"✓ Removed {lock_file}")
            except Exception as e:
                print(f"✗ Could not remove {lock_file}: {e}")

    # Try a simple connection test
    try:
        conn = sqlite3.connect(DB_PATH, timeout=1)
        try:
            conn.execute("SELECT 1")
        finally:
            conn.close()
        print("✓ Database is now accessible!")
        return True
    except Exception as e:
        print(f"✗ Database still locked: {e}")
        return False

def quick_fix():
    """Delete the database with its sidecar files and create a fresh one.

    Returns:
        True when the new database was created, False when the old database
        file could not be removed (nothing else is touched in that case).
    """
    print("=== Quick Fix: Delete and Recreate ===")

    # Remove database file
    if os.path.exists(DB_PATH):
        try:
            os.remove(DB_PATH)
            print(f"✓ Removed {DB_PATH}")
        except Exception as e:
            print(f"✗ Could not remove database: {e}")
            return False

    # Remove any lock files.  SQLite's sidecars are named with a hyphen
    # ("jobly.db-wal"); the dotted names used before never matched, which left
    # the old write-ahead log next to the new database.
    for suffix in ['-wal', '-shm', '-journal']:
        lock_file = DB_PATH + suffix
        if os.path.exists(lock_file):
            try:
                os.remove(lock_file)
                print(f"✓ Removed {lock_file}")
            except OSError as e:
                # Report instead of silently swallowing the failure.
                print(f"✗ Could not remove {lock_file}: {e}")

    # Create new database
    init_database()
    print("✓ New database created")
    return True

# Try the simple approach first
# Runs on every execution of this cell; the True/False result is not used here.
simple_manual_unlock()


In [None]:
# Test database access after unlock
print("=== Testing Database Access ===")

try:
    # Test basic connection.  The connection is closed explicitly because
    # `with sqlite3.connect(...)` only ends the transaction and would leave the
    # handle open in the kernel.
    conn = sqlite3.connect(DB_PATH, timeout=5)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM companies")
        count = cursor.fetchone()[0]
        print(f"✓ Database is accessible! Contains {count} companies.")

        # Test a simple query
        cursor.execute("SELECT name FROM companies LIMIT 3")
        companies = cursor.fetchall()
        if companies:
            print(f"✓ Sample companies: {[c[0] for c in companies]}")
        else:
            print("✓ Database is empty (no companies yet)")
    finally:
        conn.close()

except Exception as e:
    print(f"✗ Database still has issues: {e}")
else:
    # Only claim success when the checks above actually passed.
    print("\nDatabase should now be working! You can proceed with your analysis.")


In [None]:
# Efficient batch migration functions (to prevent database locks)
def batch_migrate_companies(companies_list):
    """Insert many company names in one transaction, skipping existing ones.

    Missing values (NaN/None) are dropped before inserting, and names already
    present are ignored via ``INSERT OR IGNORE``.  The summary line reports
    how many names were submitted and how many rows were actually inserted.

    Args:
        companies_list: Iterable of company names.

    Raises:
        Exception: any failure is printed and re-raised.
    """
    try:
        rows = [(company,) for company in companies_list if pd.notna(company)]

        conn = sqlite3.connect(DB_PATH, timeout=30)
        try:
            # Use INSERT OR IGNORE to avoid duplicates
            conn.executemany("INSERT OR IGNORE INTO companies (name) VALUES (?)", rows)
            conn.commit()
            # total_changes counts rows changed on this connection, i.e. the
            # inserts that were not ignored.
            inserted = conn.total_changes
        finally:
            # The sqlite3 context manager does not close the connection.
            conn.close()

        print(f"✓ Batch migrated {len(rows)} companies ({inserted} newly inserted)")

    except Exception as e:
        print(f"✗ Batch migration failed: {e}")
        raise

def batch_update_analysis(df, analysis_type):
    """Write analysis results from a DataFrame into ``companies`` in one transaction.

    Args:
        df: Frame indexed by company name with the columns ``Message``,
            ``IsErr`` and either ``IsGood`` (for ``'is_good'``) or ``IsLocal``
            (for ``'is_local'``).
        analysis_type: ``'is_good'`` or ``'is_local'``.

    Flag cells may be booleans or strings; strings are compared
    case-insensitively with ``'TRUE'``.  Missing cells are stored as NULL
    rather than being coerced to True.

    Raises:
        ValueError: for an unsupported ``analysis_type`` (printed, then re-raised).
        Exception: any other failure is printed and re-raised.
    """
    flag_columns = {'is_good': 'IsGood', 'is_local': 'IsLocal'}

    def _as_flag(value):
        """Map a CSV cell to True/False, or None when the cell is missing."""
        if pd.isna(value):
            return None
        if isinstance(value, str):
            return value.strip().upper() == 'TRUE'
        return bool(value)

    try:
        if analysis_type not in flag_columns:
            raise ValueError(f"Unknown analysis_type: {analysis_type!r}")
        source_column = flag_columns[analysis_type]

        # The index label of each row is the company name.
        params = [
            (
                _as_flag(row[source_column]),
                None if pd.isna(row['Message']) else row['Message'],
                _as_flag(row['IsErr']),
                company_name,
            )
            for company_name, row in df.iterrows()
        ]

        # analysis_type was validated against the whitelist above, so it is
        # safe to use as a column-name prefix.
        sql = f'''
            UPDATE companies
            SET {analysis_type} = ?, {analysis_type}_msg = ?, {analysis_type}_err = ?,
                updated_at = CURRENT_TIMESTAMP
            WHERE name = ?
        '''

        conn = sqlite3.connect(DB_PATH, timeout=30)
        try:
            conn.executemany(sql, params)
            conn.commit()
        finally:
            # The sqlite3 context manager does not close the connection.
            conn.close()

        print(f"✓ Batch updated {len(df)} {analysis_type} analysis results")

    except Exception as e:
        print(f"✗ Batch update failed: {e}")
        raise

def migrate_csv_data_efficiently(
    first_pass_csv="./output/dave_connection_companies_first_pass.csv",
    second_pass_csv="./output/dave_connection_companies_second_pass.csv",
):
    """Load the two analysis CSVs into the ``companies`` table.

    Each CSV is read with its first column as the index (the company name).
    The names are inserted with ``batch_migrate_companies`` and the results
    written with ``batch_update_analysis``: the first file feeds ``is_good``,
    the second ``is_local``.  A missing file is skipped; any other failure is
    printed and the next file is still attempted.

    Args:
        first_pass_csv: Path of the pro-social (``is_good``) results.
        second_pass_csv: Path of the location (``is_local``) results.
    """
    print("=== Efficient CSV Migration ===")

    passes = (
        ("First pass", first_pass_csv, "pro_social", 'is_good'),
        ("Second pass", second_pass_csv, "location", 'is_local'),
    )

    failed_passes = 0
    for label, csv_path, description, analysis_type in passes:
        try:
            results_df = pd.read_csv(csv_path, index_col=0)
            print(f"Found {len(results_df)} {description} analysis results")

            # Batch insert companies (in case some are missing)
            batch_migrate_companies(results_df.index.tolist())

            # Batch update analysis
            batch_update_analysis(results_df, analysis_type)

        except FileNotFoundError:
            print(f"⚠ {label} CSV not found, skipping...")
        except Exception as e:
            failed_passes += 1
            print(f"✗ {label} migration failed: {e}")

    # Do not report success when a pass failed.
    if failed_passes:
        print("⚠ Migration finished with errors - see messages above")
    else:
        print("✓ Migration complete!")

# Run the efficient migration
# Executes on every run of this cell: reads the two result CSVs under ./output
# (each is skipped when missing) and writes their contents into the companies table.
migrate_csv_data_efficiently()


In [None]:
# Preview the first 100 rows (ordered by name) as the cell's rich output.
get_all_companies().head(100)

In [None]:
# Debug and fix migration issues
def debug_database_migration():
    """Print a side-by-side comparison of the first-pass CSV and the database.

    Shows row counts, ``is_good`` value counts and a three-row sample for
    each source, then how many company names appear in only one of them, and
    finally up to five companies whose ``is_good`` is NULL in the database
    although the CSV has a value for them.  Stops early, with a message,
    when either source cannot be read.
    """
    print("=== Debugging Database Migration ===")

    # --- CSV side -----------------------------------------------------------
    try:
        csv_results = pd.read_csv("./output/dave_connection_companies_first_pass.csv", index_col=0)
        print(f"CSV has {len(csv_results)} companies")
        print(f"CSV is_good values: {csv_results['IsGood'].value_counts()}")
        print("CSV sample:")
        print(csv_results.head(3))
    except Exception as csv_error:
        print(f"Error reading CSV: {csv_error}")
        return

    # --- Database side ------------------------------------------------------
    try:
        db_rows = get_all_companies()
        print(f"\nDatabase has {len(db_rows)} companies")
        print(f"Database is_good values: {db_rows['is_good'].value_counts()}")
        print("Database sample:")
        print(db_rows[['name', 'is_good', 'is_good_msg']].head(3))
    except Exception as db_error:
        print(f"Error reading database: {db_error}")
        return

    # --- Name mismatches ----------------------------------------------------
    names_in_csv = set(csv_results.index)
    names_in_db = set(db_rows['name'])

    print(f"\nCompanies in CSV but not in DB: {len(names_in_csv.difference(names_in_db))}")
    print(f"Companies in DB but not in CSV: {len(names_in_db.difference(names_in_csv))}")

    # --- Rows left without a verdict ----------------------------------------
    unanalysed = db_rows.loc[db_rows['is_good'].isna(), 'name'].tolist()
    print(f"Companies with NULL is_good in DB: {len(unanalysed)}")

    if not unanalysed:
        return

    print("Sample companies with NULL is_good:")
    for company in unanalysed[:5]:
        if company not in csv_results.index:
            continue
        value_in_csv = csv_results.loc[company, 'IsGood']
        print(f"  {company}: CSV={value_in_csv}, DB=NULL")

def fix_migration_data():
    """Re-run the CSV migration and print how the ``is_good`` column is populated."""
    print("=== Fixing Migration Data ===")

    # Re-run the migration with corrected functions
    migrate_csv_data_efficiently()

    # Verify the fix: count rows per is_good state (True / False / NULL).
    print("\n=== Verification ===")
    db_df = get_all_companies()
    flag = db_df['is_good']
    breakdown = (
        ("True", flag == True),
        ("False", flag == False),
        ("NULL", flag.isnull()),
    )
    for label, mask in breakdown:
        print(f"Companies with is_good={label}: {len(db_df[mask])}")

# Run debug first
# Read-only: debug_database_migration() only prints a comparison of the CSV and the database.
debug_database_migration()


In [None]:
# Complete reset and re-migration
def reset_and_remigrate():
    """Empty the ``companies`` table and reload it from the result CSVs.

    The table is only cleared when at least one of the CSV files read by
    ``migrate_csv_data_efficiently`` exists; otherwise the function stops
    before deleting anything, because there would be nothing to restore the
    data from.  After the reload a summary of the ``is_good`` column is
    printed.
    """
    print("=== Complete Reset and Re-migration ===")

    # Same default files that migrate_csv_data_efficiently reads; keep in sync.
    source_csvs = (
        "./output/dave_connection_companies_first_pass.csv",
        "./output/dave_connection_companies_second_pass.csv",
    )
    if not any(os.path.exists(path) for path in source_csvs):
        print("✗ No migration CSVs found - leaving existing data untouched")
        return

    # Clear all data from companies table
    try:
        conn = sqlite3.connect(DB_PATH, timeout=30)
        try:
            conn.execute("DELETE FROM companies")
            conn.commit()
        finally:
            # The sqlite3 context manager does not close the connection.
            conn.close()
        print("✓ Cleared all existing data")
    except Exception as e:
        print(f"✗ Error clearing data: {e}")
        return

    # Re-migrate with corrected functions
    print("\nRe-migrating data...")
    migrate_csv_data_efficiently()

    # Verify results
    print("\n=== Final Verification ===")
    db_df = get_all_companies()
    if 'is_good' not in db_df.columns:
        # get_all_companies falls back to an empty frame when the read fails.
        print("✗ Could not read companies back from the database")
        return

    null_rows = db_df[db_df['is_good'].isnull()]

    print(f"Total companies in database: {len(db_df)}")
    print(f"Companies with is_good=True: {len(db_df[db_df['is_good'] == True])}")
    print(f"Companies with is_good=False: {len(db_df[db_df['is_good'] == False])}")
    print(f"Companies with is_good=NULL: {len(null_rows)}")

    if len(null_rows) > 0:
        print("⚠ Still have NULL values - there may be an issue")
        null_companies = null_rows['name'].head(5).tolist()
        print(f"Sample NULL companies: {null_companies}")
    else:
        print("✓ All companies have is_good values!")

# Run the complete reset and re-migration
# WARNING: executing this cell deletes every row of the companies table
# (DELETE FROM companies) before re-importing from the CSV files.
reset_and_remigrate()


In [None]:
# Stop any running migration and clean up
def stop_migration_and_cleanup():
    """Recover from a stuck migration without discarding committed data.

    The database runs in WAL mode, so recently committed rows may live only in
    the ``-wal`` file. Deleting that file outright (as this helper used to do)
    throws those rows away and can corrupt the database. Instead this function:

      1. Opens a short-lived connection and runs ``PRAGMA wal_checkpoint(TRUNCATE)``
         so that everything in the WAL is written into the main database file.
      2. Only if that checkpoint completed cleanly, removes any leftover
         ``-wal`` / ``-shm`` / ``-journal`` files (normally SQLite has already
         removed them when the last connection closed).
      3. Verifies the database can be opened and queried.

    If another connection is still holding the database, the sidecar files are
    left untouched and the user is told to restart the kernel, which is the
    safe way to release a leaked connection.

    Returns:
        None. Progress is reported through ``print``.
    """
    print("=== Stopping Migration and Cleaning Up ===")

    # Step 1: move committed WAL content into the main database file.
    checkpoint_clean = False
    try:
        conn = sqlite3.connect(DB_PATH, timeout=5)
        try:
            # Result row is (busy, wal_frames, checkpointed_frames); busy != 0
            # means another connection prevented the checkpoint from finishing.
            row = conn.execute("PRAGMA wal_checkpoint(TRUNCATE)").fetchone()
        finally:
            # "with sqlite3.connect(...)" only commits; close explicitly so this
            # helper never becomes the connection that keeps the database busy.
            conn.close()
        checkpoint_clean = row is None or row[0] == 0
        if checkpoint_clean:
            print("✓ Checkpointed WAL into the main database file")
        else:
            print("✗ Checkpoint blocked by another open connection")
    except Exception as e:
        print(f"✗ Could not checkpoint database: {e}")

    # Step 2: remove leftover sidecar files, but only when nothing can be lost.
    if checkpoint_clean:
        lock_files = [DB_PATH + "-wal", DB_PATH + "-shm", DB_PATH + "-journal"]
        for lock_file in lock_files:
            if os.path.exists(lock_file):
                try:
                    os.remove(lock_file)
                    print(f"✓ Removed {lock_file}")
                except Exception as e:
                    print(f"✗ Could not remove {lock_file}: {e}")
    else:
        print("⚠ Leaving -wal/-shm/-journal files in place to avoid losing committed data")

    # Step 3: test database access
    try:
        conn = sqlite3.connect(DB_PATH, timeout=5)
        try:
            conn.execute("SELECT 1")
        finally:
            conn.close()
        print("✓ Database is accessible")
    except Exception as e:
        print(f"✗ Database still locked: {e}")
        print("Try restarting your kernel (Kernel -> Restart)")

# Run cleanup
# NOTE: executes immediately when the cell runs and may remove the database's
# sidecar files (-wal / -shm / -journal) next to DB_PATH.
stop_migration_and_cleanup()


In [None]:
# Load the connections export. skiprows=2 skips the first two lines of the file
# before the header row; rows without a Company value are dropped.
connections_df = (
    pd.read_csv("./input/Connections.csv", skiprows=2)
    .dropna(subset=['Company'])
)

# Distinct company names, in order of first appearance.
companies = connections_df['Company'].unique()
print(f"Successfully read-in connections. Total unique companies: {len(companies)}")

In [None]:
# Read the Anthropic API key from a local text file (kept out of the notebook).
# The context manager closes the file handle, and strip() removes the trailing
# newline most editors append - a newline is not valid inside the HTTP header
# that carries the key.
with open("./anthropic_api_key.txt", "r") as api_key_file:
    anthropic_api_key = api_key_file.read().strip()

client = anthropic.Anthropic(api_key=anthropic_api_key)

In [None]:
def query_claude(prompt, use_search=False, model="claude-sonnet-4-20250514",
                 max_tokens=1024, temperature=1, max_search_uses=1):
    """Send a single user prompt to Claude and return the raw API response.

    Args:
        prompt: Text sent as the only user message.
        use_search: When True, the web-search tool is offered to the model.
        model: Model identifier passed to the API.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature passed to the API.
        max_search_uses: ``max_uses`` value for the web-search tool; only
            relevant when ``use_search`` is True.

    Returns:
        Whatever ``client.beta.messages.create`` returns (callers in this
        notebook read ``.content`` and ``.usage`` from it).

    The defaults reproduce the values that were previously hard-coded, so
    existing calls behave exactly as before.
    """
    tools = []
    if use_search:
        tools.append(
            {
                "name": "web_search",
                "type": "web_search_20250305",
                "max_uses": max_search_uses
            }
        )

    return client.beta.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ],
        tools=tools
    )

In [None]:
def _parse_bool_reply(reply_text):
    """Interpret a model reply as ``(value, is_err)``.

    Surrounding whitespace, quotes, markdown emphasis and trailing punctuation
    are ignored, so ``"TRUE."``, ``"TRUE\\n"`` and ``'"FALSE"'`` are accepted.
    A reply that is exactly the word in any letter case (``"True"``) is also
    accepted. Anything else yields ``(False, True)``.
    """
    cleaned = reply_text.strip().strip('"\'`*.!').strip()
    if cleaned.endswith('TRUE') or cleaned.upper() == 'TRUE':
        return True, False
    if cleaned.endswith('FALSE') or cleaned.upper() == 'FALSE':
        return False, False
    return False, True


def loop_through_companies(companies, prompt_template, bool_col_name='IsGood'):
    """Ask Claude one TRUE/FALSE question per company and tabulate the answers.

    Args:
        companies: Sized iterable of company names.
        prompt_template: Prompt text containing a ``{company_name}`` placeholder.
        bool_col_name: Name of the boolean result column.

    Returns:
        DataFrame indexed by ``Company`` with columns ``bool_col_name``
        (parsed verdict, False when unparseable), ``Message`` (the raw reply
        text, or ``"ERROR: ..."`` when the request failed) and ``IsErr``
        (True when the reply could not be parsed or the request failed).

    A failed request is recorded as an error row instead of aborting the loop,
    so one bad call does not discard the results already collected.
    """
    rows = []
    total_input_tokens = 0
    total_output_tokens = 0
    total = len(companies)

    for counter, company_name in enumerate(companies, start=1):
        formatted_prompt = prompt_template.format(company_name=company_name)
        input_tokens = 0
        output_tokens = 0

        try:
            message = query_claude(formatted_prompt)
            # Use the last block that actually carries text; the final content
            # block is not guaranteed to be a text block.
            text_blocks = [
                block.text for block in message.content
                if isinstance(getattr(block, 'text', None), str)
            ]
            final_text = text_blocks[-1] if text_blocks else ''
            input_tokens = message.usage.input_tokens
            output_tokens = message.usage.output_tokens
            bool_col, is_err = _parse_bool_reply(final_text)
        except Exception as exc:
            final_text = f"ERROR: {exc}"
            bool_col, is_err = False, True

        rows.append([company_name, bool_col, final_text, is_err])
        total_input_tokens += input_tokens
        total_output_tokens += output_tokens

        print(f"{counter:3d}/{total}: Processed company: {company_name}. Input tokens: {input_tokens}; Output tokens: {output_tokens}; Error: {is_err}")

    print(f"Finished {total} companies. Total input tokens: {total_input_tokens}; Total output tokens: {total_output_tokens}")

    # Build the frame once instead of appending row by row.
    result_df = pd.DataFrame(rows, columns=['Company', bool_col_name, 'Message', 'IsErr'])
    return result_df.set_index('Company')

In [None]:
# First-pass prompt: asks whether a company has a pro-social mission.
# {company_name} is filled in per company by loop_through_companies, which
# expects the reply to end with TRUE or FALSE.
good_prompt = """
        I am looking for jobs. I am a data scientist looking for a company or non-profit working on a pro-social mission. 
        Some example cause areas: climate change, healthcare, preserving democracy, wealth inequality, education. 
        But I am interested in any others that are for the benefit of the greater good.
        Can you please let me know if this company fits that above description: {company_name}. 
        If it does meet this conditions, just reply "TRUE". If it does not, just reply "FALSE". Do not respond with the reasoning for this decision, 
        simply respond "TRUE" or "FALSE".
    """

# Runs one API request per unique company. The returned frame is indexed by
# Company and has the columns IsGood, Message and IsErr.
result_df = loop_through_companies(companies, good_prompt, bool_col_name='IsGood')
result_df

In [None]:
# Reload the saved first-pass results, replacing the result_df produced by the loop.
# NOTE: read without index_col, so this frame has a default integer index and the
# following cell expects 'Company' as a regular column - unlike the Company-indexed
# frame returned by loop_through_companies.
result_df = pd.read_csv("./output/dave_connection_companies_first_pass.csv")

In [None]:
# Persist every first-pass result to the database: make sure the company row
# exists, then record its 'is_good' analysis (value, raw message, error flag).
# NOTE(review): update_company_analysis is defined outside this view; presumably
# it writes the is_good / is_good_msg / is_good_err columns - confirm.
for index, row in result_df.iterrows():
    get_or_create_company(row['Company'])
    update_company_analysis(row['Company'], 'is_good', row['IsGood'], row['Message'], row['IsErr'])
# Display the table contents after the updates.
get_all_companies()

In [None]:
# Manual entry: record 'Scope3' as is_good=True with message 'TRUE' and no error
# flag, independent of the model output, then display the table.
get_or_create_company('Scope3')
update_company_analysis('Scope3', 'is_good', True, 'TRUE', False)
get_all_companies()

In [None]:
# Display the companies table as returned by get_all_companies().
get_all_companies()

In [None]:
# Keep only the companies the first pass marked as good.
good_df = result_df.loc[result_df['IsGood'] == True]

# result_df can arrive in two shapes: indexed by Company (straight from
# loop_through_companies) or with Company as a regular column and an integer
# index (reloaded from the CSV). Reading the index in the second case would
# yield row numbers instead of names, so take the names from wherever they live.
if 'Company' in good_df.columns:
    good_companies = good_df['Company'].tolist()
else:
    good_companies = good_df.index.tolist()
print(f"Good companies: {len(good_companies)}")

In [None]:
# Second-pass prompt: asks whether a company is fully remote or based in Colorado.
# {company_name} is filled in per company by loop_through_companies.
local_prompt = """
        Can you please let me know if the company {company_name} is either fully remote or based in Colorado?
        I live in Colorado and can only work for a company that is based in Colorado or fully remote.
        If it is remote or in Colorado, just reply "TRUE". If it is neither, just reply "FALSE". 
        Do not respond with the reasoning for this decision, simply respond "TRUE" or "FALSE".
    """

# Only the companies that passed the first filter are queried.
# NOTE(review): loop_through_companies calls query_claude without use_search, so
# these location answers come from the model alone, not from a web search.
loc_result_df = loop_through_companies(good_companies, local_prompt, bool_col_name='IsLocal')
loc_result_df

In [None]:
# Reload the saved second-pass results, replacing the loc_result_df produced by
# the loop; the first CSV column is used as the index.
loc_result_df = pd.read_csv("./output/dave_connection_companies_second_pass.csv", index_col=0)

In [None]:
# Display the rows whose IsLocal value is True.
loc_result_df.loc[loc_result_df['IsLocal'] == True]

In [None]:
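# Save both result frames to CSV (presumably left commented out so that a re-run
# does not overwrite the saved files).
# NOTE(review): these saves sit after the read_csv cells that load the same files,
# so on a fresh kernel the CSVs must already exist, or these lines have to be run
# right after the corresponding loop cells.
# loc_result_df.to_csv("./output/dave_connection_companies_second_pass.csv")
# result_df.to_csv("./output/dave_connection_companies_first_pass.csv", index=False)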