In [1]:
# Imports & Configuration
import pandas as pd
from ollama import Client
from ddgs import DDGS
import csv
import time
import re
import os
from tqdm.notebook import tqdm

# Enable Series.progress_apply / DataFrame.progress_apply (tqdm integration).
tqdm.pandas()

# --- Pipeline configuration ---
MODEL_NAME = "gemini-3-flash-preview:cloud"  # model id passed to client.chat
INPUT_FILE = "overbase_list.csv"             # CSV read by the load step
OUTPUT_FILE = "processed_data.csv"           # CSV written by the save step
BATCH_SIZE = 10                              # max rows kept for processing

# Ollama client talking to the server at localhost:11434.
client = Client(host="http://localhost:11434")

print("[INFO] System Initialized.")
print(f"[INFO] AI Model: {MODEL_NAME}")
print(f"[INFO] Batch Size: {BATCH_SIZE}")

[INFO] System Initialized.
[INFO] AI Model: gemini-3-flash-preview:cloud
[INFO] Batch Size: 10


In [2]:
# Helper Functions (Search, LLM, Regex)

# In-memory cache mapping a company name to the domain resolved for it
# (possibly "n/a"), so repeated companies skip the web search and LLM call.
COMPANY_DOMAIN_CACHE = {}

def search_web(query, max_results=3):
    """Search DuckDuckGo and return the result snippets as a bulleted string.

    Args:
        query: Search query text.
        max_results: Maximum number of results to request.

    Returns:
        One "- <snippet>" line per result that has a non-empty ``body``
        field, joined with newlines. An empty string when there are no
        results or when the search fails for any reason (best-effort).
    """
    import logging  # local import: only needed for the failure path below

    try:
        time.sleep(1)  # Rate limiting between consecutive searches
        results = DDGS().text(query, max_results=max_results)
        if not results:
            return ""
        # A result without a 'body' field is skipped instead of raising
        # KeyError and discarding every other snippet in the batch.
        snippets = (r.get("body") for r in results)
        return "\n".join(f"- {s}" for s in snippets if s)
    except Exception as exc:
        # Deliberately best-effort: a failed search must not abort the
        # pipeline. The cause is recorded at DEBUG level so the notebook
        # output stays clean while the failure remains discoverable.
        logging.getLogger(__name__).debug("search_web(%r) failed: %s", query, exc)
        return ""

def ask_ollama(prompt, context=""):
    """Send one user message to the configured Ollama model and return its reply.

    ``context`` and ``prompt`` are embedded in a fixed template that asks the
    model for a bare answer. The reply text is returned with surrounding
    whitespace removed.

    Any exception raised while calling the client or reading the reply is
    converted into the string ``"Error: <exception>"`` instead of propagating,
    so callers always receive a string.
    """
    message_text = f"""
    Context Data:
    {context}
    
    Instruction:
    {prompt}
    
    Output Rules:
    - Return ONLY the requested answer.
    - No markdown formatting, no intro/outro text.
    """
    try:
        reply = client.chat(
            model=MODEL_NAME,
            messages=[{'role': 'user', 'content': message_text}],
        )
        answer = reply['message']['content']
        return answer.strip()
    except Exception as exc:
        return f"Error: {exc}"

def extract_domain_with_regex(text):
    """Extract the first domain-like token (e.g. 'google.com') from ``text``.

    The text is lower-cased before matching, so the result is lower-case.
    A leading ``www.`` host label is dropped when something domain-like
    remains after it, because the result is used as an email domain.

    Args:
        text: Free text expected to contain a domain (e.g. an LLM reply).
            Non-string or empty input is treated as "no domain".

    Returns:
        The first matching domain, or the sentinel ``"n/a"`` when none is found.
    """
    if not isinstance(text, str) or not text:
        return "n/a"

    # Labels follow DNS rules (1-63 chars, no leading/trailing hyphen). The
    # final label allows up to 63 letters so long TLDs such as '.technology'
    # are not rejected.
    pattern = r'\b(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,63}\b'
    match = re.search(pattern, text.lower())
    if match is None:
        return "n/a"

    domain = match.group(0)
    # 'www.example.com' -> 'example.com', but leave 'www.com' untouched.
    if domain.startswith("www.") and domain.count(".") >= 2:
        domain = domain[len("www."):]
    return domain

In [3]:
# Load & Clean Data
def clean_and_load_data(filepath):
    """Read the lead CSV and return its usable rows as a DataFrame.

    The first line is treated as a header and skipped. Field positions depend
    on the row width: in 4-column rows the company is the third field, in
    rows with 5 or more columns it is the fourth; name and title are always
    the first two fields. Narrower rows are dropped. Rows whose title or
    company is blank or a placeholder ('-', 'n/a', 'none', ...) are dropped.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        DataFrame with the columns ``Name``, ``Title`` and ``Company``
        (whitespace-stripped strings). The columns are present even when the
        file is missing or no row survives cleaning, so callers can always
        select them. A missing file is reported with an ``[ERROR]`` print
        rather than an exception.
    """
    COLUMNS = ['Name', 'Title', 'Company']
    INVALID_VALUES = {'', '-', '—', '–', 'n/a', 'nan', 'none'}
    valid_rows = []

    try:
        # newline='' is what the csv module expects: it lets the reader
        # handle line endings and quoted fields with embedded newlines itself.
        with open(filepath, 'r', encoding='utf-8', errors='replace', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header line

            for row in reader:
                # Logic to handle varying CSV column counts
                if len(row) == 4:
                    name, title, company = row[0], row[1], row[2]
                elif len(row) >= 5:
                    name, title, company = row[0], row[1], row[3]
                else:
                    continue  # blank or too-short row

                name, title, company = name.strip(), title.strip(), company.strip()

                # '' is part of INVALID_VALUES, so blank fields are rejected too.
                if title.lower() in INVALID_VALUES or company.lower() in INVALID_VALUES:
                    continue

                valid_rows.append({'Name': name, 'Title': title, 'Company': company})

    except FileNotFoundError:
        print(f"[ERROR] '{filepath}' not found.")

    return pd.DataFrame(valid_rows, columns=COLUMNS)

print("[STEP 1] Loading and Cleaning Data...")

# Load the cleaned rows, drop exact duplicates, then cap the frame at
# BATCH_SIZE rows for the demo run.
df = clean_and_load_data(INPUT_FILE).drop_duplicates()
if len(df) > BATCH_SIZE:
    df = df.iloc[:BATCH_SIZE].copy()

# Name Splitting Logic
def split_name(full_name):
    """Split a full name into a first name and the remaining name parts.

    The name is split on whitespace: the first token is the first name and
    all remaining tokens, re-joined with single spaces, form the last name.

    Args:
        full_name: Name to split. Non-string values are converted with
            ``str()``; ``None``, ``pd.NA`` and float NaN are treated as an
            empty name.

    Returns:
        ``pd.Series([first, last])``. Both entries are empty strings for an
        empty, whitespace-only or missing name, and ``last`` is empty for a
        single-token name.
    """
    is_missing = (
        full_name is None
        or full_name is pd.NA
        or (isinstance(full_name, float) and full_name != full_name)  # NaN
    )
    parts = [] if is_missing else str(full_name).split()

    # Guard the empty case: indexing parts[0] would raise IndexError.
    first = parts[0] if parts else ""
    last = " ".join(parts[1:])
    return pd.Series([first, last])

if df.empty:
    # Without rows the name columns are never created, so selecting them for
    # the preview would raise KeyError. Report and skip instead.
    print("[WARN] No rows loaded; skipping name split and preview.")
else:
    df[['First_Name', 'Last_Name']] = df['Name'].apply(split_name)

    print(f"[SUCCESS] Data loaded. Rows to process: {len(df)}")
    print("Data Preview (Name Split):")
    display(df[['Name', 'First_Name', 'Last_Name', 'Title']].head(3))

[STEP 1] Loading and Cleaning Data...
[SUCCESS] Data loaded. Rows to process: 10
Data Preview (Name Split):


Unnamed: 0,Name,First_Name,Last_Name,Title
0,Donna Johnson,Donna,Johnson,Chief Marketing Officer
1,Andreas Urschitz,Andreas,Urschitz,CMO
2,Julia Chen,Julia,Chen,Vice President


In [4]:
# Filter Senior Executives
if not df.empty:
    print("\n[STEP 2] Filtering Senior Executives (AI Analysis)...")

    def is_senior(title):
        """Ask the LLM whether ``title`` is a senior-executive role.

        Returns True only when the first standalone YES/NO word in the reply
        is YES. A plain substring test would also match 'YES' inside other
        words or inside the text of an error message, so failed LLM calls
        (replies starting with "Error:") are treated as not senior.
        """
        prompt = f"Is '{title}' a Senior Executive role (C-Level, Director, VP, Founder)? Answer strictly YES or NO."
        res = ask_ollama(prompt)
        if res.startswith("Error:"):
            return False
        verdict = re.search(r'\b(YES|NO)\b', res.upper())
        return verdict is not None and verdict.group(1) == "YES"

    # Using progress_apply to show the bar
    df['Is_Senior'] = df['Title'].progress_apply(is_senior)

    # Keep only the rows the model classified as senior.
    original_count = len(df)
    df_senior = df[df['Is_Senior']].copy()

    print("[RESULT] Filtering Complete.")
    print(f"   - Input Rows: {original_count}")
    print(f"   - Qualified Leads: {len(df_senior)}")

    display(df_senior[['Name', 'Title', 'Is_Senior']].head())
    df = df_senior  # Later cells keep working on `df`
else:
    print("[WARN] No data to process.")


[STEP 2] Filtering Senior Executives (AI Analysis)...


  0%|          | 0/10 [00:00<?, ?it/s]

[RESULT] Filtering Complete.
   - Input Rows: 10
   - Qualified Leads: 9


Unnamed: 0,Name,Title,Is_Senior
0,Donna Johnson,Chief Marketing Officer,True
1,Andreas Urschitz,CMO,True
2,Julia Chen,Vice President,True
3,Robert Occhialini,Chief Technology Officer,True
4,Gabriel Romero,Chief Marketing Officer,True


In [5]:
# Verify Employment & Get Domain
def enrich_employment_and_domain(row):
    """Check a lead's current employment and resolve the company's domain.

    Args:
        row: Mapping-like row providing ``Name``, ``Company`` and ``Title``.

    Returns:
        ``pd.Series([still_working, domain])`` where ``still_working`` is the
        raw LLM reply to the employment question (or an "Error: ..." string
        if the call failed) and ``domain`` is the extracted domain or "n/a".

    Side effects:
        Successfully processed domain lookups are stored in
        ``COMPANY_DOMAIN_CACHE`` keyed by company name. Failed LLM calls are
        not cached, so a later row for the same company retries the lookup.
    """
    name, company_name, title = row['Name'], row['Company'], row['Title']

    # --- A. Employment Verification ---
    search_query = f"{name} {company_name} {title} 2025 LinkedIn"
    work_context = search_web(search_query)

    verify_prompt = f"Based on the search results, is {name} still working at {company_name} as of 2025? Answer 'Yes', 'Likely No', or 'Uncertain'."
    still_working = ask_ollama(verify_prompt, work_context)

    # --- B. Domain Extraction ---
    if company_name in COMPANY_DOMAIN_CACHE:
        domain = COMPANY_DOMAIN_CACHE[company_name]
    else:
        domain_ctx = search_web(f"official website domain for {company_name}")
        raw_domain = ask_ollama("Identify the main official website domain (e.g. apple.com) from these results.", domain_ctx)
        if raw_domain.startswith("Error:"):
            # The reply is an error message, not an answer: running the
            # domain regex over it could pick up a host name from the error
            # text. Report "n/a" and leave the cache untouched.
            domain = "n/a"
        else:
            domain = extract_domain_with_regex(raw_domain)
            COMPANY_DOMAIN_CACHE[company_name] = domain

    return pd.Series([still_working, domain])

if not df.empty:
    print("\n[STEP 3] Verifying Employment & Extracting Domains...")

    # Slowest stage (web searches plus LLM calls per row), hence the progress bar.
    enrichment = df.progress_apply(enrich_employment_and_domain, axis=1)
    df[['Employment_Status', 'Domain']] = enrichment

    print("[RESULT] Enrichment Complete. Updated Data:")
    preview_cols = ['Name', 'Company', 'Employment_Status', 'Domain']
    display(df[preview_cols].head())


[STEP 3] Verifying Employment & Extracting Domains...


  0%|          | 0/9 [00:00<?, ?it/s]

[RESULT] Enrichment Complete. Updated Data:


Unnamed: 0,Name,Company,Employment_Status,Domain
0,Donna Johnson,Inseego,Yes,inseego.com
1,Andreas Urschitz,Infineon,Yes,infineon.com
2,Julia Chen,AWS,Uncertain,aws.amazon.com
3,Robert Occhialini,World Surf League,Uncertain,worldsurfleague.com
4,Gabriel Romero,AllCloud,Yes,allcloud.in


In [6]:
# Generate Emails
def generate_emails(row):
    """Produce the two most likely email addresses for a lead.

    Args:
        row: Mapping-like row providing ``Domain``, ``Company``,
            ``First_Name`` and ``Last_Name``.

    Returns:
        ``pd.Series([email1, email2])``. Each entry is an address found in the
        LLM reply, or "N/A" when fewer addresses were returned or the row has
        no usable domain. Both entries are "Error" when the LLM call failed.
    """
    domain = row['Domain']
    # Missing domain: empty string, the 'n/a' sentinel, or a non-string such
    # as None/NaN.
    if not isinstance(domain, str) or not domain or domain == 'n/a':
        return pd.Series(["N/A", "N/A"])

    # 1. Search for pattern
    pattern_ctx = search_web(f"email address format for {row['Company']} {domain} contact")

    # 2. Ask Ollama to generate
    prompt = f"""
    Based on the context, generate the 2 most likely email addresses for:
    Name: {row['First_Name']} {row['Last_Name']}
    Company Domain: {domain}
    
    Strict Output Format: email1, email2
    """

    response = ask_ollama(prompt, pattern_ctx)
    if response.startswith("Error:"):
        # ask_ollama reports failures as "Error: ..." strings.
        return pd.Series(["Error", "Error"])

    # Extract addresses directly rather than splitting on commas, so labels,
    # numbering or trailing punctuation around an address are ignored.
    found = re.findall(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+', response)
    emails = list(dict.fromkeys(found))  # drop repeats, keep first-seen order

    email1 = emails[0] if emails else "N/A"
    email2 = emails[1] if len(emails) > 1 else "N/A"
    return pd.Series([email1, email2])

if not df.empty:
    print("\n[STEP 4] Generating Contact Information...")

    # One (email1, email2) pair per lead, written back as two new columns.
    email_guesses = df.progress_apply(generate_emails, axis=1)
    df[['Email_1', 'Email_2']] = email_guesses

    print("[SUCCESS] Pipeline Finalized.")
    display(df[['Name', 'Domain', 'Email_1', 'Email_2']].head())


[STEP 4] Generating Contact Information...


  0%|          | 0/9 [00:00<?, ?it/s]

[SUCCESS] Pipeline Finalized.


Unnamed: 0,Name,Domain,Email_1,Email_2
0,Donna Johnson,inseego.com,djohnson@inseego.com,Donna.Johnson@inseego.com
1,Andreas Urschitz,infineon.com,andreas.urschitz@infineon.com,a.urschitz@infineon.com
2,Julia Chen,aws.amazon.com,julia.chen@aws.amazon.com,jchen@aws.amazon.com
3,Robert Occhialini,worldsurfleague.com,rocchialini@worldsurfleague.com,robertocchialini@worldsurfleague.com
4,Gabriel Romero,allcloud.in,gabriel.romero@allcloud.in,gromero@allcloud.in


In [7]:
# Save Results
if not df.empty:
    # Export only the deliverable columns that exist in the frame, in this order.
    export_order = ('Name', 'Title', 'Company', 'Domain', 'Employment_Status', 'Email_1', 'Email_2')
    available = [col for col in export_order if col in df.columns]

    df.loc[:, available].to_csv(OUTPUT_FILE, index=False)
    print(f"[INFO] Validated leads saved to: {os.path.abspath(OUTPUT_FILE)}")

[INFO] Validated leads saved to: /home/h4636oh/Desktop/Projects/overbase-assignment/processed_data.csv
