In [1]:
from datetime import datetime
from dotenv import load_dotenv
import os


from pathlib import Path

In [3]:
# Load variables from the 'api.env' file; the path is relative to the
# current working directory.
env_path = Path('.') / 'api.env'
load_dotenv(dotenv_path=env_path)

True

In [7]:
# SerpAPI key taken from the SERPAPI_KEY environment variable
# (os.getenv yields None when the variable is not set).
SERPAPI_API_KEY = os.getenv("SERPAPI_KEY")

In [27]:
import warnings
warnings.filterwarnings("ignore")

from serpapi import GoogleSearch
from dotenv import load_dotenv
import os
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
import json
from langchain_ollama import ChatOllama
from langchain_core.prompts import (SystemMessagePromptTemplate, 
                                    HumanMessagePromptTemplate,
                                    ChatPromptTemplate)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import SystemMessage, HumanMessage

# Load environment variables from 'api.env' (path is relative to the current
# working directory) and read the SerpAPI key from SERPAPI_KEY.
# os.getenv yields None when the variable is not set.
load_dotenv(dotenv_path=Path('.') / 'api.env')
SERPAPI_API_KEY = os.getenv("SERPAPI_KEY")

# Initialize Ollama LLM
# NOTE(review): presumably a locally running Ollama server with the "llama3"
# model already available -- confirm against the deployment setup.
base_url = "http://localhost:11434"
model = "llama3"
# Shared chat client; process_profile_with_ai() calls llm.invoke() on it.
llm = ChatOllama(base_url=base_url, model=model)

def search_linkedin_profiles(company, job_title, role_tag=None, country=None, state=None, num_results=5):
    """Find LinkedIn profile URLs through a SerpAPI Google search.

    The query is ``<company> <job_title>`` followed by the optional role tag
    in double quotes, a ``site:linkedin.com/in/`` restriction, and the
    optional country and state, each in double quotes.

    Args:
        company: Company name placed at the start of the query.
        job_title: Job title placed after the company name.
        role_tag: Optional extra phrase, quoted in the query when truthy.
        country: Optional country, quoted after the site filter when truthy.
        state: Optional state, quoted after the country when truthy.
        num_results: Result count requested from the API; also caps the
            length of the returned list.

    Returns:
        A list with at most ``num_results`` values taken from the ``"link"``
        field of the organic results (empty when there are none).
    """
    quoted_role = f' "{role_tag}"' if role_tag else ""
    quoted_places = "".join(f' "{place}"' for place in (country, state) if place)
    search_query = f'{company} {job_title}{quoted_role} site:linkedin.com/in/{quoted_places}'

    response = GoogleSearch({
        "engine": "google",
        "q": search_query,
        "api_key": SERPAPI_API_KEY,
        "num": num_results,
        "hl": "en",
        "gl": "us",
    }).get_dict()

    # Responses without an "organic_results" entry simply yield no links.
    organic_hits = response["organic_results"] if "organic_results" in response else []
    profile_urls = [hit["link"] for hit in organic_hits if "link" in hit]

    return profile_urls[:num_results]

def clean_text(text):
    """Collapse noisy whitespace in scraped text.

    The substitutions run in this order:
      1. runs of newlines become a single newline,
      2. runs of tabs become a single tab,
      3. a tab followed by whitespace becomes a single space,
      4. a newline followed by whitespace becomes a single newline.
    Leading and trailing whitespace is stripped from the final result.
    """
    import re

    substitutions = (
        (r'\n+', '\n'),
        (r'\t+', '\t'),
        (r'\t\s+', ' '),
        (r'\n\s+', '\n'),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def process_profile_with_ai(profile_data, profile_number, total_profiles):
    """Send one scraped profile to the LLM and wrap the reply in a result dict.

    Args:
        profile_data: Scraped profile dict. It is serialized with
            ``json.dumps`` for the prompt and its ``'name'`` entry is used in
            the progress messages.
        profile_number: Position of this profile, shown in the progress line.
        total_profiles: Total number of profiles, shown in the progress line.

    Returns:
        A dict with the keys ``'original_profile'`` (the input dict),
        ``'ai_structured_content'`` (the model reply, or an error message)
        and ``'processing_status'`` (``'success'`` or ``'error'``).
    """
    try:
        print(f"🤖 AI Processing [{profile_number}/{total_profiles}]: {profile_data['name']}")

        # Instruction text sent as the system message.
        system_message = (
            "You are provided with LinkedIn profile data in JSON format.\n"
            "Parse the data according to the specified schema, correct any spelling errors,\n"
            "and condense the information if possible. Extract information accurately from the provided data."
        )

        # One "<Field>:" line per entry, in the order the model must follow.
        schema_fields = (
            "Name", "Headline", "About", "Experience", "License",
            "Education", "Skills", "Projects", "Publications", "Summary",
        )
        schema_block = "\n".join(f"{field}:" for field in schema_fields)

        # Human message: the profile as pretty-printed JSON plus the schema.
        human_message = (
            "### LinkedIn Profile JSON Data:\n"
            f"{json.dumps(profile_data, indent=2)}\n"
            "\n"
            "### Schema You need to follow:\n"
            "You need to extract the following information in this exact format:\n"
            "\n"
            f"{schema_block}\n"
            "\n"
            "Do not return preambles or any other information. Only return the structured data as requested.\n"
            "\n"
            "### Parsed Data:"
        )

        reply = llm.invoke([
            SystemMessage(content=system_message),
            HumanMessage(content=human_message),
        ])

        outcome = {
            'original_profile': profile_data,
            'ai_structured_content': reply.content,
            'processing_status': 'success',
        }

        print(f"  ✅ AI processing completed for: {profile_data['name']}")
        return outcome

    except Exception as error:
        # Best effort: any failure is reported and folded into the result so
        # the remaining profiles can still be processed.
        print(f"  ❌ AI processing failed for {profile_data['name']}: {error!s}")
        return {
            'original_profile': profile_data,
            'ai_structured_content': f"AI Processing Error: {error!s}",
            'processing_status': 'error',
        }

def process_all_profiles_with_ai(all_profile_data):
    """Run every scraped profile through the LLM, one at a time.

    Args:
        all_profile_data: List of scraped profile dicts.

    Returns:
        A list with one result per input profile, in input order, each being
        whatever ``process_profile_with_ai`` returned for it.
    """
    total = len(all_profile_data)
    print(f"\n🤖 Starting AI processing for {total} profiles...")

    ai_processed_profiles = []

    for idx, profile in enumerate(all_profile_data, start=1):
        ai_processed_profiles.append(process_profile_with_ai(profile, idx, total))

        # Pause between calls (but not after the last one) to avoid
        # overwhelming the model.
        if idx < total:
            print("  ⏳ Waiting 2 seconds before next AI processing...")
            sleep(2)

    return ai_processed_profiles


def remove_duplicates(text):
    """Collapse lines whose content is the same string written twice.

    A line longer than one character whose first half equals its second half
    (for example ``"ExperienceExperience"``) is replaced by that half; every
    other line is kept unchanged.

    Args:
        text: Text with lines separated by ``'\\n'``.

    Returns:
        The text with doubled lines collapsed, joined with ``'\\n'``.
    """
    deduped_lines = []
    for line in text.split('\n'):
        half = len(line) // 2
        # Odd-length lines can never match: the two slices differ in length.
        if len(line) > 1 and line[:half] == line[half:]:
            deduped_lines.append(line[:half])
        else:
            deduped_lines.append(line)
    return '\n'.join(deduped_lines)

def _text_or_default(element, default):
    """Return the stripped text of a parsed element, or *default* when it is missing."""
    return element.get_text().strip() if element else default


def scrape_linkedin_profile(driver, url):
    """Scrape one LinkedIn profile page into a plain dict.

    The page is loaded through the given Selenium driver and parsed with
    BeautifulSoup. Name, headline and location are looked up individually,
    each falling back to a "... not found" placeholder. The text of every
    ``<section>`` inside ``<main>`` is cleaned with ``clean_text`` and
    ``remove_duplicates``; empty sections are skipped.

    Args:
        driver: Selenium WebDriver used to load the page.
            NOTE(review): presumably already logged in to LinkedIn -- confirm
            with the caller.
        url: Profile URL to load.

    Returns:
        A dict with the keys ``'name'``, ``'url'``, ``'headline'``,
        ``'location'``, ``'total_sections'``, ``'sections'``,
        ``'full_profile_text'`` and ``'total_words'``. If anything fails, a
        dict with the same keys holding ``'Error'`` / ``'Error occurred'``
        placeholders and zero counts is returned instead of raising.
    """
    try:
        print(f"Scraping: {url}")
        driver.get(url)
        sleep(5)  # Wait longer for page to fully load

        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Basic profile info. Each lookup is best effort: a failure yields the
        # placeholder rather than discarding the whole profile. Only
        # Exception is caught so Ctrl-C / SystemExit still propagate.
        try:
            name_element = soup.find("h1", string=True) or soup.find(
                "h1", {"class": "text-heading-xlarge inline t-24 v-align-middle break-words"}
            )
            name = _text_or_default(name_element, "Name not found")
        except Exception:
            name = "Name not found"

        try:
            headline = _text_or_default(
                soup.find('div', {'class': 'text-body-medium break-words'}),
                "Headline not found",
            )
        except Exception:
            headline = "Headline not found"

        try:
            location = _text_or_default(
                soup.find('span', {'class': 'text-body-small inline t-black--light break-words'}),
                "Location not found",
            )
        except Exception:
            location = "Location not found"

        # Extract all sections from the profile
        print(f"  📋 Extracting all sections for: {name}")
        profile_main = soup.find('main')
        sections_data = []

        if profile_main:
            sections = profile_main.find_all('section')
            print(f"  📊 Found {len(sections)} sections")

            for section_number, section in enumerate(sections, start=1):
                section_text = remove_duplicates(clean_text(section.get_text()))
                # Empty sections are dropped, but numbering still follows the
                # section's position on the page.
                if section_text.strip():
                    sections_data.append({
                        'section_number': section_number,
                        'content': section_text,
                        'word_count': len(section_text.split())
                    })
        else:
            print("  ⚠️ Main profile section not found!")

        # Combine all section text for full profile content
        full_profile_text = '\n\n'.join(section['content'] for section in sections_data)

        profile_data = {
            'name': name,
            'url': url,
            'headline': headline,
            'location': location,
            'total_sections': len(sections_data),
            'sections': sections_data,
            'full_profile_text': full_profile_text,
            'total_words': len(full_profile_text.split()) if full_profile_text else 0
        }

        print(f"  ✅ Successfully scraped: {name} ({len(sections_data)} sections, {profile_data['total_words']} words)")
        return profile_data

    except Exception as e:
        # Boundary handler: report the failure and return a placeholder record
        # with the same keys so downstream aggregation keeps working.
        print(f"  ❌ Error scraping {url}: {str(e)}")
        return {
            'name': 'Error',
            'url': url,
            'headline': 'Error occurred',
            'location': 'Error occurred',
            'total_sections': 0,
            'sections': [],
            'full_profile_text': 'Error occurred',
            'total_words': 0
        }

def setup_linkedin_driver():
    """Start Chrome and submit the LinkedIn login form.

    Credentials are read from the ``email`` and ``password`` environment
    variables.

    Returns:
        The Selenium Chrome WebDriver after the login form was submitted.
        The caller is responsible for calling ``quit()`` on it.

    Raises:
        RuntimeError: If ``email`` or ``password`` is not set. This is checked
            before a browser is launched.
        Exception: Any error raised while loading or filling the login page
            is re-raised after the browser has been closed.
    """
    email = os.getenv("email")
    password = os.getenv("password")
    if not email or not password:
        raise RuntimeError(
            "LinkedIn credentials are missing: set the 'email' and 'password' "
            "environment variables (e.g. in api.env)."
        )

    # Set up Chrome driver with options
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Login to LinkedIn
        print("Logging into LinkedIn...")
        driver.get('https://www.linkedin.com/login')
        sleep(2)

        email_input = driver.find_element(By.ID, 'username')
        email_input.send_keys(email)

        password_input = driver.find_element(By.ID, 'password')
        password_input.send_keys(password)
        password_input.submit()

        sleep(5)  # Wait for login to complete
    except BaseException:
        # Do not leave an orphaned browser behind when login fails or is
        # interrupted; the original error is re-raised unchanged.
        driver.quit()
        raise

    # NOTE: this only means the form was submitted without an exception; the
    # resulting page is not inspected to confirm the login was accepted.
    print("✅ LinkedIn login successful")

    return driver

def _filename_part(value):
    """Make user input safe to embed in a file name (spaces and path-hostile characters become '_')."""
    import re
    return re.sub(r'[ \\/:*?"<>|]', '_', value)


def main():
    """Interactive entry point: search, scrape, AI-process and save profiles.

    Steps:
      1. Prompt for company, job title, optional filters and a result count.
      2. Search for LinkedIn profile URLs.
      3. Log in with Selenium and scrape every profile; the browser is always
         closed afterwards, even if scraping is interrupted.
      4. Run each scraped profile through the LLM.
      5. Write the raw JSON, the AI-processed JSON and a text report to the
         current directory, and print a summary.

    Returns:
        None. Returns early when the search yields no profile links.
    """
    # Local import so this function does not depend on which notebook cell
    # imported datetime.
    from datetime import datetime

    # ✅ Collect input from user
    company = input("Enter company name: ").strip()
    job_title = input("Enter job title: ").strip()
    role_tag = input("Enter role tag (optional): ").strip()
    country = input("Enter country (optional): ").strip()
    state = input("Enter state (optional): ").strip()

    try:
        num_results = int(input("How many LinkedIn profiles do you want to fetch? (e.g., 5): ").strip())
    except ValueError:
        num_results = 5  # fallback
    if num_results < 1:
        # Zero or negative counts make no sense for the search or the slice
        # applied to its results; use the same fallback as invalid input.
        num_results = 5

    # 🧠 Make empty strings None
    role_tag = role_tag or None
    country = country or None
    state = state or None

    # 🔍 Search LinkedIn profiles
    print("\n🔍 Searching for LinkedIn profiles...")
    profile_links = search_linkedin_profiles(company, job_title, role_tag, country, state, num_results)

    if not profile_links:
        print("❌ No LinkedIn profiles found!")
        return

    # 📤 Display found URLs
    total_links = len(profile_links)
    print(f"\n📋 Found {total_links} LinkedIn Profiles:")
    for idx, profile in enumerate(profile_links, start=1):
        print(f"{idx}. {profile}")

    # 🤖 Setup Selenium driver and login
    driver = setup_linkedin_driver()

    # 📊 Scrape all profiles
    all_profile_data = []
    print(f"\n🚀 Starting to scrape {total_links} profiles...")

    try:
        for idx, url in enumerate(profile_links, start=1):
            print(f"\n[{idx}/{total_links}] Processing profile...")
            all_profile_data.append(scrape_linkedin_profile(driver, url))

            # Add delay between requests to avoid being blocked
            if idx < total_links:
                print("⏳ Waiting 5 seconds before next profile...")
                sleep(5)
    finally:
        # Always release the browser, including on Ctrl-C during scraping.
        driver.quit()
        print("🌐 Browser closed after scraping.")

    # 🤖 Process all profiles with AI
    ai_processed_profiles = process_all_profiles_with_ai(all_profile_data)

    # 💾 Save results
    print("\n💾 Saving results...")

    # Shared, sanitized suffix so a '/' or similar character typed by the user
    # cannot break the output paths after all the work is done.
    file_suffix = f"{_filename_part(company)}_{_filename_part(job_title)}"

    # Save raw scraped data
    raw_output_filename = f"raw_linkedin_profiles_{file_suffix}.json"
    with open(raw_output_filename, 'w', encoding='utf-8') as f:
        json.dump(all_profile_data, f, indent=2, ensure_ascii=False)

    # Save AI processed data
    ai_output_filename = f"ai_processed_profiles_{file_suffix}.json"
    with open(ai_output_filename, 'w', encoding='utf-8') as f:
        json.dump(ai_processed_profiles, f, indent=2, ensure_ascii=False)

    # Display results
    print("\n✅ Processing completed!")
    print(f"📁 Raw scraped data saved to: {raw_output_filename}")
    print(f"🤖 AI processed data saved to: {ai_output_filename}")

    print("\n📊 PROCESSING SUMMARY:")
    print("=" * 80)

    successful_ai_processing = sum(1 for profile in ai_processed_profiles if profile['processing_status'] == 'success')
    failed_ai_processing = len(ai_processed_profiles) - successful_ai_processing

    total_sections = sum(profile['total_sections'] for profile in all_profile_data)
    total_words = sum(profile['total_words'] for profile in all_profile_data)
    # Non-zero here: one record is appended per link and the link list was
    # checked to be non-empty above.
    profile_count = len(all_profile_data)

    print("📈 Overall Statistics:")
    print(f"   • Total Profiles Scraped: {profile_count}")
    print(f"   • AI Processing Successful: {successful_ai_processing}")
    print(f"   • AI Processing Failed: {failed_ai_processing}")
    print(f"   • Total Sections Extracted: {total_sections}")
    print(f"   • Total Words Extracted: {total_words:,}")
    print(f"   • Average Sections per Profile: {total_sections/profile_count:.1f}")
    print(f"   • Average Words per Profile: {total_words/profile_count:.0f}")

    print("\n🤖 AI STRUCTURED PROFILES:")
    print("=" * 80)

    for idx, ai_profile in enumerate(ai_processed_profiles, start=1):
        original = ai_profile['original_profile']
        succeeded = ai_profile['processing_status'] == 'success'
        content = ai_profile['ai_structured_content']

        print(f"\n[Profile {idx}] {original['name']}")
        print(f"   Status: {'✅ Success' if succeeded else '❌ Failed'}")
        print(f"   URL: {original['url']}")
        print(f"   Raw Sections: {original['total_sections']}")
        print(f"   Raw Words: {original['total_words']:,}")

        if succeeded:
            # Show a preview of the AI structured content
            structured_preview = content[:300].replace('\n', ' ')
            print(f"   AI Preview: {structured_preview}{'...' if len(content) > 300 else ''}")
        else:
            print(f"   Error: {content}")

        print("-" * 60)

    # Create comprehensive structured report
    structured_report_filename = f"structured_report_{file_suffix}.txt"
    with open(structured_report_filename, 'w', encoding='utf-8') as f:
        f.write("AI STRUCTURED LINKEDIN PROFILE REPORT\n")
        f.write(f"Company: {company}\n")
        f.write(f"Job Title: {job_title}\n")
        f.write(f"Total Profiles: {len(ai_processed_profiles)}\n")
        f.write(f"Successful AI Processing: {successful_ai_processing}\n")
        # Timestamp from the standard library rather than shelling out to the
        # platform-specific `date` command.
        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write("=" * 80 + "\n\n")

        for idx, ai_profile in enumerate(ai_processed_profiles, start=1):
            original = ai_profile['original_profile']
            f.write(f"PROFILE {idx}: {original['name']}\n")
            f.write(f"URL: {original['url']}\n")
            f.write(f"Processing Status: {ai_profile['processing_status']}\n")
            f.write("-" * 60 + "\n")

            if ai_profile['processing_status'] == 'success':
                f.write("AI STRUCTURED DATA:\n")
                f.write(ai_profile['ai_structured_content'])
            else:
                f.write(f"PROCESSING ERROR:\n{ai_profile['ai_structured_content']}")

            f.write("\n" + "=" * 80 + "\n\n")

    print(f"\n📄 Comprehensive structured report saved to: {structured_report_filename}")

    print("\n🎉 All processing completed! Check the generated files for your structured LinkedIn data.")

# Run the interactive pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enter company name:  microsoft
Enter job title:  intern
Enter role tag (optional):  swe intern
Enter country (optional):  india
Enter state (optional):  delhi 
How many LinkedIn profiles do you want to fetch? (e.g., 5):  2



🔍 Searching for LinkedIn profiles...

📋 Found 2 LinkedIn Profiles:
1. https://in.linkedin.com/in/07ankitathakur
2. https://in.linkedin.com/in/nishthapabreja
Logging into LinkedIn...
✅ LinkedIn login successful

🚀 Starting to scrape 2 profiles...

[1/2] Processing profile...
Scraping: https://in.linkedin.com/in/07ankitathakur
  📋 Extracting all sections for: Ankita .
  📊 Found 16 sections
  ✅ Successfully scraped: Ankita . (16 sections, 5275 words)
⏳ Waiting 5 seconds before next profile...

[2/2] Processing profile...
Scraping: https://in.linkedin.com/in/nishthapabreja
  📋 Extracting all sections for: Nishtha Pabreja
  📊 Found 13 sections
  ✅ Successfully scraped: Nishtha Pabreja (13 sections, 1340 words)
🌐 Browser closed after scraping.

🤖 Starting AI processing for 2 profiles...
🤖 AI Processing [1/2]: Ankita .
  ✅ AI processing completed for: Ankita .
  ⏳ Waiting 2 seconds before next AI processing...
🤖 AI Processing [2/2]: Nishtha Pabreja
  ✅ AI processing completed for: Nishtha Pa