In [10]:
import os
import time
import json
import getpass
from bs4 import BeautifulSoup

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# --- Configuration ---
# Profile page that main() opens in the browser after logging in.
PROFILE_URL = "https://www.linkedin.com/in/dfbaron/" # Your profile URL

def get_credentials():
    """Return the LinkedIn login as an ``(email, password)`` tuple.

    Values are read from the ``LINKEDIN_EMAIL`` and ``LINKEDIN_PASSWORD``
    environment variables. Any value that is unset or empty is requested
    interactively instead; the password prompt goes through ``getpass`` so
    it is not echoed back.

    Returns:
        tuple[str, str]: ``(email, password)``.
    """
    # SECURITY: credentials must never be written into this file. Any secret
    # that has been committed to source control should be treated as leaked
    # and rotated.
    email = os.getenv("LINKEDIN_EMAIL") or input("LinkedIn email: ").strip()
    password = os.getenv("LINKEDIN_PASSWORD") or getpass.getpass("LinkedIn password: ")

    return email, password

# --- HELPER PARSING FUNCTIONS ---

def get_clean_text(element):
    """Return the text of a BeautifulSoup element with tidy line breaks.

    ``<br>`` tags are turned into newlines so multi-line descriptions keep
    their line structure. Every resulting line is stripped of surrounding
    whitespace and blank lines are dropped.

    Note: the ``<br>`` tags are replaced in place, so the element's tree is
    modified by this call.

    Args:
        element: A BeautifulSoup element, or ``None`` when a lookup failed.

    Returns:
        str: The cleaned text, or ``'N/A'`` when ``element`` is falsy.
    """
    if not element:
        return 'N/A'

    # Replace <br> tags with newlines for descriptions
    for br in element.find_all("br"):
        br.replace_with("\n")

    # Do not use get_text(strip=True) here: it strips each text node on its
    # own and skips the ones left empty, which throws away the "\n" inserted
    # above and glues neighbouring fragments together.
    raw_text = element.get_text()
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)

def find_section(soup, section_id):
    """Locate the card-style <section> that contains the given anchor <div>.

    Looks up the ``<div>`` whose ``id`` equals ``section_id`` and returns its
    enclosing ``<section class="artdeco-card">``. Returns ``None`` when the
    anchor does not exist (or has no such ancestor).
    """
    anchor_div = soup.find('div', id=section_id)
    return anchor_div.find_parent('section', class_='artdeco-card') if anchor_div else None

# --- SCRAPING LOGIC FOR EACH PROFILE SECTION ---

def scrape_header(soup):
    """Read name, headline and location from the profile's top card.

    The top card is the first ``<section>`` with a class containing
    ``'pv-top-card'``. Returns an empty dict when no such section exists.
    """
    def is_top_card_class(css_class):
        # Loose substring match keeps the lookup tolerant of class suffixes.
        return css_class and 'pv-top-card' in css_class

    card = soup.find('section', class_=is_top_card_class)
    if not card:
        return {}

    return {
        'name': get_clean_text(card.find('h1')),
        'headline': get_clean_text(card.find('div', class_='text-body-medium')),
        # Narrow selector that targets the location span.
        'location': get_clean_text(card.select_one('span.text-body-small.inline.t-black--light')),
    }

def scrape_about(soup):
    """Return the text of the "about" section, or ``'N/A'`` if it is absent."""
    section = find_section(soup, 'about')
    if section:
        text_selector = 'div[class*="inline-show-more-text"] span[aria-hidden="true"]'
        return get_clean_text(section.select_one(text_selector))
    return 'N/A'

def scrape_experience(soup):
    """Build one dict per entry in the "experience" section.

    Each dict always has ``title``, ``description`` and ``skills``; the keys
    ``company_and_type``, ``dates`` and ``location`` are filled from the
    first, second and third metadata spans when those exist. List items
    without a title element are skipped. Returns ``[]`` when the section is
    missing.
    """
    section = find_section(soup, 'experience')
    if not section:
        return []

    # Metadata spans are assigned to these keys positionally.
    meta_keys = ('company_and_type', 'dates', 'location')

    entries = []
    for item in section.select('ul > li.artdeco-list__item'):
        title_node = item.select_one('div.t-bold span[aria-hidden="true"]')
        if not title_node:
            continue

        entry = {'title': get_clean_text(title_node)}

        meta_nodes = item.select('span.t-14 span[aria-hidden="true"]')
        for key, node in zip(meta_keys, meta_nodes):
            entry[key] = get_clean_text(node)

        description_node = item.select_one('div[class*="inline-show-more-text"] span[aria-hidden="true"]')
        entry['description'] = get_clean_text(description_node)

        entry['skills'] = get_clean_text(item.select_one('strong'))

        entries.append(entry)

    return entries

def scrape_education(soup):
    """Build one dict per entry in the "education" section.

    Each dict always has ``school`` and ``description``; ``degree`` and
    ``dates`` are filled from the first and second metadata spans when those
    exist. List items without a school element are skipped. Returns ``[]``
    when the section is missing.
    """
    section = find_section(soup, 'education')
    if not section:
        return []

    # Metadata spans are assigned to these keys positionally.
    meta_keys = ('degree', 'dates')

    entries = []
    for item in section.select('ul > li.artdeco-list__item'):
        school_node = item.select_one('span.t-bold span[aria-hidden="true"]')
        if not school_node:
            continue

        entry = {'school': get_clean_text(school_node)}

        meta_nodes = item.select('span.t-14 span[aria-hidden="true"]')
        for key, node in zip(meta_keys, meta_nodes):
            entry[key] = get_clean_text(node)

        description_node = item.select_one('div[class*="inline-show-more-text"] span[aria-hidden="true"]')
        entry['description'] = get_clean_text(description_node)

        entries.append(entry)

    return entries

# --- MAIN SCRIPT EXECUTION ---
def _scrape_profile_html(html_filename):
    """Log into LinkedIn with Selenium and return the profile page's HTML.

    The page source is also written to ``html_filename`` so later runs can
    reuse it. Returns ``None`` when any step after the WebDriver has been
    created raises; the WebDriver is closed in every case.
    """
    email, password = get_credentials()

    print("Initializing WebDriver...")
    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service)

    try:
        # Sign in through the regular login form.
        print("Logging in...")
        driver.get("https://www.linkedin.com/login")
        username_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "username"))
        )
        username_field.send_keys(email)
        driver.find_element(By.ID, "password").send_keys(password)
        driver.find_element(By.XPATH, '//button[@type="submit"]').click()

        # The search box in the global nav is used as the "logged in" signal.
        print("Waiting for login to complete...")
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, "global-nav-search"))
        )

        print(f"Navigating to profile: {PROFILE_URL}")
        driver.get(PROFILE_URL)

        # Scroll to the bottom up to five times, stopping early once the
        # document height stops growing.
        print("Scrolling to load entire page...")
        read_height_js = "return document.body.scrollHeight"
        previous_height = driver.execute_script(read_height_js)
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            current_height = driver.execute_script(read_height_js)
            if current_height == previous_height:
                break
            previous_height = current_height

        print("Extracting page source...")
        html_source = driver.page_source

        # Keep a local copy so the next run can skip the browser.
        with open(html_filename, 'w', encoding='utf-8') as f:
            f.write(html_source)
        print(f"Successfully saved page source to '{html_filename}'.")
        return html_source
    except Exception as e:
        print(f"\nAn error occurred during scraping: {e}")
        return None
    finally:
        print("Closing WebDriver.")
        driver.quit()


def main():
    """Scrape a LinkedIn profile and write the parsed data to JSON.

    Uses the HTML saved in ``linkedin_profile.html`` when that file exists;
    otherwise drives a browser to fetch it. The parsed header, about,
    experience and education data are written to ``linkedin_data.json``.
    """
    html_filename = 'linkedin_profile.html'

    # Prefer the saved page; only fall back to the browser when it is missing.
    try:
        with open(html_filename, 'r', encoding='utf-8') as f:
            print(f"Found and loaded '{html_filename}'. Skipping web scraping.")
            html_source = f.read()
    except FileNotFoundError:
        print(f"'{html_filename}' not found. Starting web scraper...")
        html_source = _scrape_profile_html(html_filename)
        if html_source is None:
            # Scraping failed and has already been reported.
            return

    if not html_source:
        print("Could not get HTML source. Exiting.")
        return

    soup = BeautifulSoup(html_source, "html.parser")

    print("\nParsing data from HTML...")
    profile_data = {
        'profile': scrape_header(soup),
        'about': scrape_about(soup),
        'experiences': scrape_experience(soup),
        'education': scrape_education(soup),
    }

    output_filename = 'linkedin_data.json'
    print(f"Saving parsed data to {output_filename}...")
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(profile_data, f, ensure_ascii=False, indent=4)

    print("\nProcess complete!")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

'linkedin_profile.html' not found. Starting web scraper...
Initializing WebDriver...
Logging in...
Waiting for login to complete...
Navigating to profile: https://www.linkedin.com/in/dfbaron/
Scrolling to load entire page...
Extracting page source...
Successfully saved page source to 'linkedin_profile.html'.
Closing WebDriver.

Parsing data from HTML...
Saving parsed data to linkedin_data.json...

Process complete!
