In [5]:
#imports
import os
import pickle
import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import chromedriver_autoinstaller
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import random
import sqlite3
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import gender_guesser.detector as gender
from human_mimic import human_sleep, human_scroll, random_hover


# **LOGIN**
- Attempted change: a one-time manual login, after which cookies are saved and reused automatically until LinkedIn logs you out — a persistent, long-lasting session, as if you were a real user
- Better user experience than manually scraping a cookie
- I'm also going to reduce the number of top level functions because I learnt what a class really is...

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pickle, os, time, random
import chromedriver_autoinstaller

class SessionManager:
    """Owns a Chrome WebDriver and keeps a LinkedIn session alive via saved cookies.

    On `login()` the manager first tries to restore a previous session from the
    pickled cookie file; if that fails it asks the user to log in manually in
    the opened Chrome window and then saves the fresh cookies for next time.

    Args:
        cookie_path: File the session cookies are pickled to / loaded from.
        use_user_profile: When True, start Chrome with `user_profile_path` as
            its user-data directory (if a path was given).
        user_profile_path: Chrome user-data directory to reuse, or None.
    """

    def __init__(self, cookie_path="your_cookies.pkl", use_user_profile=False, user_profile_path=None):
        self.cookie_path = cookie_path
        self.use_user_profile = use_user_profile
        self.user_profile_path = user_profile_path
        # Note: constructing the manager launches a Chrome window immediately.
        self.driver = self._init_driver()

    def _init_driver(self):
        """Install a matching chromedriver and return a configured Chrome WebDriver."""
        chromedriver_autoinstaller.install()
        options = Options()
        options.add_argument("--user-agent=Mozilla/5.0")

        if self.use_user_profile:
            if self.user_profile_path:
                options.add_argument(f"--user-data-dir={self.user_profile_path}")
                print(f"[i] Using custom user profile: {self.user_profile_path}")
            else:
                print("[i] User profile flag is on but no path specified.")

        return webdriver.Chrome(options=options)

    def _human_mimic(self):
        """Scroll, hover and pause on the current page using the human_mimic helpers."""
        human_scroll(self.driver, total_scrolls=5)
        random_hover(self.driver)
        human_sleep(2, 0.5)

    def _save_cookies(self):
        """Pickle the driver's current cookies to `self.cookie_path`."""
        with open(self.cookie_path, "wb") as f:
            pickle.dump(self.driver.get_cookies(), f)
        print(f"[SUCCESS] Cookies saved to {self.cookie_path}")

    def _load_cookies(self):
        """Load pickled cookies from `self.cookie_path` into the driver (best effort).

        Cookies the browser rejects are skipped rather than aborting the whole
        restore; the number skipped is reported so failures are not invisible.

        Raises:
            OSError, EOFError, pickle.UnpicklingError: if the file cannot be
                opened or is not a valid pickle.
        """
        # SECURITY: pickle.load can execute arbitrary code. Only load cookie
        # files written by this notebook, never one obtained from elsewhere.
        with open(self.cookie_path, "rb") as f:
            cookies = pickle.load(f)

        skipped = 0
        for cookie in cookies:
            try:
                self.driver.add_cookie(cookie)
            except Exception:
                skipped += 1
        if skipped:
            print(f"[i] Skipped {skipped} cookie(s) the browser rejected.")

    def login(self):
        """Log in to LinkedIn, preferring saved cookies over a manual login.

        Returns:
            The WebDriver instance. It is returned on every path, including a
            failed manual login (an error is printed in that case), so callers
            should not treat the return value as proof of being logged in.
        """
        # Cookies can only be added for the domain that is currently loaded.
        self.driver.get("https://www.linkedin.com")
        time.sleep(3)

        if os.path.exists(self.cookie_path):
            try:
                self._load_cookies()
            except (pickle.UnpicklingError, EOFError, OSError) as exc:
                # A corrupt or unreadable cookie file should not kill the
                # session; fall through to the manual login below.
                print(f"[ERROR] Could not read saved cookies: {exc}")
            else:
                self.driver.get("https://www.linkedin.com/feed/")
                time.sleep(3)

                if "feed" in self.driver.current_url:
                    print("[SUCCESS] Logged in using saved cookies.")
                    self._human_mimic()
                    return self.driver
                else:
                    print("[ERROR] Saved cookies invalid or expired.")

        print("Manual login required.")
        print("Please login in the Chrome window. Waiting...")

        self.driver.get("https://www.linkedin.com/login")
        input("Press [ENTER] after you've logged in...")

        if "feed" in self.driver.current_url:
            self._save_cookies()
            self._human_mimic()
        else:
            print("[ERROR] Login failed. Please try again.")

        return self.driver


# Launch Chrome through SessionManager with its defaults (cookie file
# "your_cookies.pkl", no custom user profile).
session = SessionManager()
# login() hands back the WebDriver on every path, including a failed manual
# login, so check the printed status before relying on `driver`.
driver = session.login()




[!] Manual login required.
Please login in the Chrome window. Waiting...
[✓] Cookies saved to proto_cookies.pkl


# **Navigate to PYMK tab**
- Change: the function now takes a general input, e.g. University of Leeds, Bristol, people who work in consulting, etc.
- longer wait times for lazy load

In [6]:
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from human_mimic import human_sleep, random_hover
import random

def wait_and_open_target_tab(driver, target_label, max_retries=5, scroll_loops=8):
    """
    Navigates to the 'My Network' page and tries to open a specific 'People You May Know' tab.

    Args:
        driver: Selenium WebDriver instance
        target_label (str): The dynamic part of the tab label (e.g. 'University of Exeter')
        max_retries (int): Max attempts before giving up
        scroll_loops (int): How many scrolls to perform to trigger lazy loading
    """
    print(f"Trying to open 'People you may know from {target_label}' tab")

    for attempt in range(max_retries):
        print(f"[{attempt + 1}/{max_retries}] Searching...")

        driver.get("https://www.linkedin.com/mynetwork/")
        human_sleep(5, 2)
        random_hover(driver, "a")

        for scroll in range(scroll_loops):
            driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
            print(f"    Scrolled down {scroll + 1}/{scroll_loops}")
            human_sleep(2, 1.5)  # increased wait time per scroll

            if scroll in {2, 5} and random.random() < 0.4:
                print("   ...pausing to simulate reading")
                human_sleep(3, 1)

        try:
            show_all_btn = driver.find_element(
                By.XPATH,
                f"//button[@aria-label='Show all suggestions for People you may know from {target_label}']"
            )
            driver.execute_script("arguments[0].scrollIntoView({ behavior: 'smooth' });", show_all_btn)
            human_sleep(2, 1)

            ActionChains(driver).move_to_element(show_all_btn).perform()
            human_sleep(1.5, 0.5)

            driver.execute_script("arguments[0].click();", show_all_btn)
            print(f"Opened suggestions for: {target_label}")
            return True

        except Exception:
            print("Did not find the tab this time.")

            if random.random() < 0.4:
                driver.get("https://www.linkedin.com/feed/")
                print("Went back to feed to reset.")
            else:
                print("Retrying directly from My Network...")

            human_sleep(5, 2)

    print("Gave up after too many attempts, are you sure you entered in the section name EXACTLY AS IT'S WRITTEN?.")
    return False


### **Why I Chose a Two-Phase Scraping Pipeline (Option A)**

We separate scraping into two distinct phases:

1. **Phase 1 – Shallow Scrape (PYMK tab):**
   - Collect basic metadata (name, role, profile URL) from the "People You May Know" view.
   - Avoids interrupting LinkedIn’s scroll-based loading behavior.

2. **Phase 2 – Deep Scrape (Full Profiles):**
   - Later, visit each profile URL individually and save raw HTML for offline parsing.

#### **Why This Is Better Than In-Loop HTML Scraping (Option B):**
- Keeps the PYMK tab stable — avoids breaking the scroll loop.
- Reduces suspicious behavior (less jumping between pages).
- Faster and more efficient for gathering large batches.
- Allows offline parsing and reprocessing without re-scraping.

This design increases reliability and stealth while keeping the scraping modular and maintainable.


# **Shallow Scraping**
- No global `seen_urls`/`people` — the function is self-contained


In [7]:
def scroll_and_extract_profiles(
    driver,
    pause_range=(2.5, 4.0),
    streak_limit=5,
    scrolls_per_loop=10,
    scroll_factor=1.8,
    max_profiles=100
):
    """
    Scrolls through the suggestion tab and extracts new profile previews.

    Each loop parses the current page source for profile links, then scrolls
    the dialog's scroll container and clicks the 'See more' button if present.
    The loop ends when `max_profiles` is reached or when `streak_limit`
    consecutive passes yield nothing new.

    Args:
        driver: Selenium WebDriver instance.
        pause_range: Two values unpacked into `human_sleep(*pause_range)` between loops.
            Originally described as (min, max) — TODO confirm against human_sleep's
            signature, since other calls in this notebook pass (base, smaller offset).
        streak_limit: How many times in a row we can scroll with 0 new profiles.
        scrolls_per_loop: Number of scrolls per iteration.
        scroll_factor: Multiple of the container's visible height to scroll per step
            (randomly adjusted by up to ±0.3 each step).
        max_profiles: Optional int to stop scraping after reaching N profiles
            (None disables the limit).

    Returns:
        pandas.DataFrame with columns `name`, `headline`, `profile_url`, one row
        per unique profile URL. The columns are present even when no profiles
        were collected.
    """

    result_columns = ["name", "headline", "profile_url"]
    seen_urls = set()
    people = []
    streak = 0
    loop = 0

    print("\nStarting deep scroll & extract...\n")

    def get_scroll_container(timeout=10):
        """Return the dialog's scroll container element, or None if it never appears."""
        try:
            container = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#root > dialog > div > div:nth-child(2)")
                )
            )
            return container
        except Exception:
            print("   Scroll container not found.")
            return None

    def extract_new_people():
        """Parse the current page and append unseen profiles; return how many were added."""
        soup = BeautifulSoup(driver.page_source, "html.parser")
        cards = soup.find_all("a", href=True)
        new_count = 0

        for card in cards:
            href = card['href']
            # Only absolute profile links count as cards.
            if not href.startswith("https://www.linkedin.com/in/"):
                continue
            if href in seen_urls:
                continue

            try:
                # A usable card has at least two <p> tags: name, then headline.
                paragraphs = card.find_all("p")
                if len(paragraphs) < 2:
                    continue

                name = paragraphs[0].get_text(strip=True)
                headline = paragraphs[1].get_text(strip=True)

                people.append({
                    "name": name,
                    "headline": headline,
                    "profile_url": href
                })
                seen_urls.add(href)
                new_count += 1

                if max_profiles is not None and len(seen_urls) >= max_profiles:
                    break  # stop collecting more this round

            except Exception as e:
                print("   Error parsing card:", e)
                continue

        return new_count

    while True:
        loop += 1
        print(f"Loop {loop}")

        new_profiles = extract_new_people()
        print(f"   New: {new_profiles} | Total collected: {len(seen_urls)}")

        if max_profiles is not None and len(seen_urls) >= max_profiles:
            print(f"Reached max scrape limit ({max_profiles}). Stopping.")
            break

        if new_profiles == 0:
            streak += 1
            print(f"   No new profiles ({streak}/{streak_limit})")
            if streak >= streak_limit:
                print("Too many empty scrolls. Ending loop.")
                break
        else:
            streak = 0

        scroll_container = get_scroll_container()
        if scroll_container and scroll_container.size['height'] > 0:
            try:
                ActionChains(driver).move_to_element(scroll_container).perform()
                human_sleep(0.5, 0.2)

                for s in range(scrolls_per_loop):
                    scroll_step = scroll_factor + random.uniform(-0.3, 0.3)
                    driver.execute_script("""
                        let container = arguments[0];
                        container.scrollTop += container.clientHeight * arguments[1];
                    """, scroll_container, scroll_step)

                    print(f"      Scroll {s+1}/{scrolls_per_loop} [factor: {scroll_step:.2f}]")
                    human_sleep(random.uniform(1.5, 2.5))

                try:
                    # find_element raises when the button is absent, so reaching
                    # the next line means the button exists.
                    # NOTE(review): the selector relies on generated-looking class
                    # names (_1xoe5hdi.cnuthtrs) and may break when the page changes.
                    see_more_button = driver.find_element(
                        By.CSS_SELECTOR,
                        "#root > dialog > div > div > div > div > section > div > div > div > div._1xoe5hdi.cnuthtrs > button"
                    )
                    print("   Found 'See more' button — clicking.")
                    driver.execute_script("arguments[0].scrollIntoView({ behavior: 'smooth' });", see_more_button)
                    human_sleep(2, 1)
                    ActionChains(driver).move_to_element(see_more_button).perform()
                    human_sleep(1.5, 0.5)
                    driver.execute_script("arguments[0].click();", see_more_button)
                    human_sleep(2, 1)
                except Exception:
                    print("   No 'See more' button found this round.")

                # Every third loop, take a longer break half of the time.
                if loop % 3 == 0 and random.random() < 0.5:
                    pause = random.uniform(5, 9)
                    print(f"   Taking a longer pause: {pause:.1f}s")
                    human_sleep(pause / 2, pause / 2)

            except Exception as e:
                print("   Failed to scroll:", e)

        else:
            # A missing container also counts towards the empty-scroll streak.
            print("   Scroll container missing or collapsed.")
            streak += 1

        human_sleep(*pause_range)

    # Fix the column set so an empty result still has the expected schema.
    df = pd.DataFrame(people, columns=result_columns)
    print(f"\nDone. Collected {len(df)} unique profiles.\n")
    return df


: 