# **Notebook Overview**

The purpose of this notebook is to login and navigate to the target area
Scrape and store profiles to a csv
Perform analysis on them and rank by likelihood of acceptance based on initial intuition
Then we rank them best to worst and apply to the 20 best, sample size of 100 or so
ML model to predict acceptance likelihood
ML integration into flow
Mission done

## **Logging in**

In [2]:
import os
import pickle
import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import chromedriver_autoinstaller

# When login breaks: close everything, restart the terminal, open Google Chrome,
# log in to LinkedIn and copy a fresh `li_at` cookie value.
# Two credential sources are used: COOKIE_FILE stores the pickled cookies of the
# last working session, and LI_AT_COOKIE is the most recent `li_at` value copied
# from LinkedIn (the fallback when the saved cookies no longer log in).
# The cookie is a login credential, so prefer supplying it through the
# LI_AT_COOKIE environment variable rather than pasting it into the notebook;
# the placeholder is used only when that variable is not set.
LI_AT_COOKIE = os.environ.get("LI_AT_COOKIE", "YOUR LI_AT COOKIE FROM LINKEDIN")
COOKIE_FILE = "linkedin_cookies.pkl"

#core functions to open, save session

def save_linkedin_session(driver, cookie_path=COOKIE_FILE):
    """Pickle the browser's current cookies to ``cookie_path``.

    Args:
        driver: object exposing ``get_cookies()`` (the Selenium driver).
        cookie_path: file to write; an existing file is overwritten.
    """
    session_cookies = driver.get_cookies()
    with open(cookie_path, "wb") as cookie_file:
        cookie_file.write(pickle.dumps(session_cookies))
    print(f"cookies saved to {cookie_path}")

def load_linkedin_session(cookie_path=COOKIE_FILE):
    """Start Chrome and try to restore a logged-in LinkedIn session.

    The saved cookies in ``cookie_path`` are tried first. If the file is
    missing, unreadable, or the cookies no longer reach the feed, the
    ``li_at`` fallback in ``linkedin_login_with_li_at`` is used instead.

    Args:
        cookie_path: pickle file previously written by
            ``save_linkedin_session``.

    Returns:
        The Chrome driver. It is returned whether or not a login attempt
        succeeded; the printed messages report which path was taken.
    """
    chromedriver_autoinstaller.install()
    options = Options()
    options.add_argument("--user-agent=Mozilla/5.0")
    driver = webdriver.Chrome(options=options)

    # Load the site before adding cookies for it.
    driver.get("https://www.linkedin.com")
    time.sleep(3)

    if os.path.exists(cookie_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only load a
        # cookie file that this notebook wrote itself.
        try:
            with open(cookie_path, "rb") as f:
                cookies = pickle.load(f)
        except (pickle.UnpicklingError, EOFError) as exc:
            # A truncated or corrupt file should not block the li_at fallback.
            print(f"Could not read {cookie_path} ({exc!r}); ignoring saved cookies.")
            cookies = []

        skipped = 0
        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except Exception:
                # Best effort: one rejected cookie must not abort the restore.
                # Unlike a bare `except:`, this lets KeyboardInterrupt and
                # SystemExit propagate.
                skipped += 1
        if skipped:
            print(f"Skipped {skipped} saved cookie(s) that the browser rejected.")

        driver.get("https://www.linkedin.com/feed/")
        time.sleep(5)

        # Staying on a URL containing "feed" is treated as being logged in.
        if "feed" in driver.current_url:
            print("Logged in with saved session cookies.")
            return driver
        else:
            print("Cookie login failed. Trying li_at fallback...")

    return linkedin_login_with_li_at(driver)

def linkedin_login_with_li_at(driver):
    """Log in by injecting the ``li_at`` cookie from ``LI_AT_COOKIE``.

    On success (the browser ends up on a URL containing "feed") the session
    cookies are saved with ``save_linkedin_session``. The driver is returned
    in both the success and the failure case.
    """
    home_url = "https://www.linkedin.com"
    feed_url = "https://www.linkedin.com/feed/"
    auth_cookie = dict(
        name='li_at',
        value=LI_AT_COOKIE,
        domain='.linkedin.com',
        path='/',
        secure=True,
        httpOnly=True,
    )

    # Load the site before adding the cookie for it.
    driver.get(home_url)
    time.sleep(3)
    driver.add_cookie(auth_cookie)

    driver.get(feed_url)
    time.sleep(5)

    reached_feed = "feed" in driver.current_url
    if not reached_feed:
        print("li_at cookie failed. Check your value or try manual login.")
        return driver

    print("Logged in using li_at cookie.")
    save_linkedin_session(driver)
    return driver

#basic call before running anything else
#driver = load_linkedin_session()


## Why We Save and Load LinkedIn Session Cookies

To interact with LinkedIn programmatically (such as sending connections or scraping profiles), we need to be logged into our account. Normally, this would require logging in every time the script runs, which is time-consuming and may trigger LinkedIn's bot protection systems.

Instead, we save our login session using cookies. When we're already logged in, LinkedIn stores a session cookie that keeps us authenticated. By saving this cookie after a successful login, we can reuse it in future sessions without needing to log in again.

This approach has several advantages:
- Avoids logging in manually every time
- Reduces the chance of being blocked or flagged as a bot
- Makes the bot fully automatic and ready to run on a schedule
- Allows us to access LinkedIn pages that require authentication, like the feed or "People You May Know"

In this notebook, we use two functions:
- `save_linkedin_session(driver)`: saves the current session cookies to a file
- `load_linkedin_session()`: loads the saved cookies to restore the session

If the cookies are missing or expired, the script falls back to using a fresh `li_at` cookie to log in and then saves the new session.

This setup allows our LinkedIn bot to operate without needing manual login each time.


In [3]:
from selenium.webdriver.common.by import By
import time
import random

def wait_and_open_exeter_tab(driver, target="University of Exeter", max_retries=10, scroll_loops=8):
    """Open the "People you may know from <target>" list on My Network.

    Each attempt loads the My Network page, scrolls to the bottom
    ``scroll_loops`` times to trigger lazy loading, then looks for the
    "Show all suggestions" button for ``target`` and clicks it via JavaScript.
    After a failed attempt the browser is sent to the feed before retrying.

    Args:
        driver: Selenium driver with a logged-in session.
        target: text that completes the button's aria-label. It is embedded
            in a single-quoted XPath literal, so it must not contain an
            apostrophe.
        max_retries: number of attempts before giving up.
        scroll_loops: scroll-to-bottom repetitions per attempt.

    Returns:
        True once the button has been clicked, False if every attempt failed
        (including when ``max_retries`` is 0).
    """
    for attempt in range(max_retries):
        print(f"Attempt {attempt + 1}/{max_retries} to open '{target}' tab")

        # Step 1: go to My Network
        driver.get("https://www.linkedin.com/mynetwork/")
        time.sleep(random.uniform(4, 6))

        # Step 2: aggressive scrolling to trigger lazy loading
        for scroll in range(scroll_loops):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(f"   scrolled {scroll + 1}/{scroll_loops}")
            time.sleep(random.uniform(0.8, 1.5))

        # Step 3: try to find and click the target tab
        try:
            show_all_btn = driver.find_element(
                By.XPATH,
                f"//button[@aria-label='Show all suggestions for People you may know from {target}']"
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", show_all_btn)
            time.sleep(random.uniform(1, 2))
            # Click through JavaScript rather than the element's own click().
            driver.execute_script("arguments[0].click();", show_all_btn)
            print(f"Opened '{target}' tab")
            return True

        except Exception as e:
            print("Did not find the target tab this time.")
            # Name the failure so a dead session or stale element is not
            # mistaken for a button that simply was not rendered.
            print(f"   reason: {type(e).__name__}")
            driver.get("https://www.linkedin.com/feed/")
            time.sleep(random.uniform(4, 6))

    print(" Gave up after too many tries.")
    return False






# **Scraping Profiles from the People You Might Know Page**

In [4]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Module-level scrape state shared by the functions below. It is not reset
# between calls, so results accumulate for as long as the kernel stays alive.
# Profile URLs already recorded; extract_new_people uses it to skip duplicates.
seen_urls = set()
# One dict per scraped profile with keys "name", "headline" and "profile_url".
people = []

def extract_new_people(driver):
    """Record profile cards on the current page that have not been seen yet.

    Parses ``driver.page_source`` and inspects every ``<a href>`` whose link
    starts with the LinkedIn profile prefix. A link counts as a card only if
    it contains at least two ``<p>`` elements: the first is taken as the name
    and the second as the headline. New entries are appended to the
    module-level ``people`` list and their URL is added to ``seen_urls``.

    Returns:
        The number of profiles added by this call.
    """
    profile_prefix = "https://www.linkedin.com/in/"
    parsed_page = BeautifulSoup(driver.page_source, "html.parser")
    added = 0

    for anchor in parsed_page.find_all("a", href=True):
        link = anchor['href']
        if not link.startswith(profile_prefix) or link in seen_urls:
            continue

        try:
            text_nodes = anchor.find_all("p")
            if len(text_nodes) >= 2:
                record = {
                    "name": text_nodes[0].get_text(strip=True),
                    "headline": text_nodes[1].get_text(strip=True),
                    "profile_url": link,
                }
                people.append(record)
                seen_urls.add(link)
                added += 1
        except Exception as e:
            # A malformed card is reported and skipped; the scan continues.
            print("error parsing card:", e)

    return added

def get_scroll_container(driver, timeout=10):
    """Return the dialog's scrollable element, or None if it cannot be found.

    Waits up to ``timeout`` seconds for the element matching the hard-coded
    CSS selector to be present. Any exception raised during the lookup is
    reported with a printed message and results in None.
    """
    try:
        locator = (By.CSS_SELECTOR, "#root > dialog > div > div:nth-child(2)")
        waiter = WebDriverWait(driver, timeout)
        return waiter.until(EC.presence_of_element_located(locator))
    except Exception:
        print("   scroll container not found.")
    return None

def scroll_and_extract_profiles(driver, pause_range=(1.2, 2.0), streak_limit=5, scrolls_per_loop=10, scroll_factor=2):
    """Alternate between harvesting profile cards and scrolling the dialog.

    Each round first calls ``extract_new_people``, then scrolls the container
    returned by ``get_scroll_container`` ``scrolls_per_loop`` times, and
    finally sleeps for a random duration drawn from ``pause_range``. The loop
    ends after ``streak_limit`` consecutive rounds that found nothing new.

    Args:
        driver: Selenium driver showing the suggestions dialog.
        pause_range: (low, high) seconds for the random pause between rounds.
        streak_limit: consecutive empty rounds tolerated before stopping.
        scrolls_per_loop: scroll steps issued per round.
        scroll_factor: multiple of the container's clientHeight per step.

    Returns:
        A DataFrame built from the module-level ``people`` list.
    """
    from selenium.webdriver.common.action_chains import ActionChains

    print("\nstarting scroll + extract...\n")
    empty_rounds = 0
    round_no = 1

    while True:
        print(f"loop {round_no}")

        found = extract_new_people(driver)
        print(f"   new: {found} | total: {len(seen_urls)}")

        if found != 0:
            empty_rounds = 0
        else:
            empty_rounds += 1
            if empty_rounds >= streak_limit:
                print("   nothing new for a while — stopping.")
                break

        pane = get_scroll_container(driver)

        if not pane or not pane.size['height'] > 0:
            print("   scroll container missing or has height = 0")
        else:
            try:
                # Hover over the container before scrolling it.
                ActionChains(driver).move_to_element(pane).perform()
                time.sleep(0.3)

                for step in range(1, scrolls_per_loop + 1):
                    driver.execute_script("""
                        let container = arguments[0];
                        container.scrollTop += container.clientHeight * arguments[1];
                    """, pane, scroll_factor)

                    print(f"   aggressive scroll {step}/{scrolls_per_loop} triggered")
                    time.sleep(0.05)
            except Exception as e:
                print("   failed to scroll:", e)

        time.sleep(random.uniform(*pause_range))
        round_no += 1

    df = pd.DataFrame(people)
    print(f"\ncompleted with {len(df)} total profiles\n")
    return df


## **Saving non duplicates to db**

In [5]:
import os
import sqlite3
import pandas as pd

# Log in, open the target "People you may know" list and scrape it.
driver = load_linkedin_session()
success = wait_and_open_exeter_tab(driver)

if success:
    print("Ready to extract or interact with profiles.")
    df_new = scroll_and_extract_profiles(driver)
else:
    print("Target tab not found after retries.")
    df_new = pd.DataFrame()

# --- SETUP SQLITE DB ---
db_path = "linkedin_profiles.db"
conn = sqlite3.connect(db_path)
# try/finally guarantees the connection is closed even if a statement below
# raises; previously an error left it open.
try:
    cursor = conn.cursor()

    # Create table if it doesn't exist
    cursor.execute("""
CREATE TABLE IF NOT EXISTS profiles (
    profile_id INTEGER PRIMARY KEY AUTOINCREMENT,
    profile_url TEXT UNIQUE,
    name TEXT,
    headline TEXT,
    location TEXT,
    connections TEXT,
    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
    conn.commit()

    # --- INSERT NEW PROFILES ---
    if not df_new.empty:
        new_rows = 0
        for _, row in df_new.iterrows():
            url = row.get("profile_url")
            name = row.get("name")
            headline = row.get("headline")
            # The scraper only produces name/headline/profile_url, so these
            # two are None (stored as NULL) unless df_new has extra columns.
            location = row.get("location")
            connections = row.get("connections")

            try:
                cursor.execute("""
                INSERT INTO profiles (profile_url, name, headline, location, connections)
                VALUES (?, ?, ?, ?, ?)
            """, (url, name, headline, location, connections))
                new_rows += 1
            except sqlite3.IntegrityError:
                # URL already exists (duplicate) - rejected by the UNIQUE constraint
                continue

        conn.commit()
        print(f"Inserted {new_rows} new profiles into the database.")
    else:
        print("No profiles scraped to insert.")
finally:
    conn.close()


InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=133.0.6943.53)
Stacktrace:
#0 0x614fcd97314a <unknown>
#1 0x614fcd410b80 <unknown>
#2 0x614fcd3f672e <unknown>
#3 0x614fcd41f189 <unknown>
#4 0x614fcd490839 <unknown>
#5 0x614fcd4adad9 <unknown>
#6 0x614fcd487ca3 <unknown>
#7 0x614fcd453f08 <unknown>
#8 0x614fcd455071 <unknown>
#9 0x614fcd93cb5b <unknown>
#10 0x614fcd940ae2 <unknown>
#11 0x614fcd928967 <unknown>
#12 0x614fcd9416d4 <unknown>
#13 0x614fcd90cc7f <unknown>
#14 0x614fcd961cd8 <unknown>
#15 0x614fcd961ea9 <unknown>
#16 0x614fcd971fc6 <unknown>
#17 0x76579f69caa4 <unknown>
#18 0x76579f729c3c <unknown>


### **Retrieving HTML**

- Retrieve HTML to cache
- Parse for vital information
- From this information create hypotheses on who is most likely to accept a connection
- Create numerous categories based on hypotheses
- Draw a random sample from each group 30 each
- Mark when we send invitations, timestamp and binary sent / not sent
- Wait for some period for responses
- After some time, mark invites as accepted / rejected 
- Study differences in groups, response times acceptance rates, etc

- train ML model to predict most likely to accept:
- create an integrated pipeline that logs in to LinkedIn, scrapes from PYMK (People You May Know), stores to db, retrieves html, parses for key info, predicts acceptance likelihood, and sends connection requests to the top 20 every weekday (LinkedIn upper limit)
 

In [104]:
from bs4 import BeautifulSoup
import os
import time
import random
from selenium.webdriver.common.by import By

def extract_and_save_profile_sections(driver, url, filename, output_dir="html_cache"):
    """Visit a profile and cache a reduced HTML file with its key sections.

    Loads ``url``, scrolls to the bottom three times to trigger lazy loading,
    then extracts the about, experience, education and skills sections plus
    the connection-count text. The sections that were found are joined with
    blank lines and written to ``output_dir/filename``.

    Args:
        driver: Selenium driver with a logged-in session.
        url: profile URL to open.
        filename: name of the file to write inside ``output_dir``.
        output_dir: cache directory, created if missing. The default keeps
            the original hard-coded "html_cache".

    Returns:
        None. Nothing is written when no section is found. Any exception is
        caught and printed, so one bad profile does not stop a batch run.
    """
    try:
        driver.get(url)
        time.sleep(4 + random.uniform(1.5, 2.5))

        #Scroll to bottom to trigger lazy load
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(1.5, 2.5))

        soup = BeautifulSoup(driver.page_source, "html.parser")

        sections = {}

        # NOTE(review): the long random-looking class names in the fallback
        # selectors below look auto-generated and may stop matching - confirm.

        # --- ABOUT section ---
        about_section = soup.select_one("section.pv-about-section") or soup.select_one("div.display-flex.ph5.pv3")
        if about_section:
            sections['about'] = str(about_section)

        # --- EXPERIENCE section (with fallback) ---
        experience_section = soup.find("section", id=lambda x: x and "experience" in x.lower()) or \
                             soup.select_one("section.pv-profile-section.experience-section") or \
                             soup.select_one("div[data-view-name='profile-component-entity']")
        if experience_section:
            sections['experience'] = str(experience_section)

        # --- EDUCATION section (with partial ID logic) ---
        education_section = soup.find("section", id=lambda x: x and "education" in x.lower()) or \
                            soup.select_one("section.education-section") or \
                            soup.select_one("section:nth-child(5) div.QFUNNBSyURYZDijoTTsGvpXOtaExgDrYZis")
        if education_section:
            sections['education'] = str(education_section)

        # --- SKILLS section ---
        skills_section = soup.find("section", id=lambda x: x and "skills" in x.lower()) or \
                         soup.select_one("section.pv-skill-categories-section") or \
                         soup.select_one("div[data-view-name='profile-component-entity'] div.QFUNNBSyURYZDijoTTsGvpXOtaExgDrYZis")
        if skills_section:
            sections['skills'] = str(skills_section)

        # --- CONNECTION COUNT ---
        # Check every bold span, not just the first one: the first match under
        # #profile-content is not necessarily the connection count.
        for bold_span in soup.select("#profile-content span.t-bold"):
            connections_text = bold_span.get_text(strip=True)
            if "connection" in connections_text.lower():
                sections['connections'] = f"<div class='connections'>{connections_text}</div>"
                break

        if not sections:
            print(f"No key sections found in {url}")
            return

        # Save combined content
        combined_html = "\n\n".join(sections.values())
        os.makedirs(output_dir, exist_ok=True)

        with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
            f.write(combined_html)

        print(f"Saved reduced HTML: {filename}")

    except Exception as e:
        print(f"Error scraping {url}: {e}")


In [None]:
# Cache a reduced HTML file for every profile that does not have one yet.
html_folder = "html_cache"
os.makedirs(html_folder, exist_ok=True)
driver = load_linkedin_session()

# NOTE(review): `df` is not defined in the visible cells; presumably it is
# loaded from the profiles table (it needs `profile_url` and `profile_id`
# columns) - confirm it exists before running this cell.
for _, row in df.iterrows():
    url = row["profile_url"]
    profile_id = row["profile_id"]
    filename = f"profile_{profile_id}.html"
    filepath = os.path.join(html_folder, filename)

    if not os.path.exists(filepath):
        extract_and_save_profile_sections(driver, url, filename)
        # Random pause between profile visits.
        time.sleep(random.uniform(2, 4))
    else:
        # An existing cache file means the profile was fetched on an earlier run.
        print(f">>> Skipping profile {profile_id} — already saved.")


Logged in using li_at cookie.
cookies saved to linkedin_cookies.pkl
>>> Skipping profile 157 — already saved.
>>> Skipping profile 433 — already saved.
>>> Skipping profile 235 — already saved.
>>> Skipping profile 393 — already saved.
>>> Skipping profile 966 — already saved.
>>> Skipping profile 491 — already saved.
>>> Skipping profile 516 — already saved.
>>> Skipping profile 330 — already saved.
>>> Skipping profile 846 — already saved.
>>> Skipping profile 141 — already saved.
>>> Skipping profile 526 — already saved.
>>> Skipping profile 510 — already saved.
>>> Skipping profile 836 — already saved.
>>> Skipping profile 121 — already saved.
>>> Skipping profile 254 — already saved.
>>> Skipping profile 951 — already saved.
>>> Skipping profile 648 — already saved.
>>> Skipping profile 260 — already saved.
>>> Skipping profile 683 — already saved.
>>> Skipping profile 956 — already saved.
>>> Skipping profile 545 — already saved.
>>> Skipping profile 935 — already saved.
>>> Skip