# Media Feed Query
This notebook queries your LinkedIn feed for posts containing specific keywords. 

## 1. Setup

Install the required libraries by running the following command in your terminal:

```bash
pip install -r requirements.txt
```

## 2. Imports

In [1]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
load_dotenv()

## 3. Configuration

**Important:** For security reasons, do not hardcode your LinkedIn session cookie in the notebook; provide it through the `LINKEDIN_SESSION_COOKIE` environment variable instead. Its value is the `li_at` cookie of a logged-in LinkedIn browser session.

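You can set the environment variable in your terminal like this, or put the same `LINKEDIN_SESSION_COOKIE=...` assignment in a `.env` file (the imports cell calls `load_dotenv()`):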

```bash
export LINKEDIN_SESSION_COOKIE='your_cookie_value'
```

In [None]:
# Session cookie value, taken from the environment so it never lives in the notebook
LINKEDIN_SESSION_COOKIE = os.getenv('LINKEDIN_SESSION_COOKIE')

# A post is kept when its text mentions at least one of these keywords
KEYWORDS = [
    'Geospatial',
    'Earth Observation',
    'Autonomous vehicles',
    'Remote Sensing',
    'GeospatialAI',
]

# How many times the feed is scrolled to the bottom to load more posts
SCROLLS = 5

None


## 4. Initialize WebDriver

In [None]:
# Browser locations default to the Chromium paths this notebook was written for.
# Set CHROME_BINARY / CHROMEDRIVER_PATH in the environment (or .env file) to run
# the notebook on a machine where Chromium lives somewhere else.
CHROME_BINARY = os.environ.get('CHROME_BINARY', '/usr/bin/chromium-browser')
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', '/usr/lib/chromium-browser/chromedriver')

chrome_options = Options()
chrome_options.binary_location = CHROME_BINARY
chrome_options.add_argument("--headless")  # run without opening a browser window
# NOTE(review): the next two flags are commonly needed when Chromium runs inside
# a container or CI machine; confirm they are still required for your setup.
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service(CHROMEDRIVER_PATH)

# Starts the browser; the session stays open until driver.quit() is called.
driver = webdriver.Chrome(service=service, options=chrome_options)

## 5. Login to LinkedIn

In [None]:
def login_to_linkedin(driver, cookie, wait_seconds=5):
    """Authenticate the browser session with an existing LinkedIn session cookie.

    Opens the LinkedIn home page first (the browser must be on the site before
    a cookie can be added for it), stores ``cookie`` as the ``li_at`` cookie and
    reloads the page so the next request is sent with that cookie.

    This function does not verify that the login actually succeeded.

    Args:
        driver: Selenium WebDriver, or any object exposing ``get``,
            ``add_cookie`` and ``refresh``.
        cookie: Value of the ``li_at`` session cookie.
        wait_seconds: Seconds to pause after the reload so the feed can load.
            Defaults to 5; values <= 0 skip the pause.

    Raises:
        ValueError: If ``cookie`` is empty or ``None``.
    """
    if not cookie:
        # Fail early with a clear message instead of sending an unusable cookie
        # to the browser and scraping a logged-out page.
        raise ValueError('A non-empty LinkedIn session cookie (li_at) is required.')

    driver.get('https://www.linkedin.com/')
    driver.add_cookie({'name': 'li_at', 'value': cookie})
    driver.refresh()
    if wait_seconds > 0:
        time.sleep(wait_seconds)  # Wait for the feed to load

## 6. Scrape the Feed

In [None]:
def scrape_feed(driver, keywords, scrolls, scroll_pause=3):
    """Scroll the currently loaded feed and collect posts mentioning any keyword.

    The page is scrolled to the bottom ``scrolls`` times, then the rendered HTML
    is parsed once and every post container is checked with a case-insensitive
    substring match against ``keywords``.

    Args:
        driver: Selenium WebDriver already showing the feed (must expose
            ``execute_script`` and ``page_source``).
        keywords: Iterable of strings; a post is kept if its text contains at
            least one of them, ignoring case.
        scrolls: Number of times to scroll to the bottom of the page.
        scroll_pause: Seconds to pause after each scroll so new posts can load.
            Defaults to 3; values <= 0 skip the pause.

    Returns:
        pandas.DataFrame with the columns ``author``, ``content`` and
        ``raw_html``, one row per matching post. The columns are present even
        when no post matches, in which case the frame is empty.
    """
    for _ in range(scrolls):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        if scroll_pause > 0:
            time.sleep(scroll_pause)  # Wait for new posts to load

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Note: These selectors might change over time. You may need to inspect the page and update them.
    posts = soup.find_all('div', {'class': 'feed-shared-update-v2'})

    # Lower-case the keywords once instead of once per post.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    posts_data = []
    for post in posts:
        post_text = post.get_text().lower()
        if not any(keyword in post_text for keyword in lowered_keywords):
            continue

        author_tag = post.find('span', {'class': 'feed-shared-actor__name'})
        content_tag = post.find('div', {'class': 'feed-shared-update-v2__description-wrapper'})
        if author_tag is None or content_tag is None:
            # Skip posts that don't have the expected structure
            continue

        posts_data.append({
            'author': author_tag.get_text().strip(),
            'content': content_tag.get_text().strip(),
            'raw_html': str(post),
        })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(posts_data, columns=['author', 'content', 'raw_html'])

## 7. Main Execution

In [None]:
# Without a session cookie there is nothing to scrape, so only print a hint.
if not LINKEDIN_SESSION_COOKIE:
    print('Please set your LINKEDIN_SESSION_COOKIE environment variable.')
else:
    login_to_linkedin(driver, LINKEDIN_SESSION_COOKIE)
    df = scrape_feed(driver, KEYWORDS, SCROLLS)

    if df.empty:
        print('No posts found matching your keywords.')
    else:
        print(f'Found {len(df)} posts matching your keywords.')
        # Persist the matches as CSV (row index omitted), then preview the first rows
        df.to_csv('linkedin_posts.csv', index=False)
        print('Data saved to linkedin_posts.csv')
        print(df.head())

## 8. Close the WebDriver

In [None]:
# Shut down the WebDriver session created in section 4 once scraping is finished.
driver.quit()