# In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import json
import time

# Function to initialize Selenium WebDriver
def init_driver(headless=True):
    """Create a Chrome WebDriver instance.

    Args:
        headless: When true, Chrome is started without a visible window.

    Returns:
        A new ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = Options()
    if headless:
        # The flag was previously accepted but never applied, so a window
        # was always opened regardless of the argument.
        chrome_options.add_argument("--headless=new")
    return webdriver.Chrome(options=chrome_options)

# Function to scroll the page by smaller increments and capture tweets at each step
def scroll_and_scrape(driver, max_scrolls=20, scroll_pause_time=0.5, scroll_step=300):
    """Scroll the current page in small steps and collect tweets at each step.

    The page source is parsed once before scrolling, once after every scroll
    step, and once more after jumping back to the top of the page.

    Args:
        driver: WebDriver whose current page is scraped.
        max_scrolls: Number of downward scroll steps to perform.
        scroll_pause_time: Seconds to sleep after each scroll step.
        scroll_step: Pixels to scroll down per step.

    Returns:
        A list of ``{"text": ..., "timestamp": ...}`` dicts, one per unique
        (text, timestamp) pair, in the order they were first seen.
    """
    tweets_data = []
    seen_tweets = set()

    def scrape_tweets_from_html(html):
        """Parse *html* and record every tweet that has not been seen yet."""
        soup = BeautifulSoup(html, 'html.parser')

        for tweet_div in soup.find_all('article'):
            tweet_text_tag = tweet_div.find(['div', 'span'], {'data-testid': 'tweetText'})
            time_tag = tweet_div.find('time')

            tweet_text = tweet_text_tag.get_text(separator=" ", strip=True) if tweet_text_tag else 'No Text Found'
            # .get() avoids a KeyError when a <time> element has no datetime attribute.
            tweet_timestamp = (
                time_tag.get('datetime', 'No Timestamp Found') if time_tag else 'No Timestamp Found'
            )

            if not tweet_text:
                continue

            # Key on text AND timestamp: keying on text alone collapsed every
            # text-less tweet into a single 'No Text Found' entry and dropped
            # distinct tweets that happen to share the same wording.
            tweet_key = (tweet_text, tweet_timestamp)
            if tweet_key in seen_tweets:
                continue

            seen_tweets.add(tweet_key)
            tweets_data.append({
                "text": tweet_text,
                "timestamp": tweet_timestamp
            })

    scrape_tweets_from_html(driver.page_source)

    # Scroll down the page in small increments and scrape on each scroll
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_step)
        time.sleep(scroll_pause_time)
        scrape_tweets_from_html(driver.page_source)

    # Scroll back up to capture any missed tweets
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(2)  # Wait a bit after scrolling up
    scrape_tweets_from_html(driver.page_source)

    return tweets_data

# Function to scrape tweets from a given X.com URL
def scrape_x_article(url, headless=True):
    """Open *url* in a fresh browser and scrape the tweets it shows.

    Args:
        url: Page address to load.
        headless: Forwarded to ``init_driver``.

    Returns:
        A dict with keys ``"source"``, ``"url"`` and ``"tweets"``, or ``None``
        if no tweet text element appeared within 20 seconds.
    """
    driver = init_driver(headless=headless)
    try:
        driver.get(url)

        try:
            # Wait until at least one tweet is loaded
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//div[@data-testid='tweetText']"))
            )
        except TimeoutException:
            print(f"Timeout while waiting for tweets to load on {url}")
            return None

        # Scroll and capture tweets at each scroll step
        tweets_data = scroll_and_scrape(driver)
    finally:
        # Always release the browser, including when loading or scraping
        # raises; previously such an error left the Chrome process running.
        driver.quit()

    print(f"Total unique tweets collected: {len(tweets_data)}")

    return {
        "source": "X.com",
        "url": url,
        "tweets": tweets_data
    }

# List of X.com URLs
x_urls = [
    "https://x.com/CNN",
    "https://x.com/TIME",
    "https://x.com/elonmusk",
    "https://x.com/BarackObama",
    "https://x.com/justinbieber",
]

# Scrape every profile with a visible browser window; profiles that yield
# no result are left out of the collected list.
all_x_data = []
for url in x_urls:
    print(f"Scraping URL: {url}")
    x_data = scrape_x_article(url, headless=False)
    if not x_data:
        continue
    all_x_data.append(x_data)

# Dump each profile's result as pretty-printed JSON, keeping non-ASCII text readable.
for data in all_x_data:
    print(json.dumps(data, indent=4, ensure_ascii=False))

# NOTE(review): save_data_to_json is not defined in the visible part of this
# file; presumably it comes from another notebook cell. Confirm it is in scope.
save_data_to_json(all_x_data, 'x_data.json')