In [96]:
! pip install twitter_scraper_selenium --quiet
! pip install selenium pandas webdriver-manager --quiet

In [97]:
from twitter_scraper_selenium import scrape_keyword

from multiprocessing import Pool

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException

from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd
import time
import asyncio
import json
import os

In [98]:
def login_twitter(driver, username, password):
    driver.get("https://twitter.com/login")
    time.sleep(2)  # Wait for the login page to load
    
    username_input = driver.find_element(By.NAME, "text")
    username_input.send_keys(username)
    username_input.send_keys(Keys.RETURN)
    time.sleep(2)  # Wait for the next page to load
    
    password_input = driver.find_element(By.NAME, "password")
    password_input.send_keys(password)
    password_input.send_keys(Keys.RETURN)
    time.sleep(5)  # Wait for login to complete

In [100]:
def scrape_keyword(headless, keyword, browser, tweets_count, filename, output_format, since):
    tweets = []
    browser.get(f"https://twitter.com/search?q={keyword}%20since%3A{since}&src=typed_query")
    time.sleep(3)

    # Scroll multiple times to load more tweets
    for _ in range(15):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)

    # Get tweet elements
    tweet_elements = browser.find_elements(By.XPATH, '//article')

    print(f"Total tweet elements found: {len(tweet_elements)}")  # Debugging output

    for tweet in tweet_elements[:tweets_count]:
        try:
            # Skip if it's a reply (check for 'Replying to' indicator)
            is_reply = tweet.find_elements(By.XPATH, './/div[contains(text(), "Replying to")]')
            if is_reply:
                print("Skipping a reply tweet.")  # Debugging
                continue  # Skip this tweet if it's a reply
            
            username = tweet.find_element(By.XPATH, './/span[contains(text(), "@")]').text
            tweet_text = tweet.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
            
            try:
                like_button = tweet.find_element(By.XPATH, './/button[@data-testid="like"]')
                likes = like_button.get_attribute("aria-label").split()[0]  # Extract number from label
            except NoSuchElementException:
                likes = "0"

            try:
                retweet_button = tweet.find_element(By.XPATH, './/button[@data-testid="retweet"]')
                retweets = retweet_button.get_attribute("aria-label").split()[0]  # Extract number from label
            except NoSuchElementException:
                retweets = "0"

            try:
                reply_button = tweet.find_element(By.XPATH, './/button[@data-testid="reply"]')
                replies = reply_button.get_attribute("aria-label").split()[0]  # Extract number from label
            except NoSuchElementException:
                replies = "0"

            try:
                views_span = tweet.find_element(By.XPATH, './/span[contains(text(), "Views")]')
                views = views_span.text.split()[0]  # Extract the number from text
            except NoSuchElementException:
                views = "0"

            date = tweet.find_element(By.XPATH, './/time').get_attribute('datetime')
            hashtags = [hashtag.text for hashtag in tweet.find_elements(By.XPATH, './/a[contains(@href,"/hashtag/")]')]
            media_urls = [img.get_attribute('src') for img in tweet.find_elements(By.XPATH, './/img')]
            mentions = [mention.text for mention in tweet.find_elements(By.XPATH, './/a[contains(@href, "/")]')]
            language = tweet.get_attribute('lang')

            tweets.append({
                'username': username,
                'tweet': tweet_text,
                'replies': replies,
                'likes': likes,
                'retweets': retweets,
                'views': views,
                'date': date,
                'hashtags': hashtags,
                'media_urls': media_urls,
                'mentions': mentions,
                'language': language,
            })
        except Exception as e:
            print(f"Error extracting tweet data: {e}")

    print(f"Number of tweets scraped: {len(tweets)}")

    # Save to CSV if there are tweets
    if output_format == "csv" and tweets:
        pd.DataFrame(tweets).to_csv(filename + '.csv', index=False)

    return tweets

In [101]:
def scrape_profile_tweets(username: str, twitter_username: str, twitter_password: str):
    kword = "from:" + username
    path = './users/' + username
    file_path = path + '.csv'
    
    # Create the directory if it does not exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    
    login_twitter(driver, twitter_username, twitter_password)

    # Scrape tweets
    scrape_keyword(
        headless=False,         # Runs scraping in headless mode (no GUI)
        keyword=kword,          # Search keyword, e.g., 'from:elonmusk'
        browser=driver,         # Uses logged-in Chrome browser for scraping
        tweets_count=10000,     # Number of tweets to scrape
        filename=path,          # File path to save the output
        output_format="csv",    # Saves data in CSV format
        since="2020-01-01",     # Retrieves tweets since 2023
    )

    # Read the saved CSV and return data
    try:
        data = pd.read_csv(file_path)
        data = json.loads(data.to_json(orient='records'))
    except pd.errors.EmptyDataError:
        print("No data found in the CSV file.")
        data = []

    # Close the browser
    driver.quit()

    return data

In [103]:
tweets_data = scrape_profile_tweets('elonmusk', 'toutouxihuannii', 'lingraisin31839')

Total tweet elements found: 29
Number of tweets scraped: 29
