## Analyzing Twitter via Website Scraping and Tweet Data Collection (Data Collection)

#### Import the libraries and functions necessary for the project

In [None]:
# This function allows us to input the username and password without echoing on the screen
from getpass import getpass

# This function allows us to suspend the execution for the number of seconds we specified
from time import sleep

# These packages provide a series of tools used to initialize the browser, handle the error, get the response from the website, etc.
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from msedge.selenium_tools import Edge, EdgeOptions

# Pandas is used to export the scraped data as a CSV file
import pandas as pd

#### Define a function that is used to scrape the Tweet data

In [None]:
def get_tweet_data(individual_tweet):
    """Extract the displayed fields of a single Tweet element.

    Parameters
    ----------
    individual_tweet
        The element representing one Tweet. Only its ``find_element_by_xpath``
        method is used.

    Returns
    -------
    tuple or None
        ``(username, handle, postdate, text, reply_count, retweet_count, like_count)``.
        The three counts are the strings shown on the page, or ``'0'`` when the
        page shows no number. ``None`` is returned when any of the elements
        cannot be located, so the caller can skip the Tweet.
    """
    # XPaths of the three counters, in the order they appear in the returned tuple
    counter_xpaths = (
        './/div[@data-testid="reply"]',
        './/div[@data-testid="retweet"]',
        './/div[@data-testid="like"]',
    )

    # Every lookup shares the same failure handling: if an element is missing,
    # the Tweet is treated as unreadable and nothing is returned. The counters are
    # looked up inside the same guard so a missing counter no longer raises.
    try:
        username = individual_tweet.find_element_by_xpath('.//span').text
        handle = individual_tweet.find_element_by_xpath('.//span[contains(text(), "@")]').text
        postdate = individual_tweet.find_element_by_xpath('.//time').get_attribute('datetime')
        text = individual_tweet.find_element_by_xpath('.//div[2]/div[2]/div[1]').text

        # An empty counter text means nobody replied/retweeted/liked, so report '0'
        counts = [
            individual_tweet.find_element_by_xpath(xpath).text or '0'
            for xpath in counter_xpaths
        ]
    except NoSuchElementException:
        return None

    return (username, handle, postdate, text) + tuple(counts)

#### Inspect the HTML elements, identify the correct pattern, and input the necessary information to get ready for scraping

In [None]:
# Input the username and password of a Twitter account. getpass is used for both so that neither is echoed on the screen
user = getpass('Phone, email, or username: ')
user_password = getpass('Password: ')

# Initialize the Microsoft Edge browser
options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)

# Explicit wait that polls for an element for up to 10 seconds. This replaces the fixed one-second pauses before element
# lookups, which fail with NoSuchElementException whenever the page takes longer than one second to load
wait = WebDriverWait(driver, 10)

# Navigate to Twitter's login page and maximize the browser window. Maximizing is essential because Twitter adjusts the features available based on the size of the window
driver.get('https://www.twitter.com/login')
driver.maximize_window()

# Locate the login inputs (waiting until the form has been rendered) and enter the username and password we defined previously
username = wait.until(lambda d: d.find_element_by_xpath('//input[@name="session[username_or_email]"]'))
username.send_keys(user)
password = driver.find_element_by_xpath('//input[@name="session[password]"]')
password.send_keys(user_password)

# Submit the login form
password.send_keys(Keys.RETURN)

# Wait for the search box to appear after login, then input the search keywords and submit the query.
# The query is written as adjacent string literals so that no source-code indentation ends up inside the search text
search_key = wait.until(lambda d: d.find_element_by_xpath('//input[@aria-label="Search query"]'))
search_key.send_keys(
    'vaccine (COVID OR COVID19 OR COVID-19 OR coronavirus OR corona) lang:en '
    'until:2020-10-20 since:2019-12-01 -filter:replies')
search_key.send_keys(Keys.RETURN)

# Wait for the 'Latest' tab to appear on the results page and click it to get the latest Tweets.
# Suspend execution for one second afterwards to give the results time to load
wait.until(lambda d: d.find_element_by_link_text('Latest')).click()
sleep(1)

In [None]:
# Create an empty list and a set to store the Tweet data and Tweet ids separately.
data = []
tweet_ids = set()
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True

# Collection stops once at least this many Tweets have been gathered. To collect more Tweets, simply change it to a larger number
max_tweets = 50

while scrolling:
    individual_tweets = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')

    # Earlier Tweets stay on the page as we scroll down, so only the last 15 elements are checked to boost the processing speed.
    # To avoid collecting the same Tweet twice, each Tweet gets an id built by joining its fields; ids already seen are ignored
    for individual_tweet in individual_tweets[-15:]:
        tweet = get_tweet_data(individual_tweet)

        # get_tweet_data returns None when it cannot read a Tweet; skip it, otherwise ''.join(None) raises a TypeError
        if tweet is None:
            continue

        tweet_id = ''.join(tweet)
        if tweet_id not in tweet_ids:
            tweet_ids.add(tweet_id)
            data.append(tweet)

    scroll_tried = 0
    while True:

        # Ask the web driver to scroll down to the bottom of the page. Then, we suspend for 1.5 seconds to allow the website to load the new content
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1.5)

        # Assign the current web page location to current_position variable
        current_position = driver.execute_script("return window.pageYOffset;")

        # If the current position is the same as the previous position, either we have reached the end of the content or the web page is not reacting.
        # We allow three tries: after the third unchanged position we stop scrolling altogether; before that we wait another 1.5 seconds and scroll again
        if last_position == current_position:
            scroll_tried = scroll_tried + 1
            if scroll_tried >= 3:
                scrolling = False
                break
            else:
                sleep(1.5)

        # If the position changed, the web page is loading new content properly. Remember the new position so that we can keep tracking it
        else:
            last_position = current_position
            break

    # Limit the number of Tweets we collect
    if len(data) >= max_tweets:
        break

In [None]:
# Put the data collected into a data frame, assign the proper column names, and export the data as a CSV file.
# The column names follow the order of the tuples built by get_tweet_data:
# (username, handle, postdate, text, reply_count, retweet_count, like_count),
# so 'Retweets' must come before 'Likes' or the two counts end up under each other's label
df = pd.DataFrame(data, columns=['UserName', 'Handle', 'Timestamp', 'Text', 'Comments', 'Retweets', 'Likes'])
df.to_csv(r'C:\Users\34527\Desktop\tweet.csv', index=False)