In [65]:
#######################################################################
## Web-crawler 1.0
## Created by:  Coenraad F. Mulder
## Date:        01-12-2021
## Purpose:     Webscrape Twitter feed for specific hashtags, 
##              then supplement
##              with additional information from the Twitter API
#######################################################################
# Libraries
#######################################################################
import requests                                             # For getting URLS from the Web
import pandas as pd                                         # DataFrame functionality
import snscrape.modules.twitter as sntwitter                # Python Scraper Library for Twitter feed
from datetime import datetime, timedelta                    # Date and time manipulation

# Webcrawler | Task 2
This assignment uses the 'snscrape' library (created by JustAnotherArchivist), a scraper for social networking services. The primary focus of this assignment is the Twitter platform (https://www.twitter.com), specifically the hype that is created on this platform when a new variant of the coronavirus is identified. The latest variant, Omicron, was identified less than two weeks ago and has caused a surge in social media commentary from experts and amateurs alike. This raises the question: what issues are people discussing around Omicron, and is the prevalent sentiment around it positive or negative? Is this variant as infectious and deadly as its predecessor, or is its impact just amplified through unsolicited social commentary fueled by fear and uncertainty? To investigate these issues, this web crawler was created to scrape a specified number of tweets, spanning a specified number of days, and covering English-language tweets during this period that match a specified keyword. For instance, this assignment uses the Twitter Web Crawler to scrape Twitter for tweets from the 14 days leading up to the 5th of December 2021, limiting the result to 1000 tweets and using the keyword search term 'Omicron' (refer to the Hyper-parameters for the Scraper code segment below).

In [69]:
#######################################################################
# Hyper-parameters for the Scraper
#######################################################################
# These four values are the only inputs of the notebook; the final cell
# passes them to the TwitterWebCrawler constructor.

# End of the search date range, as a string in the format yyyy-mm-dd
date_to = '2021-12-05'
# Number of days to go back from date_to to obtain the start of the range
number_of_days = 14
# Maximum number of tweets to retrieve
tweet_limit = 1000
# Term to search Twitter with
search_term = 'omicron'

In [71]:
#######################################################################
# Twitter Web Crawler class
#######################################################################
### INPUT
#----------------------------------------------------------------------
#   search_term    : Term to search Twitter with
#   search_limit   : Maximum number of tweets to return  
#   search_to_date : End date for date range in format yyyy-mm-dd 
#   number_of_days : Number of days to go back in history  
#
### OUTPUT
#----------------------------------------------------------------------
#   tweets.xlsx    : Excel spreadsheet containing:
#                       - Date
#                       - Tweet Id
#                       - Tweet Text
#                       - Username of Tweeter
#                       - Tweet Length
#######################################################################
class TwitterWebCrawler():
    """Scrape tweets matching a search term and store them in an Excel file.

    The crawler builds a Twitter search query of the form
    '<term> since:<from> until:<to> lang:en', iterates over the results
    returned by snscrape's TwitterSearchScraper and keeps, for at most
    `search_limit` tweets, the date, id, text, username and text length.

    Attributes:
        search_term:  Full query string sent to the scraper.
        search_limit: Maximum number of tweets to keep.
        output_file:  Path of the Excel file written by save_data_to_file().
        tweets_df:    DataFrame holding the tweets of the most recent run().
    """

    # Column layout shared by the results dataframe and the spreadsheet
    _COLUMNS = ['Date', 'Tweet Id', 'Tweet Text', 'Username', 'Tweet Length']

    ###################################################################
    # CONSTRUCTOR
    ###################################################################
    def __init__(self, search_term, search_limit, search_to_date, number_of_days,
                 output_file='tweets.xlsx'):
        """Validate the inputs and build the search query.

        Args:
            search_term:    Term to search Twitter with.
            search_limit:   Maximum number of tweets to return.
            search_to_date: End date of the range, string in format yyyy-mm-dd.
            number_of_days: Number of days to go back from search_to_date.
            output_file:    Excel file to write; defaults to 'tweets.xlsx'.

        Raises:
            ValueError: If search_to_date is not a valid yyyy-mm-dd date or
                number_of_days cannot be used as a day offset.
        """
        dt_to = self.validate_date(input_date=search_to_date)
        if dt_to is None:
            raise ValueError("Invalid search_to_date supplied! Value must be in format yyyy-mm-dd.")

        # calc_from_date signals failure with None; check it here so the
        # caller gets a meaningful error instead of an AttributeError
        dt_from = self.calc_from_date(dt_to, number_of_days)
        if dt_from is None:
            raise ValueError("Invalid number_of_days supplied! Value must be a number of days to go back.")

        date_from = dt_from.strftime("%Y-%m-%d")
        date_to = dt_to.strftime("%Y-%m-%d")

        # Set the search terms for twitter specifying from and to dates
        # Only return English Tweets
        # NOTE(review): Twitter's 'until:' operator is presumably exclusive
        # of the given day - confirm if the end date itself must be covered
        self.search_term = search_term + ' since:' + date_from + ' until:' + date_to + ' lang:en'
        self.search_limit = search_limit
        self.output_file = output_file
        self.tweets_df = pd.DataFrame(columns=self._COLUMNS)

    ###################################################################
    # Validate Date
    ###################################################################
    def validate_date(self, input_date):
        """Parse a date string in the format 'yyyy-mm-dd'.

        Args:
            input_date: Date string to parse.

        Returns:
            The parsed datetime, or None when input_date is not a string
            in the expected format.
        """
        try:
            # Attempt to construct a date from the input_date
            return datetime.strptime(input_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            # ValueError: wrong format; TypeError: not a string (e.g. None)
            return None

    ###################################################################
    # Calculate Start Date
    ###################################################################
    def calc_from_date(self, input_datetime, number_of_days):
        """Return input_datetime moved back by number_of_days.

        Args:
            input_datetime: datetime marking the end of the range.
            number_of_days: Number of days to subtract.

        Returns:
            The resulting datetime, or None when the offset cannot be
            applied (wrong types or a result outside the datetime range).
        """
        try:
            return input_datetime - timedelta(days=number_of_days)
        except (ValueError, TypeError, OverflowError):
            return None

    ###################################################################
    # Main execution thread for Twitter Web Crawler
    ###################################################################
    def run(self):
        """Scrape the tweets, store them in tweets_df and save them to Excel.

        Each call replaces tweets_df with the tweets of that call, so the
        method can be re-run without duplicating rows. Errors raised by the
        scraper or while saving propagate unchanged to the caller.
        """
        print(datetime.now(), "Extracting tweets using search term: ", self.search_term)

        # Using TwitterSearchScraper to scrape data; get_items() is iterated lazily below
        twitter_items = sntwitter.TwitterSearchScraper(self.search_term).get_items()

        print(datetime.now(), "Processing tweets...")
        # Collect plain rows first and build the dataframe once, instead of
        # growing the dataframe row by row
        rows = []
        for idx, tweet in enumerate(twitter_items):
            # Only retrieve records up to search_limit of tweets
            if idx >= self.search_limit:
                break

            tweet_length = len(str(tweet.content))
            rows.append([tweet.date, tweet.id, tweet.content, tweet.username, tweet_length])

        self.tweets_df = pd.DataFrame(rows, columns=self._COLUMNS)
        self.save_data_to_file()

    ###################################################################
    # Save dataframe to Excel
    ###################################################################
    def save_data_to_file(self):
        """Write tweets_df to the Excel file named by output_file.

        The 'Date' column is reduced in place to a plain calendar date
        before writing, which removes both the timezone and the time of day.
        """
        # Dates are stored as DateTime with Timezone in the dataframe - convert
        # them to plain dates before storing to Excel
        self.tweets_df['Date'] = self.tweets_df['Date'].apply(lambda a: pd.to_datetime(a).date())

        # Save the dataframe content to an Excel file (as it is possible for
        # the tweets to contain commas, which would break a CSV format)
        # NOTE(review): tweet ids are large integers; Excel may round them
        # when stored as numbers - consider writing the id as text. Confirm.
        self.tweets_df.to_excel(self.output_file, index=False, header=True)
        print(datetime.now(), "Tweet results successfully saved to the file '{}'".format(self.output_file))

In [73]:
if __name__ == '__main__':
    # Build the crawler from the hyper-parameters and run it. Any failure is
    # reported as a single timestamped line instead of a traceback.
    try:
        # Arguments follow the constructor order:
        # search_term, search_limit, search_to_date, number_of_days
        mycrawler = TwitterWebCrawler(search_term, tweet_limit, date_to, number_of_days)
        mycrawler.run()
    except Exception as error:
        print(datetime.now(), "ERROR OCCURRED!", error)

2021-12-05 12:46:08.175087 Extracting tweets using search term:  omicron since:2021-11-21 until:2021-12-05 lang:en
2021-12-05 12:46:08.175409 Processing tweets...
2021-12-05 12:46:41.960004 Tweet results successfully saved to the file 'tweets.xlsx'
