In [66]:
import os
import pandas as pd
import numpy as np
import csv
import snscrape.modules.twitter as sntwitter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime as dt
import time
from pathlib import Path

In [67]:
print(Path.cwd())

/Users/yhacoupian/git_repos/project_2


In [68]:
def init():
    """Sets the initial parameters for the application

    Parameters
    ----------
        None: This function does not have any input parameters.

    Returns
    ----------
        None: This function does not return any value.

   """
    # Set the path for the data collection directory
    global data_dir
    data_dir = Path.cwd() / 'Data'
    
    # Create the directory if it does not exist
    try:
        os.makedirs(data_dir)    
        print("Directory " , data_dir ,  " Created ")
    except FileExistsError:
        print("Directory " , data_dir ,  " already exists")  
    
    # Max number of tweets to be scrapped per day
    global maxTweets
    maxTweets = 500
    
    # Setup the sentiment dataframe 
    column_names = ["date", "neg", "neu", "pos", "compound"]
    df = pd.DataFrame(columns = column_names)
    
    # Set sentiment file path
    global sentiment_path
    sentiment_path = Path.joinpath(data_dir,  'sentiment.csv') 
    
    # Create the CSV file with proper columns
    df.to_csv(sentiment_path, index=False)

In [69]:
def write_sentiment(date, neg, neu, pos, compound):
    """Writes the sentiment analysis results into the sentiment.csv file

    Parameters
    ----------
        date: This is the date corresponding to the day the sentiments belong to.
        neg: This is the value for the negative sentiment. 
        neu: This is the value for the neutral sentiment.
        pos: This is the value for the positive sentiment.
        compound: This is the compounded value.

    Returns
    ----------
        None: This function does not return any value.

   """
    try:
        # Sentiment_df = pd.read_csv('covid/sentiment.csv')
        sentiment_df = pd.read_csv(sentiment_path)
        # Create a new row 
        new_row = {'date':date, 'neg':neg, 'neu':neu, 'pos':pos, 'compound':compound}
        
        # Add the new row to the dataframe 
        sentiment_df.loc[len(sentiment_df)]= new_row
    
        # Update the csv file
        sentiment_df.to_csv(sentiment_path, index= False)
    except Exception as err:
        print(f"Error: '{err}'")

In [70]:
def scrap_per_day(keyword, start_date, end_date):
    """Uses the snscrape to go through tweets from the start_date to the end_date looking for the keyword. 

    Parameters
    ----------
        keyword: This is the phrase to look for in the tweets.
        start_date: The start date. 
        end_date: The end date.

    Returns
    ----------
        None: This function does not return any value.

   """
    
    # Set the path
    unique_file_name = keyword +'-sentiment-' + end_date + '.csv'
    daily_result_path = Path.joinpath(data_dir, unique_file_name)
    
    #Open/create a file to append data to
    csvFile = open(daily_result_path, 'a', newline='', encoding='utf8')
    
    #Use csv writer
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['id','date','tweet',])
    
    # set the batch number
    batch = 1000

    print(f'Each batch is equal to {batch} tweet(s).')
    print("Scrapping...")
    try:
        count = 1
        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(keyword + ' lang:en since:' +  start_date + ' until:' + end_date + ' -filter:links -filter:replies').get_items()):
            if i > maxTweets :
                break
            csvWriter.writerow([tweet.id, tweet.date, tweet.content])
        
            # Print a message for every 1000 tweets scrapped. 
            if i % batch == 0:
                print('batch =', end=f' {count} | ')
                count+=1
            
        csvFile.close()
    except Exception as err:
        print(f"Error: '{err}'")

In [71]:
def anlayze_vader(keyword, end_date):
    
    print("Analyzing...")
    analyzer = SentimentIntensityAnalyzer()
    
    # Construct the path 
    unique_file_name = keyword +'-sentiment-' + end_date + '.csv'
    daily_result_path = Path.joinpath(data_dir , unique_file_name)
    
    # Reading the CSV file 
    df = pd.read_csv(daily_result_path, parse_dates=True, index_col=0)

    # Creating sentiment scores columns
    df['compound'] = [analyzer.polarity_scores(x)['compound'] for x in df['tweet']]
    df['neg'] = [analyzer.polarity_scores(x)['neg'] for x in df['tweet']]
    df['neu'] = [analyzer.polarity_scores(x)['neu'] for x in df['tweet']]
    df['pos'] = [analyzer.polarity_scores(x)['pos'] for x in df['tweet']]

    # Taking averages of sentiment score columns
    avg_compound = np.average(df['compound'])
    avg_neg = np.average(df['neg']) * -1  # Change neg value to negative number for clarity
    avg_neu = np.average(df['neu'])
    avg_pos = np.average(df['pos'])

    # Counting number of tweets
    count = len(df.index)

    # Write to the sentiment csv file
    write_sentiment(end_date, avg_neg, avg_neu, avg_pos, avg_compound)
    
    print("Done!")
    print('-'*20)
    # Print Statements
    print("For the given period there has been", count ,  "tweets on " + keyword, end='\n*')
    print("Positive Sentiment:", '%.2f' % avg_pos, end='\n*')
    print("Neutral Sentiment:", '%.2f' % avg_neu, end='\n*')
    print("Negative Sentiment:", '%.2f' % avg_neg, end='\n*')
    print("Compound Sentiment:", '%.2f' % avg_compound, end='\n')

### Scrap and Analyze using VADER

In [72]:
init()

Directory  /Users/yhacoupian/git_repos/project_2/Data  Created 


In [73]:
# set the start date
start_date = dt.date(2021, 3, 1)

# set the end date
end_date = dt.date(2021, 4, 1)

# set the delta - currently set to one day
delta = dt.timedelta(days=1)
# specify the keword
word = 'covid'

# go through the days to scrap and analyze each day
while start_date < end_date:
    print(start_date, end=' : ')
    scrap_per_day(word, start_date.strftime('%Y-%m-%d'), (start_date + delta).strftime('%Y-%m-%d'))
    anlayze_vader(word, (start_date + delta).strftime('%Y-%m-%d'))
    start_date += delta 
    print('-'*20)

2021-03-01 : Each batch is equal to 1000 tweet(s).
Scrapping...
batch = 1 | Analyzing...
Done!
--------------------
For the given period there has been 501 tweets on covid
*Positive Sentiment: 0.10
*Neutral Sentiment: 0.81
*Negative Sentiment: -0.09
*Compound Sentiment: 0.03
--------------------
2021-03-02 : Each batch is equal to 1000 tweet(s).
Scrapping...
batch = 1 | Analyzing...
Done!
--------------------
For the given period there has been 501 tweets on covid
*Positive Sentiment: 0.09
*Neutral Sentiment: 0.80
*Negative Sentiment: -0.11
*Compound Sentiment: -0.07
--------------------
2021-03-03 : Each batch is equal to 1000 tweet(s).
Scrapping...
batch = 1 | Analyzing...
Done!
--------------------
For the given period there has been 501 tweets on covid
*Positive Sentiment: 0.09
*Neutral Sentiment: 0.81
*Negative Sentiment: -0.10
*Compound Sentiment: -0.02
--------------------
2021-03-04 : Each batch is equal to 1000 tweet(s).
Scrapping...
batch = 1 | Analyzing...
Done!
------------