In [58]:
import tweepy
import pandas as pd
import numpy as np
import preprocessor as p
import nltk
from datetime import datetime, date, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer

log = pd.read_csv("./config/login.csv")
consumerKey         = log['consumerKey'][0]
consumerSecret      = log['consumerSecret'][0]
accessToken         = log['accessToken'][0]
accessTokenSecret   = log['accessTokenSecret'][0]

auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)
print("----------------access to tweeter api done")

----------------access to tweeter api done


In [59]:
# Generate list of dates (7 days window) based on today's date
print("Get the assigned data from tweeter")
list_of_dates = []
today = date.today()
for i in range(-7,1):
    target_date = (today + timedelta(days=i)).strftime("%Y-%m-%d")
    list_of_dates.append(target_date)

list_of_dicts = []
search_term = 'covid19 covid vaccine'
num_tweets = 100

for end_date in list_of_dates:
    start_date = (datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)).strftime(
        "%Y-%m-%d")  # Create 1-day windows for extraction
    tweet_count = len(list_of_dicts)

    for tweet in tweepy.Cursor(api.search_tweets,
                               q=f'{search_term} since:{start_date} until:{end_date}',
                               lang='en',
                               count=num_tweets,
                               tweet_mode='extended').items(num_tweets):
        if (not tweet.retweeted) and ('RT @' not in tweet.full_text):
            if tweet.lang == "en":
                tweet_dict = {}
                # nltk.download('vader_lexicon')
                # sid = SentimentIntensityAnalyzer()
                # score = sid.polarity_scores(tweet.full_text)
                # comp = score['compound']
                # if comp >= 0.05:
                #     tweet_dict['tar'] = 1
                # elif (comp >-0.05) and (comp <0.05):
                #     tweet_dict['tar'] = 0
                # elif comp <= -0.05:
                #     tweet_dict['tar'] = -1

                tweet_dict['username'] = tweet.user.name
                tweet_dict['location'] = tweet.user.location
                tweet_dict['text'] = tweet.full_text.lower()
                # tweet_dict['fav_count'] = tweet.favorite_count
                tweet_dict['hashtags'] = tweet.entities['hashtags']
                tweet_dict['tweet_date'] = tweet.created_at
                list_of_dicts.append(tweet_dict)
                tweet_count += 1

    print(f'Completed extraction for {start_date} to {end_date},extracted tweet count = {tweet_count}.')

tweets_df = pd.DataFrame(list_of_dicts)
tweets_df.sort_values(by='tweet_date').reset_index(drop=True)

# Setup function to extract hashtags text from the raw hashtag dictionaries
def extract_hashtags(hashtag_list):
    s = ""  # Create empty string
    if not hashtag_list:  # If list is empty, return empty string
        return s
    else:
        for dictionary in hashtag_list:
            s += str(dictionary['text'].lower() + ',')  # Create string (lowercase) for each hashtag text
        s = s[:-1]  # Drop last character ','
        return s

# Extract hashtags
tweets_df['hashtags_extracted'] = tweets_df['hashtags'].apply(lambda x: extract_hashtags(x))
tweets_df.drop(columns='hashtags', inplace=True)
print("----------------loading the data done")

Get the assigned data from tweeter
Completed extraction for 2022-04-11 to 2022-04-12,extracted tweet count = 22.
Completed extraction for 2022-04-12 to 2022-04-13,extracted tweet count = 48.
Completed extraction for 2022-04-13 to 2022-04-14,extracted tweet count = 76.
Completed extraction for 2022-04-14 to 2022-04-15,extracted tweet count = 111.
Completed extraction for 2022-04-15 to 2022-04-16,extracted tweet count = 114.
Completed extraction for 2022-04-16 to 2022-04-17,extracted tweet count = 124.
Completed extraction for 2022-04-17 to 2022-04-18,extracted tweet count = 145.
Completed extraction for 2022-04-18 to 2022-04-19,extracted tweet count = 159.
----------------loading the data done


In [60]:
# Clean tweet text with tweet-preprocessor
tweets_df['text_cleaned'] = tweets_df['text'].apply(lambda x: p.clean(x))
# Remove duplicate tweets
tweets_df.drop_duplicates(subset='text_cleaned', keep="first", inplace=True)
# Remove unnecessary characters
# Note: Need to remove % as Stanford CoreNLP annotation encounters error if text contains some of these characters
punct = ['%','.',',', '/', ':', '\\', '&amp;', '&', ';',"\'"]

def remove_punctuations(text):
    for punctuation in punct:
        text = text.replace(punctuation, '')
    return text

tweets_df['text_cleaned'] = tweets_df['text_cleaned'].apply(lambda x: remove_punctuations(x))

# Drop tweets which have empty text field
tweets_df['text_cleaned'].replace('', np.nan, inplace=True)
tweets_df['text_cleaned'].replace(' ', np.nan, inplace=True)
tweets_df.dropna(subset=['text_cleaned'], inplace=True)

tweets_df = tweets_df.reset_index(drop=True)
print("----------------cleaning the data done")
print(tweets_df)

----------------cleaning the data done
                                           username  \
0                                        ABNewswire   
1                                           IARS360   
2                                       Uncle Louie   
3    Elisha #NoVaccineMandates #NoVaccinePassports💉   
4                                    AngelsWakingUp   
..                                              ...   
141                                   Karl Harrison   
142                   Maricopa County Public Health   
143                               Chicagoland DPOCC   
144                                  Alex Martiniuk   
145                  KFF (Kaiser Family Foundation)   

                              location  \
0                                        
1                                        
2                        New York, USA   
3                         Carlsbad, CA   
4                                        
..                                 ...   
141   

In [61]:
#Write data to local file
print("writting the data into local file.....ing")
# Create timestamp for datetime of extraction
extract_datetime = datetime.today().strftime('%Y%m%d_%H%M%S')

# Create csv filename
filename = 'data/covid_vaccine_tweets_extracted_' + extract_datetime + '.csv'

# Drop duplicates (if any)
tweets_df.drop_duplicates(inplace=True)

# Export dataframe as csv file with above filename
tweets_df.to_csv(filename, index=False)
print("writting the data into local file.....done")

writting the data into local file.....ing
writting the data into local file.....done
