## 1. Import packages

In [1]:
import os
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
import spacy
import tweepy as tw

from nltk.corpus import stopwords
from tqdm import tqdm, notebook

# Stop words: make sure the NLTK corpus is available locally, then load the
# English list used later when cleaning descriptions and tweet text.
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Lift every pandas display limit so frames are shown in full.
for _display_option in ('display.max_columns', 'display.max_rows',
                        'display.max_colwidth', 'display.width'):
    pd.set_option(_display_option, None)

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elois\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Twitter API authentication

In [3]:
# Credentials are read from the environment so that real keys never have to be
# saved inside the notebook. The placeholder strings are kept as a fallback for
# readers who prefer to paste their own values here.
# Enter your API key (or set the TWITTER_API_KEY environment variable)
consumer_api_key = os.environ.get("TWITTER_API_KEY", "Enter your api key")
# Enter your API secret (or set the TWITTER_API_SECRET environment variable)
consumer_api_secret = os.environ.get("TWITTER_API_SECRET", "Enter you api secret")

In [4]:
# Build the OAuth handler from the consumer key and consumer secret.
# Only the application-level key pair is supplied; no user access token is set here.
auth = tw.OAuthHandler(consumer_api_key, consumer_api_secret)

In [5]:
# Create the API client used for every request below.
# wait_on_rate_limit=True is what produces the "Rate limit reached. Sleeping for"
# pauses seen in the long-running cells instead of an error.
api = tw.API(auth, wait_on_rate_limit=True)

## 3. Tweets query

### 3.1. Define the query

In [6]:
# Search query: tweets tagged #covid19; "-filter:retweets" is presumably the
# Twitter search operator that excludes retweets (confirm against the API docs).
search_words = "#covid19 -filter:retweets" 
# Lower date bound (YYYY-MM-DD) used by the search call below.
date_since = "2020-03-01"

In [7]:
# Collect tweets
# `search_tweets` has no `since` request parameter (only `until`, `since_id`
# and `max_id` bound the results), so a `since=` keyword has no filtering
# effect. The lower date bound is therefore expressed with the `since:` query
# operator. Note that the standard search endpoint only reaches back a few
# days regardless of this bound.
# 12452 is the maximum number of tweets pulled from the cursor in one run.
tweets = tw.Cursor(api.search_tweets,
                   q=f"{search_words} since:{date_since}",
                   lang="en").items(12452)

### 3.2. Retrieve the tweets

In [None]:
# Drain the cursor into a list so the tweets can be counted and iterated again
# later; tqdm reports progress while the items are pulled.
tweets_copy = [status for status in tqdm(tweets)]

In [9]:
# Report how many tweets were actually collected from the cursor.
print(f"New tweets retrieved: {len(tweets_copy)}")

New tweets retrieved: 12452


## 4. Populate the dataset

In [10]:
# Build one plain dict per tweet and create the DataFrame in a single call.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and `DataFrame.append` no longer exists in pandas >= 2.0.
tweet_records = []
for tweet in tqdm(tweets_copy):
    # Hashtags attached to the tweet; left empty when the entities are missing
    # or malformed.
    try:
        hashtags = [hashtag["text"] for hashtag in tweet.entities["hashtags"]]
    except (AttributeError, KeyError, TypeError):
        hashtags = []

    # Fetch the untruncated text with get_status. If that request fails (for
    # example the tweet was deleted in the meantime), fall back to whatever
    # text the search result already carries instead of silently reusing the
    # text of the previous tweet. The loop is kept best-effort on purpose:
    # one failing tweet must not abort a multi-hour collection run.
    try:
        text = api.get_status(id=tweet.id, tweet_mode='extended').full_text
    except Exception:
        text = getattr(tweet, "full_text", None) or getattr(tweet, "text", None)

    tweet_records.append({'user_name': tweet.user.name,
                          'user_location': tweet.user.location,
                          'user_description': tweet.user.description,
                          'user_created': tweet.user.created_at,
                          'user_followers': tweet.user.followers_count,
                          'user_friends': tweet.user.friends_count,
                          'user_favourites': tweet.user.favourites_count,
                          'user_verified': tweet.user.verified,
                          'date': tweet.created_at,
                          'text': text,
                          # A list of hashtag strings, or None when the tweet has none
                          'hashtags': hashtags if hashtags else None,
                          'source': tweet.source,
                          'is_retweet': tweet.retweeted})

tweets_df = pd.DataFrame(tweet_records)

  7%|▋         | 900/12452 [04:12<53:14,  3.62it/s]  Rate limit reached. Sleeping for: 942
 14%|█▍        | 1800/12452 [24:59<1:12:25,  2.45it/s]  Rate limit reached. Sleeping for: 891
 22%|██▏       | 2700/12452 [46:38<40:41,  3.99it/s]     Rate limit reached. Sleeping for: 789
 29%|██▉       | 3600/12452 [1:04:37<52:52,  2.79it/s]   Rate limit reached. Sleeping for: 906
 36%|███▌      | 4500/12452 [1:24:05<36:17,  3.65it/s]     Rate limit reached. Sleeping for: 934
 43%|████▎     | 5400/12452 [1:44:13<30:17,  3.88it/s]     Rate limit reached. Sleeping for: 922
 51%|█████     | 6300/12452 [2:04:10<30:12,  3.40it/s]     Rate limit reached. Sleeping for: 922
 58%|█████▊    | 7200/12452 [2:24:08<22:42,  3.85it/s]     Rate limit reached. Sleeping for: 921
 65%|██████▌   | 8100/12452 [2:44:40<21:00,  3.45it/s]     Rate limit reached. Sleeping for: 885
 72%|███████▏  | 9000/12452 [3:03:14<15:10,  3.79it/s]     Rate limit reached. Sleeping for: 967
 80%|███████▉  | 9900/12452 [3:24:18<13:13,

In [11]:
# Display the first 5 rows of the newly collected tweets
tweets_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,Andrew Vanderbilt Carr,"Newport, Rhode Island USA",Patriotic Progressive Capitalist. Fully-vaxed. Pro 1st amendment! Anti-woke/Anti-cancel/Anti-wokester. In desperate need of a 6 month vacation...twice a year,2020-07-15 14:56:33+00:00,562,369,6788,False,2022-01-27 06:56:38+00:00,"@davidcicilline Its TIME to end the ridiculous testing of U.S. citizens returning home from abroad, or make it 5 days so they can test neg. A fully vaxxed &amp; boostered person will miss flights, lose a job, &amp; many $$'s just getting a or false positive. #COVID19 #testing #endemic",,Twitter for iPhone,False
0,Argus News,Bhubaneswar,Argus News(https://t.co/79dFkqIXbC) from Digital to Satellite has emerged as a platform that Voices 4.5 crore Odias.,2020-12-05 12:30:31+00:00,14330,160,8,False,2022-01-27 06:56:28+00:00,"It seems the current wave of #Covid19 infection has touched the pick in Odisha. Everyday 5000-7000 fresh Corona cases are surfacing in the State. Despite the downward trend all should continue to take precautionary measures: Niranjan Mishra, Public Health Director https://t.co/ttQkLVXHez",[Covid19],TweetDeck,False
0,Magilla Gorilla 39,Mr. Peeble's Pet Shop,"I temporarily live at Mr. Peeble's Pet Shop, but I am hoping to get a permanent home soon. I'm a focused gorilla detective. Tick, Tock Manitowoc. Tick, Tock.",2020-02-04 19:21:29+00:00,1467,262,7355,False,2022-01-27 06:56:18+00:00,"(421) In South Africa, #COVID19 Deaths returned to near peak levels, two weeks after the initial peak seen during the South African #Omicron recovery.\n\nAlso, an abrupt down and then back up branch of the Deaths curve suggests a period of partial reporting. https://t.co/zs9HNbsEv5",[COVID19],Twitter Web App,False
0,Pratidin Time,"Guwahati, India","is a 24-hour Assamese news channel, and is a media product of Yash TV Entertainment Pvt. Ltd.",2011-12-16 07:39:41+00:00,149261,231,73,False,2022-01-27 06:56:05+00:00,"#ASSAM | Congress MP From Nagaon, Pradyut Bordoloi, Tests Positive For Covid-19 \n#Nagaon #Congress #MP #COVID19 \nhttps://t.co/PNkMAWsqjy \n@pradyutbordoloi","[ASSAM, Nagaon, Congress, MP, COVID19]",Twitter Web App,False
0,Covid-19 Bot,,Tweeting stats about Covid-19. A @joe_scotto product.,2020-03-11 20:27:58+00:00,90,1,2,False,2022-01-27 06:56:02+00:00,"Cases: 362,523,525 (+56,029) \nDeaths: 5,626,879 (+0) \nRecovered: 0 (+0) \nActive Cases: 362,523,525 (100%)\nCompleted Cases: 5,626,879 (1.55%)\nMortality Rate: 1.55% (No Change)\nCase Fatality Rate: 100.00% (No Change)\n\n#COVID19 #Coronavirus #StayHome",,Covid-19 Bot,False


## 5. Save the final data

### 5.1. Read the past data

In [13]:
# Load the tweets saved by previous runs. No date parsing is requested, so the
# date columns come back as plain strings.
tweets_old_df = pd.read_csv("covid19_tweets.csv")
print(f"Shape of past tweets: {tweets_old_df.shape}")

Shape of past tweets: (179108, 13)


### 5.2. Merge the past and new data

In [14]:
# Stack the past tweets and the new batch row-wise. Original index labels are
# kept (ignore_index is not set), so labels may repeat in the merged frame.
tweets_all_df = pd.concat([tweets_old_df, tweets_df], axis=0)
print(f"Number of new tweets: {tweets_df.shape[0]} Number of past tweets: {tweets_old_df.shape[0]} Number of all tweets: {tweets_all_df.shape[0]}")

Number of new tweets: 12452 Number of past tweets: 179108 Number of all tweets: 191560


### 5.3. Drop duplicates

In [15]:
# A tweet is a duplicate when user_name, date and text all match an earlier row.
# Dates loaded from the CSV are strings while freshly collected ones are
# datetime objects, so the date is compared in its string form; otherwise a
# tweet already stored in the CSV would never match its re-downloaded copy.
dedup_keys = tweets_all_df[["user_name", "date", "text"]].assign(
    date=tweets_all_df["date"].astype(str)
)
# A positional boolean mask is used because index labels may repeat after the
# concat. The first occurrence of each tweet is kept, as drop_duplicates does
# by default.
tweets_all_df = tweets_all_df.loc[~dedup_keys.duplicated().to_numpy()].copy()
print(f"Shape of the final data with all tweets: {tweets_all_df.shape}")

Shape of the final data with all tweets: (191560, 13)


### 5.4. Export the updated data

In [16]:
# Write the merged tweets back to the same CSV the past data was read from
# (this is the raw text, saved before any cleaning); the index is not written.
tweets_all_df.to_csv("covid19_tweets.csv", index=False)

## 6. Processing the final data

### 6.1. Functions to process text

In [17]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Once punctuation is stripped, a URL such as "https://t.co/abc" has collapsed
# into a single token ("httpstcoabc"); this pattern removes that token.
_LINK_PATTERN = re.compile(r'http[A-Za-z0-9.]+')


def first_clean(row):
    """ first_clean function

        Normalises one cell value: converts it to a string, lower-cases it,
        turns line breaks into spaces, strips ASCII punctuation and removes
        links.

        Input:
            -- row: value of a dataframe cell (any type; it is converted with
               str(), so a missing value becomes the string 'nan' and None
               becomes 'none')

        Output:
            -- row: cleaned, lower-cased string

        Notes:
            -- Line breaks are replaced by a space rather than deleted, so
               that words separated only by a newline are not glued together.
            -- The former split(' - ') step was dropped: '-' is removed with
               the rest of the punctuation before it ran, so it could never
               match and had no effect on the result.
    """
    # Transforms type of row into a string and lower-cases it
    text = str(row).lower()

    # Keep words on different lines separated
    text = text.replace('\n', ' ')

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove link (must run after punctuation removal, see _LINK_PATTERN)
    return _LINK_PATTERN.sub('', text)

In [18]:
# Character class of everything remove_emoji strips. Compiled once and shared
# by every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U00002702-\U000027B0"  # (same range repeated, kept as in the source pattern)
    "\U000024C2-\U0001F251"  # very wide range: also covers CJK and other scripts
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"                 # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(row):
    """ remove_emoji function

        Converts the value to a string and deletes every run of characters
        matched by _EMOJI_PATTERN (emoji, pictographs, symbols and the other
        ranges listed there).

        Input:
            --row: value of a dataframe cell (converted with str(), so a
              missing value becomes the string 'nan')

        Output:
            --row: the string without the matched characters
    """
    return _EMOJI_PATTERN.sub(r'', str(row))

### 6.2. Process column user_name

In [23]:
# Strip emoji from user names. remove_emoji converts every value with str(),
# so missing names become the string 'nan' at this point.
tweets_all_df["user_name"]=tweets_all_df["user_name"].apply(remove_emoji)

In [24]:
# Convert string missing values into real missing values: the str() conversion
# done while cleaning left the literal 'nan' where the name was missing.
tweets_all_df["user_name"]=tweets_all_df["user_name"].apply(lambda x: np.nan if x == 'nan' else x)

### 6.3. Process column user_location

In [25]:
# Lower-case the location and strip punctuation and links (first_clean).
tweets_all_df["user_location"]=tweets_all_df["user_location"].apply(first_clean)

In [26]:
# Strip emoji and the other symbol ranges handled by remove_emoji.
tweets_all_df["user_location"]=tweets_all_df["user_location"].apply(remove_emoji)

In [27]:
# Turn the literal 'nan' left by the str() conversion back into a real missing value.
tweets_all_df["user_location"]=tweets_all_df["user_location"].apply(lambda x: np.nan if x == 'nan' else x)

### 6.4. Process column user_description

In [28]:
# Lower-case the description and strip punctuation and links (first_clean).
tweets_all_df["user_description"]=tweets_all_df["user_description"].apply(first_clean)

In [29]:
# Strip emoji and the other symbol ranges handled by remove_emoji.
tweets_all_df["user_description"]=tweets_all_df["user_description"].apply(remove_emoji)

In [30]:
# Remove stopwords. Membership is tested against a set so each word costs one
# hash lookup instead of a scan through the whole stop-word list. Splitting and
# re-joining also collapses any run of whitespace into a single space.
stop_words_set = set(stop_words)
tweets_all_df['user_description'] = tweets_all_df['user_description'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words_set))

In [31]:
# Turn the literal 'nan' left by the str() conversion back into a real missing value.
tweets_all_df["user_description"]=tweets_all_df["user_description"].apply(lambda x: np.nan if x == 'nan' else x)

### 6.5. Process column text

In [32]:
# Lower-case the tweet text and strip punctuation and links (first_clean).
tweets_all_df["text"]=tweets_all_df["text"].apply(first_clean)

In [33]:
# Strip emoji and the other symbol ranges handled by remove_emoji.
tweets_all_df["text"]=tweets_all_df["text"].apply(remove_emoji)

In [34]:
# Remove stopwords. Membership is tested against a set so each word costs one
# hash lookup instead of a scan through the whole stop-word list. Splitting and
# re-joining also collapses any run of whitespace into a single space.
stop_words_set = set(stop_words)
tweets_all_df['text'] = tweets_all_df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words_set))

In [35]:
# Turn the literal 'nan' left by the str() conversion back into a real missing value.
tweets_all_df["text"]=tweets_all_df["text"].apply(lambda x: np.nan if x == 'nan' else x)

### 6.6. Process column hashtags

In [36]:
# Clean the hashtags with first_clean. Values are converted with str() first, so
# a list of tags ends up as lower-case words separated by spaces once the
# brackets, quotes and commas are stripped as punctuation.
tweets_all_df["hashtags"]=tweets_all_df["hashtags"].apply(first_clean)

In [37]:
# Convert string missing values into real missing values. first_clean turns a
# NaN (rows loaded from the CSV) into 'nan' and a None (newly collected tweets
# without hashtags) into 'none'; both mean "no hashtags".
tweets_all_df["hashtags"]=tweets_all_df["hashtags"].apply(lambda x: np.nan if x in ('nan', 'none') else x)

### 6.7. Export processed data

In [3]:
# Preview the first rows of the processed tweets
tweets_all_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥլϮ,astroworld,wednesday addams disney princess keepin ̲̲̲̅̅̅ιοο̲̲̲̅̅̅,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,smelled scent hand sanitizers today someone past would think intoxicated that…,,Twitter for iPhone,False
1,Tom Basile,new york ny,husband father columnist commentator author tough sell fighting media war iraq bush admin alum newsmax contributor fmr exec dir nysgop,2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,hey yankees yankeespr mlb wouldnt made sense players pay respects a…,,Twitter for Android,False
2,Time4fisticuffs,pewee valley ky,christian catholic conservative reagan republican capitalist sports lover bbn cincinnati reds bengals trump2020,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,diane3443 wdunlap realdonaldtrump trump never claimed covid19 hoax claim effort to…,covid19,Twitter for Android,False
3,ethel mertz,stuck in the middle,browns indians clevelandproud cavs resist,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,brookbanktv one gift covid19 give appreciation simple things always around me…,covid19,Twitter for iPhone,False
4,DIPR-J&K,jammu and kashmir,official twitter handle department information public relations govt jammu kashmir,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 july media bulletin novel coronavirusupdates covid19 kansalrohit69 drsyedsehrish airnewsalerts ani…,coronavirusupdates covid19,Twitter for Android,False


In [38]:
# Save the processed tweets to their own file (a different name from the raw
# CSV written earlier); the index is not written.
tweets_all_df.to_csv("covid19_tweets_treat.csv", index=False)