In [None]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [1]:
# Import packages
import requests
import re
import os
import json
import pandas as pd
import csv
import datetime
import dateutil.parser
import unicodedata
import time
import base64
import preprocessor as p
from time import sleep

## 1. Get Tweets

In [2]:
# Setting bearer token as an environment variable
# os.environ['TOKEN'] = "<BEARER_TOKEN>"
# os.environ['API_KEY'] = "<API_KEY>"
# os.environ['API_SECRET_KEY'] = "<API_SECRET_KEY>"

In [3]:
# Defining getTweets function
def _env_or_fallback(var_name, label, fallback):
    """Return os.environ[var_name]; print a warning and return `fallback` when unset."""
    value = os.environ.get(var_name)
    if value is None:
        print(f"{label} not found under <'{var_name}'> os environment variable")
        return fallback
    return value


def _request_geo_headers(api_key, api_key_secret):
    """POST the client-credentials request and return the headers for geo lookups."""
    # Reformat the keys and encode
    key_secret = '{}:{}'.format(api_key, api_key_secret).encode('ascii')
    b64_encoded_key = base64.b64encode(key_secret).decode('ascii')

    auth_url = 'https://api.twitter.com/oauth2/token'
    auth_headers = {
        'Authorization': 'Basic {}'.format(b64_encoded_key),
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
        }
    auth_data = {
        'grant_type': 'client_credentials'
        }

    auth_resp = requests.post(auth_url, headers = auth_headers, data = auth_data)
    if auth_resp.status_code != 200:
        raise Exception(auth_resp.status_code, auth_resp.text)
    print("Location POST request successfully retrieved: ", auth_resp.status_code)
    return {'Authorization': 'Bearer {}'.format(auth_resp.json()['access_token'])}


def _request_tweets_page(search_url, headers, query_params):
    """GET one page of search results and return the decoded JSON payload."""
    response = requests.request("GET", search_url, headers=headers, params=query_params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


def _attach_place_names(page_tweets, geo_headers):
    """Add a 'place' key to every tweet, resolved from its geo place_id ('' if none)."""
    for tweet in page_tweets:
        # 'geo' may be absent, or present without a place_id
        place_id = (tweet.get('geo') or {}).get('place_id')
        if not place_id:
            tweet['place'] = ''
            continue
        geo_url = f'https://api.twitter.com/1.1/geo/id/{place_id}.json'
        geo_resp = requests.get(geo_url, headers = geo_headers)
        if geo_resp.status_code != 200:
            # Same error style as the other requests in this function
            raise Exception(geo_resp.status_code, geo_resp.text)
        tweet['place'] = geo_resp.json().get('full_name', '')


def _store_page(payload, page, tweets, get_location, geo_headers):
    """Append the tweets found in `payload` to `tweets` and print a progress line."""
    # A page without results carries no 'data' key at all
    page_tweets = payload.get('data') or []
    if not page_tweets:
        print(f"No data retrieved out of page {page}.")
        return
    if get_location:
        _attach_place_names(page_tweets, geo_headers)
    tweets += page_tweets
    earliest = tweets[-1].get('created_at', '')[0:10]
    print(f"Request successful. Total retrieved tweets: {len(page_tweets)} on page {page}. Earliest tweet date: {earliest}")


def getTweets(os_tokens = False,
              bearer_token = None,
              api_key = None,
              api_key_secret = None,
              query = None,
              start_time = None,
              end_time = None,
              max_results = 100,
              author = None,
              get_location = False,
              paginate = False,
              pages = 2,
              next_token = None):
    '''
    Authentication and get requests to retrieve Tweets under Twitter Search All V2.0
    Academic Research API.

    Arguments:
    os_tokens -- boolean. Whether to look for tokens on os environment variables
    ['TOKEN'] and, when get_location is True, ['API_KEY'] and ['API_SECRET_KEY'].
    A variable that is not set prints a warning and the matching argument
    below is used instead. Default = False.
    bearer_token -- string. Bearer token. Default = None.
    api_key -- string. API Key. Only used when get_location is True. Default = None.
    api_key_secret -- string. API Key Secret. Only used when get_location is True.
    Default = None.
    query -- string. Twitter query parameter. Mandatory argument. Default = None.
    start_time -- string. Start date and time for tweets search interval.
    end_time -- string. End date and time for tweets search interval.
    max_results -- int. Maximum number of tweets retrieved per pagination.
    Default = 100.
    author -- str. User id for tweets search or username without @. When defined,
    will target tweets from the specified twitter account. Default = None.
    get_location -- boolean. Whether to authenticate with the API key pair and
    resolve each tweet's place_id into a 'place' name. Default = False.
    @@@ NOTE: This method is limited by Twitter GET Request Rate limit of 300 GET
    requests per 15 minutes. Will raise an error if used on a query that
    returns over 300 tweets!! @@@
    paginate -- boolean. Whether to retrieve only first page results, or to paginate
    over the following pages (see next argument). Default = False.
    pages -- int. Maximum number of pages to paginate and retrieve results.
    Default = 2. Will not work if paginate is set to False.
    next_token -- str. Next_token to continue pagination

    Returns:
    tweets -- list of dictionaries. Tweets data retrieved from the selected period,
    according to the given query parameter. Empty when nothing matched.
    next_token -- str. Token of the page following the last retrieved one, or ''
    when there is none.

    Raises:
    ValueError -- when query is missing.
    Exception -- (status_code, response_text) when any request does not return 200.
    '''
    if not query:
        raise ValueError("query is a mandatory argument")

    tweets = []
    page = 1

    # Retrieving keys
    if os_tokens:
        bearer_token = _env_or_fallback('TOKEN', 'Bearer Token', bearer_token)
        if get_location:
            # The key pair is only needed for the location lookup
            api_key = _env_or_fallback('API_KEY', 'API Key', api_key)
            api_key_secret = _env_or_fallback('API_SECRET_KEY', 'API Secret Key', api_key_secret)

    # Creating headers
    headers = {
        'Authorization': 'Bearer {}'.format(bearer_token)
        }

    # Creating URL --Tweets GET Request
    search_url = "https://api.twitter.com/2/tweets/search/all"
    if author is None:
        query_arguments = query
    else:
        query_arguments = query + f' from:{author}'
    # Entries left as None (e.g. next_token on the first call) are dropped by
    # requests when it builds the query string.
    query_params = {'query': query_arguments,
                    'start_time': start_time,
                    'end_time': end_time,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,location,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'next_token': next_token}

    # Connecting to endpoint
    # --Locations POST Request
    geo_headers = _request_geo_headers(api_key, api_key_secret) if get_location else None

    # --Tweets GET Request
    payload = _request_tweets_page(search_url, headers, query_params)
    _store_page(payload, page, tweets, get_location, geo_headers)

    # Paginating through next_token
    if paginate:
        page += 1
        while page <= pages:
            following_token = payload.get('meta', {}).get('next_token')
            if not following_token:
                print(f"Final result retrieved. Last tweet reached according to query parameter at page {page -1}. {len(tweets)} retrieved tweets.")
                break
            # Pause between consecutive requests
            time.sleep(1)
            query_params['next_token'] = following_token
            payload = _request_tweets_page(search_url, headers, query_params)
            _store_page(payload, page, tweets, get_location, geo_headers)
            page += 1
        if page > pages:
            print(f"Final results retrieved. Last tweet reached according to maximum pages parameter at page {page -1}. {len(tweets)} retrieved tweets.")
            following_token = payload.get('meta', {}).get('next_token')
            if following_token:
                print(f"next_token: {following_token}")
            else:
                print("No next_token to retrieve")

    # Storing next_token
    next_token = payload.get('meta', {}).get('next_token', '')

    return tweets, next_token

In [1]:
# Query statement
# Age-related search terms; multi-word phrases carry their own double quotes.
_AGE_TERMS = [
    'elders', 'elder', 'elderly',
    '"senior citizen"', '"senior citizens"',
    '"senior men"', '"senior women"',
    '"senior adult"', '"senior adults"',
    'seniority', 'seniors',
    '"older adults"', '"older adult"',
    '"old men"', '"old women"',
    '"old ladies"', '"older ladies"',
    '"aging population"', '"aging people"',
    '"aging men"', '"aging women"',
    '"old age"', 'OAP',
]
# Filters shared by both query variants
_COMMON_FILTERS = 'lang:en -("elder scrolls") -is:retweet'

# Variant without any geo restriction
query_nogeo = '({}) {}'.format(' OR '.join(_AGE_TERMS), _COMMON_FILTERS)
# Variant with the geo filters appended
query = '{} {}'.format(query_nogeo, 'has:geo place_country:CA')

# Period
start_time = "2018-01-01T00:00:00.000Z"
end_time = "2022-02-25T00:00:00.000Z"

# Tweets requested per page
max_results = 500

In [5]:
# Run the search: credentials come from the environment, at most two pages,
# no author filter, starting from the first page.
tweets, next_token = getTweets(
    query = query,
    start_time = start_time,
    end_time = end_time,
    max_results = max_results,
    author = None,
    os_tokens = True,
    paginate = True,
    pages = 2,
    next_token = None,
)

Request successful. Total retrieved tweets: 500 on page 1. Earliest tweet date: 2022-02-04
Request successful. Total retrieved tweets: 498 on page 2. Earliest tweet date: 2022-01-13
Final results retrieved. Last tweet reached according to maximum pages parameter at page 2. 998 retrieved tweets.
next_token: b26v89c19zqg8o3fpe45jikrkx396wb7udo8clot6bdkt


In [6]:
# Load the retrieved tweet dictionaries into a DataFrame and preview it
tweets_df = pd.DataFrame(data = tweets)
tweets_df.head(5)

Unnamed: 0,id,author_id,source,public_metrics,in_reply_to_user_id,geo,text,created_at,lang,reply_settings,referenced_tweets,conversation_id
0,1496961461465542656,3240659214,Twitter for Android,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",1.2279890073610404e+18,{'place_id': '38d5974e82ed1a6c'},@JosephConwell7 @Reuters I live here &amp; the...,2022-02-24T21:33:19.000Z,en,everyone,"[{'type': 'replied_to', 'id': '149690238323805...",1496535338554114054
1,1496945165365219333,1469167464101515267,Twitter for iPhone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,{'place_id': '0811cf61cd9ea52f'},"Winnipeg seniors, wya?!! \nI’m doing Mini sess...",2022-02-24T20:28:33.000Z,en,everyone,,1496945165365219333
2,1496944326416510984,3355188729,Twitter for Android,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,{'place_id': '3797791ff9c0e4c6'},This is no joke. I have gone to several stores...,2022-02-24T20:25:13.000Z,en,everyone,,1496944326416510984
3,1496934029542793224,21305650,Twitter for Android,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",21536408.0,{'place_id': '3797791ff9c0e4c6'},@cllrainslie Vaccines are great for the elderl...,2022-02-24T19:44:18.000Z,en,everyone,"[{'type': 'replied_to', 'id': '149693213568451...",1496932135684513796
4,1496865157967925268,3130104741,Twitter for Android,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",274070478.0,{'place_id': '5f102b4a7cc3d42e'},@msemilyrushton @CTVNews Old men start wars yp...,2022-02-24T15:10:38.000Z,en,everyone,"[{'type': 'replied_to', 'id': '149683194677671...",1496831946776711177


## 2. Organizing tweets_df

In [7]:
def organize_tweets(df):
    """
    Select and rename the tweet fields used by the rest of the notebook.

    Arguments:
    df -- DataFrame with the columns 'created_at', 'id', 'author_id' and 'text',
    and optionally 'geo' (a dictionary per row, possibly missing).

    Returns:
    tweets -- dictionary with the keys 'date_time', 'tweet_id', 'author_id',
    'place_id' and 'tweet_text'. 'place_id' is a list with one entry per row,
    None where the row has no geo dictionary or the dictionary has no
    'place_id'; the other values are the matching columns of df.
    """
    def _place_id(geo):
        # Rows without geo data hold NaN/None instead of a dictionary
        return geo.get('place_id') if isinstance(geo, dict) else None

    # The 'geo' column is absent when no retrieved tweet carried geo data
    geo_column = df['geo'] if 'geo' in df.columns else [None] * len(df)

    tweets = {}
    tweets['date_time'] = df['created_at']
    tweets['tweet_id'] = df['id']
    tweets['author_id'] = df['author_id']
    tweets['place_id'] = [_place_id(geo) for geo in geo_column]
    tweets['tweet_text'] = df['text']

    return tweets

In [8]:
# Keep only the organized columns and preview the result
organized_columns = organize_tweets(tweets_df)
tweets = pd.DataFrame(organized_columns)
tweets.head(5)

Unnamed: 0,date_time,tweet_id,author_id,place_id,tweet_text
0,2022-02-24T21:33:19.000Z,1496961461465542656,3240659214,38d5974e82ed1a6c,@JosephConwell7 @Reuters I live here &amp; the...
1,2022-02-24T20:28:33.000Z,1496945165365219333,1469167464101515267,0811cf61cd9ea52f,"Winnipeg seniors, wya?!! \nI’m doing Mini sess..."
2,2022-02-24T20:25:13.000Z,1496944326416510984,3355188729,3797791ff9c0e4c6,This is no joke. I have gone to several stores...
3,2022-02-24T19:44:18.000Z,1496934029542793224,21305650,3797791ff9c0e4c6,@cllrainslie Vaccines are great for the elderl...
4,2022-02-24T15:10:38.000Z,1496865157967925268,3130104741,5f102b4a7cc3d42e,@msemilyrushton @CTVNews Old men start wars yp...


## 3. Cleaning tweets' texts

In [9]:
# Text cleaning - Removing URLs, mentions, etc using tweet-preprocessor package
def tweet_clean(tweet, lower_case = False, remove_digits = False,
                remove_punct = False, replace_amper = False,
                retrieve_hashtag = True):
    """
    Clean tweet with tweet-preprocessor p.clean() removing unwanted characters,
    user mentions, punctuations and setting to lower case text.

    Arguments:
    tweet -- Text string.
    lower_case -- Boolean. When True, will lower case tweets. Default = False.
    remove_digits -- Boolean. When True, will remove numbers from tweets.
    Default = False.
    remove_punct -- Boolean. When True, will remove punctuations from tweets.
    Default = False.
    replace_amper -- Boolean. When True, will replace HTML ampersand format for
    'and' stopword. Default = False.
    retrieve_hashtag -- Boolean. When True, will retrieve tweet hashtags and
    return them next to the cleaned text. Default = True.

    Returns:
    When retrieve_hashtag is True, a pandas Series [cleaned_tweet, hashtags]:
    cleaned_tweet -- Cleaned tweet text string.
    hashtags -- List with hashtags extracted from tweet
    When retrieve_hashtag is False, only the cleaned_tweet string.
    """
    # Remove user mentions, symbols and unwanted characters
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED,
                  p.OPT.EMOJI, p.OPT.SMILEY)

    tweet = p.clean(tweet)

    # Lower case
    if lower_case:
        tweet = tweet.lower()

    # Replace amper -- must run before punctuation removal, which would strip
    # the '&' and ';' and leave a stray 'amp' behind
    if replace_amper:
        tweet = re.sub(r'&amp;', 'and', tweet, flags=re.IGNORECASE)

    # Extract hashtags -- also before digit/punctuation removal, so the '#'
    # marker is still there and the tags keep their original characters
    hashtags = []
    if retrieve_hashtag:
        hashtags = re.findall(r'#(\w+)', tweet)
        tweet = tweet.replace('#', '')

    # Remove digits (regex substitution; str.replace would look for the
    # literal text '\d+')
    if remove_digits:
        tweet = re.sub(r'\d+', '', tweet)

    # Remove punctuations
    if remove_punct:
        tweet = re.sub(r'[^\w\s]', '', tweet)

    cleaned_tweet = tweet

    if retrieve_hashtag:
        return pd.Series([cleaned_tweet, hashtags])
    return cleaned_tweet

In [10]:
# Clean every tweet text, replacing HTML ampersands with 'and'
cleaned_tweets = tweets['tweet_text'].apply(
    lambda text: tweet_clean(text, replace_amper = True)
)

In [11]:
# Show the cleaning result: column 0 holds the cleaned text and column 1 the
# list of hashtags extracted from each tweet.
cleaned_tweets

Unnamed: 0,0,1
0,I live here and these far right bully boys pic...,[]
1,"Winnipeg seniors, wya?!! Im doing Mini session...","[winnipeg, collegegrad]"
2,This is no joke. I have gone to several stores...,[]
3,Vaccines are great for the elderly and / or th...,[]
4,Old men start wars ypung people fight them,[]
...,...,...
993,I'm not fearful of Covid personally ..and I'm ...,[]
994,"Intercultural Youth Lounge Date: Thursdays, Ja...",[]
995,While it is true that the vaccine does give go...,[]
996,"The Witcher S2, Episode 7: By the Elder Blood,...",[]


In [12]:
# Join the cleaned text and hashtags to the organized tweets. Only the first
# five (organized) columns of `tweets` are kept, so re-running this cell
# rebuilds the same seven-column frame instead of appending the cleaned
# columns a second time and failing on the column rename.
tweets = pd.concat([tweets.iloc[:, :5], cleaned_tweets], axis = 1)
tweets.columns = list(tweets.columns[:5]) + ['cleaned_text', 'hashtags']
tweets.head()

Unnamed: 0,date_time,tweet_id,author_id,place_id,tweet_text,cleaned_text,hashtags
0,2022-02-24T21:33:19.000Z,1496961461465542656,3240659214,38d5974e82ed1a6c,@JosephConwell7 @Reuters I live here &amp; the...,I live here and these far right bully boys pic...,[]
1,2022-02-24T20:28:33.000Z,1496945165365219333,1469167464101515267,0811cf61cd9ea52f,"Winnipeg seniors, wya?!! \nI’m doing Mini sess...","Winnipeg seniors, wya?!! Im doing Mini session...","[winnipeg, collegegrad]"
2,2022-02-24T20:25:13.000Z,1496944326416510984,3355188729,3797791ff9c0e4c6,This is no joke. I have gone to several stores...,This is no joke. I have gone to several stores...,[]
3,2022-02-24T19:44:18.000Z,1496934029542793224,21305650,3797791ff9c0e4c6,@cllrainslie Vaccines are great for the elderl...,Vaccines are great for the elderly and / or th...,[]
4,2022-02-24T15:10:38.000Z,1496865157967925268,3130104741,5f102b4a7cc3d42e,@msemilyrushton @CTVNews Old men start wars yp...,Old men start wars ypung people fight them,[]


In [13]:
# Print the first ten cleaned tweets in full
for cleaned_text in tweets['cleaned_text'].iloc[:10]:
    print(cleaned_text)

I live here and these far right bully boys picked on poor, elderly and vulnerable consistently, punching down like cowards, stealing food from homeless, abusing residents of women and street youth shelters threatening to sexually assault girls, smashing windows of lgbqt families..
Winnipeg seniors, wya?!! Im doing Mini sessions starting March. DM me to shoot! winnipeg collegegrad
This is no joke. I have gone to several stores today to try and buy incontinence products. The shelves are almost empty. Amazon deliveries dates are lengthy. Is this supply chains, truckers blocking the borders or what? Why must the elderly and disabled suffer humiliation?
Vaccines are great for the elderly and / or the sick.
Old men start wars ypung people fight them
God save us from old men in suits.
Not impressed with Nova Scotias reopening plan. Phase 1 seems reasonable but I think the other phases are rushed, particularly Phase 3. Very concerned about elderly and those with health conditions. 1/1 nspoli
I

In [None]:
# Save the organized and cleaned tweets to disk
output_path = 'cleaned_elderly_tweets.csv'
pd.DataFrame(tweets).to_csv(output_path)