In [None]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [None]:
# Import packages
import ast
import base64
import csv
import datetime
import json
import os
import re
import time
import unicodedata
from time import sleep

import dateutil.parser
import pandas as pd
import preprocessor as p
import requests

## 1. Get Tweets

In [None]:
# Setting bearer token as an environment variable
# os.environ['TOKEN'] = "<BEARER_TOKEN>"
# os.environ['API_KEY'] = "<API_KEY>"
# os.environ['API_SECRET_KEY'] = "<API_SECRET_KEY>"

In [None]:
# Defining getTweets function
def _env_credential(var_name, label, fallback):
    """Read a credential from the environment; if unset, print a notice and return `fallback`."""
    value = os.getenv(var_name)
    if value is None:
        print(f"{label} not found under <'{var_name}'> os environment variable")
        return fallback
    return value


def _add_place_names(page_tweets, geo_headers):
    """Set tweet['place'] on every tweet: the place's full name, or '' when it has no place id."""
    for tweet in page_tweets:
        place_id = (tweet.get('geo') or {}).get('place_id')
        if not place_id:
            tweet['place'] = ''
            continue
        geo_url = f'https://api.twitter.com/1.1/geo/id/{place_id}.json'
        geo_resp = requests.get(geo_url, headers = geo_headers)
        if geo_resp.status_code != 200:
            # Surface the API error (e.g. rate limiting) instead of failing on a missing key.
            raise Exception(geo_resp.status_code, geo_resp.text)
        tweet['place'] = geo_resp.json()['full_name']


def _fetch_tweet_page(search_url, headers, query_params, geo_headers):
    """Request one page of search results and return (tweets_on_page, next_token or '')."""
    response = requests.request("GET", search_url, headers = headers, params = query_params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    payload = response.json()  # parse the body once and reuse it
    # A page without matches has no 'data' key, so treat it as an empty page.
    page_tweets = payload.get('data', [])
    if geo_headers is not None:
        _add_place_names(page_tweets, geo_headers)
    return page_tweets, payload.get('meta', {}).get('next_token', '')


def getTweets(os_tokens = False,
              bearer_token = None,
              api_key = None,
              api_key_secret = None,
              query = None,
              start_time = None,
              end_time = None,
              max_results = 100,
              author = None,
              get_location = False,
              paginate = False,
              pages = 2,
              next_token = None):
    '''
    Authentication and get requests to retrieve Tweets under Twitter Search All V2.0
    Academic Research API.

    Arguments:
    os_tokens -- boolean (the legacy value 'os' is also accepted). Whether to read
    the credentials from the ['TOKEN'], ['API_KEY'] and ['API_SECRET_KEY'] os
    environment variables. A variable that is not set prints a notice and the
    matching argument below is used instead. Default = False.
    bearer_token -- string. Bearer token. Default = None.
    api_key -- string. API Key. Only used when get_location is True. Default = None.
    api_key_secret -- string. API Key Secret. Only used when get_location is True.
    Default = None.
    query -- string. Twitter query parameter. Mandatory argument. Default = None.
    start_time -- string. Start date and time for tweets search interval.
    end_time -- string. End date and time for tweets search interval.
    max_results -- int. Maximum number of tweets retrieved per pagination.
    Default = 100.
    author -- str. User id for tweets search or username without @. When defined,
    ' from:<author>' is appended to the query. Default = None.
    get_location -- boolean. Whether to send a post request for an access token and
    then look up each tweet's place name from its place_id. Default = False.
    @@@ NOTE: This method is limited by Twitter GET Request Rate limit of 300 GET
    requests per 15 minutes. Will raise an error if used on a query that
    returns over 300 tweets!! @@@
    paginate -- boolean. Whether to retrieve only first page results, or to paginate
    over the following pages (see next argument). Default = False.
    pages -- int. Maximum number of pages to paginate and retrieve results.
    Default = 2. Has no effect if paginate is set to False.
    next_token -- str. Next_token to continue a previous pagination. Default = None.

    Returns:
    tweets -- list of dictionaries. Tweets data retrieved from the selected period,
    according to the given query parameter.
    next_token -- str. Token for the page after the last one retrieved, or '' when
    there is none.

    Raises:
    TypeError -- when query is None.
    Exception -- (status_code, response_text) when any request returns a non-200 status.
    '''
    if query is None:
        raise TypeError("query is a mandatory argument and cannot be None")

    # Retrieving keys
    if os_tokens is True or os_tokens == 'os':
        bearer_token = _env_credential('TOKEN', 'Bearer Token', bearer_token)
        api_key = _env_credential('API_KEY', 'API Key', api_key)
        api_key_secret = _env_credential('API_SECRET_KEY', 'API Secret Key', api_key_secret)

    headers = {
        'Authorization': 'Bearer {}'.format(bearer_token)
        }

    # Creating URL --Tweets GET Request
    search_url = "https://api.twitter.com/2/tweets/search/all"
    # Only restrict to an account when one was given (avoids sending 'from:None').
    full_query = query if author is None else f'{query} from:{author}'
    query_params = {'query': full_query,
                    'start_time': start_time,
                    'end_time': end_time,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,location,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    # '' is what this function returns when there is no further page;
                    # map it to None so the parameter is left out of the request.
                    'next_token': next_token or None}

    # --Locations POST Request (application-only token used for the place lookups)
    geo_headers = None
    if get_location:
        key_secret = '{}:{}'.format(api_key, api_key_secret).encode('ascii')
        b64_encoded_key = base64.b64encode(key_secret).decode('ascii')
        auth_resp = requests.post(
            'https://api.twitter.com/oauth2/token',
            headers = {
                'Authorization': 'Basic {}'.format(b64_encoded_key),
                'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
                },
            data = {'grant_type': 'client_credentials'})
        if auth_resp.status_code != 200:
            raise Exception(auth_resp.status_code, auth_resp.text)
        geo_headers = {
            'Authorization': 'Bearer {}'.format(auth_resp.json()['access_token'])
            }
        print("Location POST request successfully retrieved: ", auth_resp.status_code)

    # --Tweets GET Requests: the first page, then further pages while allowed
    tweets = []
    page = 1
    while True:
        temp_tweets, next_token = _fetch_tweet_page(search_url, headers, query_params, geo_headers)
        tweets += temp_tweets
        if temp_tweets:
            print(f"Request successful. Total retrieved tweets: {str(len(temp_tweets))} on page {page}. Earliest tweet date: {tweets[-1]['created_at'][0:10]}")
        else:
            print(f"No data retrieved out of page {page}.")

        if not paginate:
            break
        if page >= pages:
            print(f"Final results retrieved. Last tweet reached according to maximum pages parameter at page {page}. {len(tweets)} retrieved tweets.")
            if next_token:
                print(f"next_token: {next_token}")
            else:
                print("No next_token to retrieve")
            break
        if not next_token:
            print(f"Final result retrieved. Last tweet reached according to query parameter at page {page}. {len(tweets)} retrieved tweets.")
            break

        # Paginating through next_token
        page += 1
        time.sleep(1)
        query_params['next_token'] = next_token

    return tweets, next_token

In [None]:
# Query statement
# Keywords and phrases referring to older adults; they are OR-ed together in one group.
ELDERLY_TERMS = [
    'elders', 'elder', 'elderly',
    '"senior citizen"', '"senior citizens"', '"senior men"', '"senior women"',
    '"senior adult"', '"senior adults"', 'seniority', 'seniors',
    '"older adults"', '"older adult"',
    '"old men"', '"old women"', '"old ladies"', '"older ladies"',
    '"aging population"', '"aging people"', '"aging men"', '"aging women"',
    '"old age"', 'OAP',
]
elderly_group = '(' + ' OR '.join(ELDERLY_TERMS) + ')'

# English, original (non-retweet) tweets, excluding the "elder scrolls" video game.
query_nogeo = f'{elderly_group} lang:en -("elder scrolls") -is:retweet'
# Same search restricted to geo-tagged tweets placed in Canada.
query = f'{query_nogeo} has:geo place_country:CA'

# Alternative queries kept for reference:
# query_user = 'lang:en -is:retweet'
# query_learning = f'{elderly_group} (Scholarship OR Schooling OR Study OR Information OR Wisdom OR Educated OR Education OR Cognitive) lang:en -("elder scrolls") -is:retweet has:geo place_country:CA'
# query_test = 'lang:en -is:retweet has:geo place_country:CA'

# Period
start_time, end_time = "2018-01-01T00:00:00.000Z", "2022-02-25T00:00:00.000Z"

# Tweets requested per page
max_results = 500

In [None]:
# Trial run: fetch at most two pages for a single account, reading the API
# credentials from the environment.
tweets, next_token = getTweets(
    os_tokens="os",
    query=query,
    start_time=start_time,
    end_time=end_time,
    max_results=max_results,
    author='CBCNews',
    paginate=True,
    pages=2,
    next_token=None,
)

Request successful. Total retrieved tweets: 500 on page 1. Earliest tweet date: 2022-02-09
Request successful. Total retrieved tweets: 500 on page 2. Earliest tweet date: 2022-01-25
Final results retrieved. Last tweet reached according to maximum pages parameter at page 2. 1000 retrieved tweets.
next_token: b26v89c19zqg8o3fpe47899zswqun17yzu7e6fj37nsl9


### 1.1. Tweets from news channels and important personalities

In [None]:
# Load the saved search results. The first CSV column is the index written by
# DataFrame.to_csv, so it is read back as the index instead of being dropped in place.
# in_reply_to_user_id has missing values; left to inference it becomes float64 and
# 64-bit ids lose precision, so it is read as text.
tweets_df = pd.read_csv(
    'elderly_tweets.csv',
    index_col = 0,
    dtype = {'in_reply_to_user_id': 'string'},
)

In [None]:
tweets_df

Unnamed: 0,source,text,geo,created_at,conversation_id,reply_settings,lang,referenced_tweets,id,author_id,public_metrics,in_reply_to_user_id
0,Twitter for Android,@JosephConwell7 @Reuters I live here &amp; the...,{'place_id': '38d5974e82ed1a6c'},2022-02-24T21:33:19.000Z,1496535338554114054,everyone,en,"[{'type': 'replied_to', 'id': '149690238323805...",1496961461465542656,3240659214,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",1.227989e+18
1,Twitter for iPhone,"Winnipeg seniors, wya?!! \nI’m doing Mini sess...",{'place_id': '0811cf61cd9ea52f'},2022-02-24T20:28:33.000Z,1496945165365219333,everyone,en,,1496945165365219333,1469167464101515267,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",
2,Twitter for Android,This is no joke. I have gone to several stores...,{'place_id': '3797791ff9c0e4c6'},2022-02-24T20:25:13.000Z,1496944326416510984,everyone,en,,1496944326416510984,3355188729,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",
3,Twitter for Android,@cllrainslie Vaccines are great for the elderl...,{'place_id': '3797791ff9c0e4c6'},2022-02-24T19:44:18.000Z,1496932135684513796,everyone,en,"[{'type': 'replied_to', 'id': '149693213568451...",1496934029542793224,21305650,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2.153641e+07
4,CareerArc 2.0,Be part of the movement from hallway care to h...,"{'place_id': '71bdc845bc7609c7', 'coordinates'...",2022-02-24T18:53:50.000Z,1496921328103804935,everyone,en,,1496921328103804935,50382485,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",
...,...,...,...,...,...,...,...,...,...,...,...,...
57134,Twitter for iPhone,@mheavyhead I lived in India for 4 months and ...,{'place_id': '13e80e6f3ac67066'},2018-01-01T14:36:44.000Z,947828294140944384,everyone,en,"[{'type': 'replied_to', 'id': '947828935558959...",947839007450308608,777511588701937665,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",2.397010e+09
57135,Twitter for iPhone,@CMcKerracher @themadsloth Refugees get nowher...,{'place_id': '53504716d445dcad'},2018-01-01T06:29:56.000Z,947598925275549697,everyone,en,"[{'type': 'replied_to', 'id': '947716069296193...",947716501452177408,3164684096,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",5.560242e+08
57136,Tweetbot for iΟS,“Old women find me attractive.”\n-not me,{'place_id': '626695e48d21858b'},2018-01-01T05:29:42.000Z,947701343443091456,everyone,en,,947701343443091456,28465428,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",
57137,Twitter for Android,(I forgot to do this for 2016.)\n\nMy favourit...,{'place_id': '5d058f2e9fe1516c'},2018-01-01T03:08:09.000Z,947665721349279745,everyone,en,,947665721349279745,26161882,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",


In [None]:
# pd.DataFrame(tweets).to_csv('elderly_tweets.csv')

In [None]:
# import pickle
# with open('tweets.pkl', 'wb') as handle:
#   pickle.dump(tweets, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
# Twitter handles (without @) of the Canadian news outlets to search.
news_accounts = 'CBCNews CTVNews CP24 globeandmail nationalpost CdnPressNews'.split()
# Handles of other elder-related accounts.
other_accounts = 'SRyan4elders NationalECE mindingourelder EldersNative AgingCare drjohnmorley'.split()

In [None]:
query_user = 'lang:en -is:retweet'  # defined here but not passed to getTweets below

# One search per news account (up to 250 pages each); each result set is saved
# to '<account>_tweets.csv'.
for account in news_accounts:
    temp_tweets, _ = getTweets(
        os_tokens="os",
        query=query_nogeo,
        start_time=start_time,
        end_time=end_time,
        max_results=max_results,
        author=account,
        paginate=True,
        pages=250,
        next_token=None,
    )
    account_frame = pd.DataFrame(temp_tweets)
    account_frame.to_csv(f'{account}_tweets.csv')

Request successful. Total retrieved tweets: 19 on page 1. Earliest tweet date: 2021-10-11
Request successful. Total retrieved tweets: 23 on page 2. Earliest tweet date: 2021-05-31
Request successful. Total retrieved tweets: 34 on page 3. Earliest tweet date: 2021-01-18
Request successful. Total retrieved tweets: 25 on page 4. Earliest tweet date: 2020-10-10
Request successful. Total retrieved tweets: 12 on page 5. Earliest tweet date: 2020-06-30
Request successful. Total retrieved tweets: 49 on page 6. Earliest tweet date: 2020-03-26
Request successful. Total retrieved tweets: 9 on page 7. Earliest tweet date: 2019-11-11
Request successful. Total retrieved tweets: 15 on page 8. Earliest tweet date: 2019-08-10
Request successful. Total retrieved tweets: 16 on page 9. Earliest tweet date: 2019-03-19
Request successful. Total retrieved tweets: 19 on page 10. Earliest tweet date: 2018-11-18
Request successful. Total retrieved tweets: 22 on page 11. Earliest tweet date: 2018-06-17
Request s

In [None]:
# Read one '<handle>_tweets.csv' file per news account, in a fixed order.
(CBCNews_df, CTVNews_df, CP24_df,
 globeandmail_df, nationalpost_df, CdnPressNews_df) = (
    pd.read_csv(f'{handle}_tweets.csv')
    for handle in ('CBCNews', 'CTVNews', 'CP24',
                   'globeandmail', 'nationalpost', 'CdnPressNews')
)

In [None]:
# Raw per-account frames, listed in the same order as news_accounts.
# NOTE: the name news_dfs is rebound later in the notebook to the cleaned frames.
news_dfs = [CBCNews_df, CTVNews_df, CP24_df, globeandmail_df, nationalpost_df, CdnPressNews_df]

In [None]:
CBCNews_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,id,conversation_id,author_id,lang,public_metrics,source,reply_settings,text,in_reply_to_user_id,referenced_tweets
0,0,2022-02-19T06:00:26.000Z,1494914753772376065,1494914753772376065,6433472,en,"{'retweet_count': 17, 'reply_count': 27, 'like...",Buffer,everyone,"After a spike in scams against seniors, Niagar...",,
1,1,2022-02-11T16:30:18.000Z,1492174161455812614,1492174161455812614,6433472,en,"{'retweet_count': 43, 'reply_count': 33, 'like...",Buffer,everyone,"More than 16,000 residents of long-term care h...",,
2,2,2022-01-28T06:00:27.000Z,1486942226185871360,1486942226185871360,6433472,en,"{'retweet_count': 6, 'reply_count': 3, 'like_c...",Buffer,everyone,VPD says it is investigating more than a dozen...,,
3,3,2022-01-26T16:00:50.000Z,1486368541465055239,1486368541465055239,6433472,en,"{'retweet_count': 11, 'reply_count': 18, 'like...",Buffer,everyone,The portrait of those dying in Quebec in the c...,,
4,4,2022-01-12T17:30:28.000Z,1481317669550931971,1481317669550931971,6433472,en,"{'retweet_count': 12, 'reply_count': 51, 'like...",Buffer,everyone,With the return to in-person learning less tha...,,


In [None]:
CTVNews_df.head()

Unnamed: 0.1,Unnamed: 0,conversation_id,created_at,public_metrics,text,reply_settings,author_id,lang,id,source,in_reply_to_user_id,referenced_tweets
0,0,1496741835783667712,2022-02-24T07:00:36.000Z,"{'retweet_count': 3, 'reply_count': 6, 'like_c...",A Calgary seniors residence is bringing circus...,everyone,203123011,en,1496741835783667712,True Anthem,,
1,1,1493518014502871041,2022-02-15T09:30:17.000Z,"{'retweet_count': 4, 'reply_count': 1, 'like_c...","Older adults process too much information, lea...",everyone,203123011,en,1493518014502871041,True Anthem,,
2,2,1493367045865582592,2022-02-14T23:30:23.000Z,"{'retweet_count': 4, 'reply_count': 12, 'like_...","Older adults process too much information, lea...",everyone,203123011,en,1493367045865582592,True Anthem,,
3,3,1492876259919020035,2022-02-13T15:00:11.000Z,"{'retweet_count': 3, 'reply_count': 5, 'like_c...",Valentine's cards for seniors https://t.co/xCh...,everyone,203123011,en,1492876259919020035,True Anthem,,
4,4,1491849555394330625,2022-02-10T19:00:25.000Z,"{'retweet_count': 4, 'reply_count': 2, 'like_c...","A Midland, Ont., man created a hockey stick to...",everyone,203123011,en,1491849555394330625,True Anthem,,


In [None]:
CP24_df.head()

Unnamed: 0.1,Unnamed: 0,public_metrics,conversation_id,lang,source,text,reply_settings,id,created_at,author_id,in_reply_to_user_id,referenced_tweets
0,0,"{'retweet_count': 32, 'reply_count': 10, 'like...",1486763898414022657,en,TweetDeck,‘Grandparent scam’ targeting seniors in Toront...,everyone,1486763898414022657,2022-01-27T18:11:50.000Z,19636948,,
1,1,"{'retweet_count': 40, 'reply_count': 94, 'like...",1483139432165347336,en,TweetDeck,Ontario widens access to free PCR COVID-19 tes...,everyone,1483139432165347336,2022-01-17T18:09:30.000Z,19636948,,
2,2,"{'retweet_count': 60, 'reply_count': 70, 'like...",1475616660765782023,en,Twitter Web App,Elderly woman sexually assaulted after being a...,everyone,1475616660765782023,2021-12-27T23:56:42.000Z,19636948,,
3,3,"{'retweet_count': 9, 'reply_count': 5, 'like_c...",1469721519538507784,en,TweetDeck,Toronto paramedics say a pedestrian has died a...,everyone,1469721519538507784,2021-12-11T17:31:31.000Z,19636948,,
4,4,"{'retweet_count': 3, 'reply_count': 4, 'like_c...",1469010165588496388,en,TweetDeck,The province's police watchdog is investigatin...,everyone,1469010165588496388,2021-12-09T18:24:51.000Z,19636948,,


In [None]:
globeandmail_df.head()

Unnamed: 0.1,Unnamed: 0,text,author_id,conversation_id,created_at,source,reply_settings,public_metrics,lang,id,in_reply_to_user_id,referenced_tweets
0,0,"Family raises $35,000 to bring Inuk elder home...",8736882,1495021738823802883,2022-02-19T13:05:33.000Z,dlvr.it,everyone,"{'retweet_count': 7, 'reply_count': 6, 'like_c...",en,1495021738823802883,,
1,1,Ottawa moves up timeline to repay benefit claw...,8736882,1493338531460120576,2022-02-14T21:37:05.000Z,dlvr.it,everyone,"{'retweet_count': 3, 'reply_count': 2, 'like_c...",en,1493338531460120576,,
2,2,"Elderly woman misses daily Wordle update, lead...",8736882,1492204048023384064,2022-02-11T18:29:03.000Z,dlvr.it,everyone,"{'retweet_count': 6, 'reply_count': 0, 'like_c...",en,1492204048023384064,,
3,3,Deals for seniors’ residences surge as sector ...,8736882,1491354085114609669,2022-02-09T10:11:36.000Z,dlvr.it,everyone,"{'retweet_count': 1, 'reply_count': 3, 'like_c...",en,1491354085114609669,,
4,4,"Iqaluit’s sole elder-care home reopens, allowi...",8736882,1489052031981473793,2022-02-03T01:44:04.000Z,dlvr.it,everyone,"{'retweet_count': 2, 'reply_count': 2, 'like_c...",en,1489052031981473793,,


In [None]:
nationalpost_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,id,conversation_id,author_id,lang,public_metrics,source,reply_settings,text,referenced_tweets,in_reply_to_user_id
0,0,2022-02-11T19:43:47.000Z,1492222856482197506,1492222856482197506,14216661,en,"{'retweet_count': 3, 'reply_count': 4, 'like_c...",Echobox,everyone,Number of seniors seeking help with debt is ri...,,
1,1,2022-02-01T13:22:24.000Z,1488502997671989250,1488502997671989250,14216661,en,"{'retweet_count': 1, 'reply_count': 4, 'like_c...",Echobox,everyone,Financial abuse of seniors a growing problem a...,,
2,2,2022-01-31T21:49:02.000Z,1488268108355837954,1488268108355837954,14216661,en,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",Echobox,everyone,When care partners of older adults first encou...,,
3,3,2022-01-24T22:31:33.000Z,1485742094798966791,1485742094798966791,14216661,en,"{'retweet_count': 4, 'reply_count': 8, 'like_c...",Echobox,everyone,COVID-19 has amplified existing cracks in the ...,,
4,4,2022-01-24T19:03:48.000Z,1485689813378510849,1485689813378510849,14216661,en,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",Echobox,everyone,According to new research from New York Univer...,,


In [None]:
CdnPressNews_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,text,source,author_id,lang,id,public_metrics,reply_settings,conversation_id
0,0,2021-08-12T20:40:58.000Z,"Staff, volunteers in B.C. senior care centres ...",TweetDeck,43355844,en,1425920266648043524,"{'retweet_count': 4, 'reply_count': 1, 'like_c...",everyone,1425920266648043524
1,1,2021-04-30T14:54:00.000Z,COVID-19 vaccine uptake among seniors in Canad...,TweetDeck,43355844,en,1388144610476797952,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",everyone,1388144610476797952
2,2,2021-03-16T17:13:41.000Z,NACI says Oxford-AstraZeneca COVID-19 vaccine ...,TweetDeck,43355844,en,1371872307866656774,"{'retweet_count': 1, 'reply_count': 3, 'like_c...",everyone,1371872307866656774
3,3,2020-04-14T18:45:14.000Z,More older Canadians die as COVID-19 toll pass...,TweetDeck,43355844,en,1250133026484424704,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",everyone,1250133026484424704
4,4,2020-04-14T12:43:17.000Z,Elderly main victims of COVID-19 and sci-fi wr...,TweetDeck,43355844,en,1250041938574745601,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",everyone,1250041938574745601


## 2. Organizing tweets_df

In [None]:
def _place_id_from_geo(geo):
    """Return the 'place_id' stored in a geo value, or None when there is none.

    `geo` may be a dict, the text form of a dict as read back from a CSV file
    (Python repr or JSON), or a missing value such as NaN.
    """
    if isinstance(geo, str):
        try:
            # literal_eval parses the Python repr safely, including values that
            # contain apostrophes, which a quote swap + json.loads cannot handle.
            geo = ast.literal_eval(geo)
        except (ValueError, SyntaxError):
            try:
                geo = json.loads(geo)
            except ValueError:
                return None
    if isinstance(geo, dict):
        return geo.get('place_id')
    return None


def organize_tweets(df):
    """Select and rename the tweet columns used by the rest of the notebook.

    Arguments:
    df -- DataFrame with 'created_at', 'id', 'author_id' and 'text' columns and,
    optionally, a 'geo' column.

    Returns:
    tweets -- dict with keys 'date_time', 'tweet_id', 'author_id', 'tweet_text' and,
    only when df has a 'geo' column, 'place_id' (a list with one entry per row;
    None where no place id is available).
    """
    tweets = {}
    tweets['date_time'] = df['created_at']
    tweets['tweet_id'] = df['id']
    tweets['author_id'] = df['author_id']
    if 'geo' in df:
        tweets['place_id'] = [_place_id_from_geo(geo) for geo in df['geo']]
    tweets['tweet_text'] = df['text']

    return tweets

In [None]:
# Reduce every raw news frame to the organized column layout.
organized_news_dfs = [pd.DataFrame(organize_tweets(raw_frame)) for raw_frame in news_dfs]

In [None]:
# Organize the main (geo-tagged) tweet set. Note that `tweets` is rebound here:
# from this cell on it is a DataFrame, not the list returned by getTweets.
tweets = pd.DataFrame(organize_tweets(tweets_df))
tweets.head()

Unnamed: 0,date_time,tweet_id,author_id,place_id,tweet_text
0,2022-02-24T21:33:19.000Z,1496961461465542656,3240659214,38d5974e82ed1a6c,@JosephConwell7 @Reuters I live here &amp; the...
1,2022-02-24T20:28:33.000Z,1496945165365219333,1469167464101515267,0811cf61cd9ea52f,"Winnipeg seniors, wya?!! \nI’m doing Mini sess..."
2,2022-02-24T20:25:13.000Z,1496944326416510984,3355188729,3797791ff9c0e4c6,This is no joke. I have gone to several stores...
3,2022-02-24T19:44:18.000Z,1496934029542793224,21305650,3797791ff9c0e4c6,@cllrainslie Vaccines are great for the elderl...
4,2022-02-24T18:53:50.000Z,1496921328103804935,50382485,71bdc845bc7609c7,Be part of the movement from hallway care to h...


## 3. Cleaning tweets' texts

In [None]:
# Text cleaning - Removing URLs, mentions, etc using tweet-preprocessor package
def tweet_clean(tweet, lower_case = False, remove_digits = False,
                remove_punct = False, replace_amper = False,
                retrieve_hashtag = True):
    """
    Clean tweet with tweet-preprocessor p.clean() removing unwanted characters
    and user mentions, with optional lower-casing, digit removal, punctuation
    removal, ampersand replacement and hashtag extraction.

    Arguments:
    tweet -- Text string.
    lower_case -- Boolean. When True, will lower case tweets. Default = False.
    remove_digits -- Boolean. When True, will remove numbers from tweets.
    Default = False.
    remove_punct -- Boolean. When True, will remove punctuations from tweets.
    Default = False.
    replace_amper -- Boolean. When True, will replace HTML ampersand format for
    'and' stopword. Default = False.
    retrieve_hashtag -- Boolean. When True, will retrieve tweet hashtags and
    return them alongside the cleaned text. Default = True.

    Returns:
    When retrieve_hashtag is True, a pandas Series [cleaned_tweet, hashtags]:
    cleaned_tweet -- Cleaned tweet text string.
    hashtags -- List with hashtags extracted from tweet.
    When retrieve_hashtag is False, only the cleaned_tweet string.
    """
    # Remove user mentions, symbols and unwanted characters
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED,
                  p.OPT.EMOJI, p.OPT.SMILEY)
    tweet = p.clean(tweet)

    # Lower case
    if lower_case:
        tweet = tweet.lower()

    # Remove digits (str.replace only matches literal text, so a regex is required)
    if remove_digits:
        tweet = re.sub(r'\d+', '', tweet)

    # Replace amper. Done before punctuation removal, which would otherwise strip
    # '&' and ';' and leave a stray 'amp' behind.
    if replace_amper:
        tweet = tweet.replace('&amp;', 'and')
        tweet = tweet.replace('&AMP;', 'and')

    # Extract hashtags while the '#' markers are still present
    hashtags = []
    if retrieve_hashtag:
        hashtags = re.findall(r'#(\w+)', tweet)
        tweet = tweet.replace('#', '')

    # Remove punctuations (last, so it cannot interfere with the steps above)
    if remove_punct:
        tweet = re.sub(r'[^\w\s]', '', tweet)

    cleaned_tweet = tweet

    if retrieve_hashtag:
        return pd.Series([cleaned_tweet, hashtags])
    return cleaned_tweet

In [None]:
# tweet_clean returns a two-element Series by default (cleaned text, hashtags),
# so this apply produces a DataFrame with columns 0 and 1.
cleaned_tweets = tweets['tweet_text'].apply(tweet_clean, replace_amper = True)

In [None]:
cleaned_tweets

Unnamed: 0,0,1
0,I live here and these far right bully boys pic...,[]
1,"Winnipeg seniors, wya?!! Im doing Mini session...","[winnipeg, collegegrad]"
2,This is no joke. I have gone to several stores...,[]
3,Vaccines are great for the elderly and / or th...,[]
4,Be part of the movement from hallway care to h...,"[Woodstock, Healthcare]"
...,...,...
57134,I lived in India for 4 months and the amount o...,[]
57135,Refugees get nowhere close to that. That lies ...,[]
57136,Old women find me attractive. -not me,[]
57137,(I forgot to do this for 2016.) My favourite 2...,[]


In [None]:
# Attach the cleaned text and hashtags under explicit names. Any earlier copies of
# these two columns are dropped first, so re-running the cell does not stack
# duplicates or break a positional rename.
cleaned_cols = cleaned_tweets.rename(columns = {0: 'cleaned_text', 1: 'hashtags'})
tweets = pd.concat(
    [tweets.drop(columns = ['cleaned_text', 'hashtags'], errors = 'ignore'), cleaned_cols],
    axis = 1,
)
tweets.head()

Unnamed: 0,date_time,tweet_id,author_id,place_id,tweet_text,cleaned_text,hashtags
0,2022-02-24T21:33:19.000Z,1496961461465542656,3240659214,38d5974e82ed1a6c,@JosephConwell7 @Reuters I live here &amp; the...,I live here and these far right bully boys pic...,[]
1,2022-02-24T20:28:33.000Z,1496945165365219333,1469167464101515267,0811cf61cd9ea52f,"Winnipeg seniors, wya?!! \nI’m doing Mini sess...","Winnipeg seniors, wya?!! Im doing Mini session...","[winnipeg, collegegrad]"
2,2022-02-24T20:25:13.000Z,1496944326416510984,3355188729,3797791ff9c0e4c6,This is no joke. I have gone to several stores...,This is no joke. I have gone to several stores...,[]
3,2022-02-24T19:44:18.000Z,1496934029542793224,21305650,3797791ff9c0e4c6,@cllrainslie Vaccines are great for the elderl...,Vaccines are great for the elderly and / or th...,[]
4,2022-02-24T18:53:50.000Z,1496921328103804935,50382485,71bdc845bc7609c7,Be part of the movement from hallway care to h...,Be part of the movement from hallway care to h...,"[Woodstock, Healthcare]"


In [None]:
for tweet in tweets['cleaned_text'].head(10):
  print(tweet)

I live here and these far right bully boys picked on poor, elderly and vulnerable consistently, punching down like cowards, stealing food from homeless, abusing residents of women and street youth shelters threatening to sexually assault girls, smashing windows of lgbqt families..
Winnipeg seniors, wya?!! Im doing Mini sessions starting March. DM me to shoot! winnipeg collegegrad
This is no joke. I have gone to several stores today to try and buy incontinence products. The shelves are almost empty. Amazon deliveries dates are lengthy. Is this supply chains, truckers blocking the borders or what? Why must the elderly and disabled suffer humiliation?
Vaccines are great for the elderly and / or the sick.
Be part of the movement from hallway care to home care. Help our aging population age well at home. Seeking Allied Health Manager in Woodstock, ON. Healthcare
Old men start wars ypung people fight them
God save us from old men in suits.
Not impressed with Nova Scotias reopening plan. Phas

In [None]:
# Save the cleaned tweets. The index is written as the first CSV column.
pd.DataFrame(tweets).to_csv('cleaned_elderly_tweets.csv')

In [None]:
organized_news_dfs[5]

Unnamed: 0,date_time,tweet_id,author_id,tweet_text
0,2021-08-12T20:40:58.000Z,1425920266648043524,43355844,"Staff, volunteers in B.C. senior care centres ..."
1,2021-04-30T14:54:00.000Z,1388144610476797952,43355844,COVID-19 vaccine uptake among seniors in Canad...
2,2021-03-16T17:13:41.000Z,1371872307866656774,43355844,NACI says Oxford-AstraZeneca COVID-19 vaccine ...
3,2020-04-14T18:45:14.000Z,1250133026484424704,43355844,More older Canadians die as COVID-19 toll pass...
4,2020-04-14T12:43:17.000Z,1250041938574745601,43355844,Elderly main victims of COVID-19 and sci-fi wr...
5,2020-04-13T12:31:57.000Z,1249676700100767744,43355844,Federal officials to provide COVID-19 update f...
6,2019-03-25T15:25:56.000Z,1110201154946908161,43355844,Police in Florida have launched a double homic...
7,2019-01-21T17:50:30.000Z,1087407098621816834,43355844,Coroner probes exposure death of Gilles Ducepp...
8,2018-12-21T09:23:16.000Z,1076045426707611649,43355844,"Elmer Courchene, a well-known and respected Fi..."
9,2018-11-14T03:23:43.000Z,1062546594678169601,43355844,A federal government initiative to support Ind...


In [None]:
# Re-clean the tweet texts and replace the cleaned columns. Existing
# 'cleaned_text' / 'hashtags' columns are dropped first: concatenating onto a frame
# that already has them and then assigning seven column labels raises a
# length-mismatch ValueError.
cleaned_tweets = tweets['tweet_text'].apply(tweet_clean, replace_amper = True)
cleaned_cols = cleaned_tweets.rename(columns = {0: 'cleaned_text', 1: 'hashtags'})
tweets = pd.concat(
    [tweets.drop(columns = ['cleaned_text', 'hashtags'], errors = 'ignore'), cleaned_cols],
    axis = 1,
)
tweets.head()

In [None]:
# Clean every organized news frame and tag it with its account handle.
# The cleaned columns are named explicitly rather than by position, so a frame
# with an extra column (e.g. 'place_id') no longer breaks the rename.
news_dfs = []
for df, account in zip(organized_news_dfs, news_accounts):
    df['account'] = account
    cleaned_tweets = df['tweet_text'].apply(tweet_clean, replace_amper = True)
    cleaned_cols = cleaned_tweets.rename(columns = {0: 'cleaned_text', 1: 'hashtags'})
    news_dfs.append(pd.concat([df, cleaned_cols], axis = 1))

In [None]:
# Write one 'cleaned_<account>_tweets.csv' file per news account.
for df, account in zip(news_dfs, news_accounts):
  df.to_csv(f'cleaned_{account}_tweets.csv')