In [None]:
import pandas as pd
import os
import base64
import requests
from time import sleep
import matplotlib.pyplot as plt

## Twitter dataset preparation for labeling

### 1. Importing cleaned csv

In [None]:
# Load the cleaned tweets CSV into a DataFrame
tweets_df = pd.read_csv('cleaned_elderly_tweets.csv')

In [None]:
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,date_time,tweet_id,author_id,place_id,tweet_text,cleaned_text,hashtags
0,0,2022-02-24T21:33:19.000Z,1496961461465542656,3240659214,38d5974e82ed1a6c,@JosephConwell7 @Reuters I live here &amp; the...,I live here and these far right bully boys pic...,[]
1,1,2022-02-24T20:28:33.000Z,1496945165365219333,1469167464101515267,0811cf61cd9ea52f,"Winnipeg seniors, wya?!! \nI’m doing Mini sess...","Winnipeg seniors, wya?!! Im doing Mini session...","['winnipeg', 'collegegrad']"
2,2,2022-02-24T20:25:13.000Z,1496944326416510984,3355188729,3797791ff9c0e4c6,This is no joke. I have gone to several stores...,This is no joke. I have gone to several stores...,[]
3,3,2022-02-24T19:44:18.000Z,1496934029542793224,21305650,3797791ff9c0e4c6,@cllrainslie Vaccines are great for the elderl...,Vaccines are great for the elderly and / or th...,[]
4,4,2022-02-24T18:53:50.000Z,1496921328103804935,50382485,71bdc845bc7609c7,Be part of the movement from hallway care to h...,Be part of the movement from hallway care to h...,"['Woodstock', 'Healthcare']"


### 2. Gathering location name from place_id variables

In [None]:
# Count how many tweets carry each distinct place_id
places = tweets_df['place_id'].value_counts()
places

3797791ff9c0e4c6    9197
38d5974e82ed1a6c    3222
53504716d445dcad    2926
6a6d896ba1cb5dc4    2791
1e5cb4d0509db554    2685
                    ... 
209726d85549d2d5       1
2b283eb6bc155115       1
73fe89382cabbe53       1
532b2c07da89e14d       1
07d9db50c2080001       1
Name: place_id, Length: 2987, dtype: int64

In [None]:
# Distinct place ids, in the same order as the counts above
places_ids = places.index.tolist()
places_ids[:10]

['3797791ff9c0e4c6',
 '38d5974e82ed1a6c',
 '53504716d445dcad',
 '6a6d896ba1cb5dc4',
 '1e5cb4d0509db554',
 '0811cf61cd9ea52f',
 '5c43cbdfce4d3247',
 '5d058f2e9fe1516c',
 '4939b600461c30a4',
 '36775d842cbec509']

In [None]:
# Setting the API key and secret as environment variables (uncomment and fill in, or export them before starting Jupyter)
# os.environ['API_KEY'] = "<API_KEY>"
# os.environ['API_SECRET_KEY'] = "<API_SECRET_KEY>"

In [None]:
# Request an app-only bearer token, used below for the place_id lookups

# Join key and secret as "key:secret" and Base64-encode the pair for the
# HTTP Basic Authorization header (decoded back to str so it can be formatted)
key_secret = '{}:{}'.format(os.environ['API_KEY'], os.environ['API_SECRET_KEY']).encode('ascii')
b64_encoded_key = base64.b64encode(key_secret).decode('ascii')

# Posting authentication request using Twitter authentication resource URL
base_url = 'https://api.twitter.com/'
auth_url = '{}oauth2/token'.format(base_url)
auth_headers = {
    'Authorization': 'Basic {}'.format(b64_encoded_key),
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
}
auth_data = {
    'grant_type': 'client_credentials'
}
# The timeout keeps the cell from hanging indefinitely on a stalled connection
auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data, timeout=30)
print(auth_resp.status_code)
# Surface HTTP errors here instead of failing below with a KeyError on 'access_token'
auth_resp.raise_for_status()
access_token = auth_resp.json()['access_token']

200


In [None]:
# Storing access_token as an OS environment variable, which is where getLocation reads it from
os.environ['ACCESS_TOKEN'] = access_token

In [None]:
def getLocation(id):
    '''
    Look up the full name of a Twitter place from its place id.

    Arguments:
    id -- string. Place id.

    Returns:
    place_name -- string. Full_name of place returned from the geo_localization id GET request,
                  or '' when the request fails or returns a non-200 status.

    Raises:
    RuntimeError -- if the <"ACCESS_TOKEN"> OS environment variable is not set.
    '''
    # Stop immediately when the token is missing: previously the message was
    # printed and the function then crashed on an unbound access_token.
    access_token = os.environ.get('ACCESS_TOKEN')
    if access_token is None:
        raise RuntimeError('Access token not found under <"ACCESS_TOKEN"> OS Environment variable.')

    # Creating headers
    geo_headers = {
        'Authorization': 'Bearer {}'.format(access_token)
    }

    # Building url
    geo_url = f'https://api.twitter.com/1.1/geo/id/{id}.json'

    # A network error is reported like any other failed lookup ('' is returned)
    # so one bad connection does not abort a long batch of requests.
    try:
        geo_resp = requests.get(geo_url, headers=geo_headers, timeout=30)
    except requests.RequestException as error:
        print(f'Failed request. {error}')
        return ''

    if geo_resp.status_code != 200:
        print(f'Failed request. {geo_resp.text}')
        place_name = ''
    else:
        place_name = geo_resp.json()['full_name']
        print(f'Request successful. Place name {place_name} returned for place_id {id}.')

    return place_name

In [None]:
# Resolve every place_id to its name. After each 100th request the loop
# pauses for 910 seconds before continuing.
places = {'place_id': places_ids, 'place_name': []}

request_counter = 0
for position, current_id in enumerate(places_ids, start=1):
    print(f'{position}: place_id {current_id}')
    places['place_name'].append(getLocation(current_id))
    request_counter += 1

    if request_counter >= 100:
        print('100 requests limit reached. Sleeping for 15 minutes...')
        sleep(910)
        print('Resuming requests...')
        request_counter = 0

1: place_id 3797791ff9c0e4c6
Request successful. Place name Toronto, Ontario returned for place_id 3797791ff9c0e4c6.
2: place_id 38d5974e82ed1a6c
Request successful. Place name Ottawa, Ontario returned for place_id 38d5974e82ed1a6c.
3: place_id 53504716d445dcad
Request successful. Place name Calgary, Alberta returned for place_id 53504716d445dcad.
4: place_id 6a6d896ba1cb5dc4
Request successful. Place name Edmonton, Alberta returned for place_id 6a6d896ba1cb5dc4.
5: place_id 1e5cb4d0509db554
Request successful. Place name Vancouver, British Columbia returned for place_id 1e5cb4d0509db554.
6: place_id 0811cf61cd9ea52f
Request successful. Place name Winnipeg, Manitoba returned for place_id 0811cf61cd9ea52f.
7: place_id 5c43cbdfce4d3247
Request successful. Place name Mississauga, Ontario returned for place_id 5c43cbdfce4d3247.
8: place_id 5d058f2e9fe1516c
Request successful. Place name Halifax, Nova Scotia returned for place_id 5d058f2e9fe1516c.
9: place_id 4939b600461c30a4
Request succes

In [None]:
len(places['place_id'])

2987

In [None]:
# Tabulate the place_id / place_name pairs collected by the lookup loop
places_df = pd.DataFrame(places)
places_df

Unnamed: 0,place_id,place_name
0,3797791ff9c0e4c6,"Toronto, Ontario"
1,38d5974e82ed1a6c,"Ottawa, Ontario"
2,53504716d445dcad,"Calgary, Alberta"
3,6a6d896ba1cb5dc4,"Edmonton, Alberta"
4,1e5cb4d0509db554,"Vancouver, British Columbia"
...,...,...
2982,209726d85549d2d5,
2983,2b283eb6bc155115,
2984,73fe89382cabbe53,
2985,532b2c07da89e14d,


### 2.1 Dealing with not found place_id's

In [None]:
# Place ids whose lookup came back as an empty string
is_unresolved = places_df['place_name'] == ''
not_found_ids = places_df.loc[is_unresolved, 'place_id']
len(not_found_ids)

84

In [None]:
# Second attempt for the ids that came back empty on the first pass
not_found_places = {'place_id': not_found_ids, 'place_name': []}

request_counter = 0
for position, missing_id in enumerate(not_found_ids, start=1):
    print(f'{position}: place_id {missing_id}')
    retried_name = getLocation(missing_id)
    not_found_places['place_name'].append(retried_name)
    request_counter += 1

1: place_id 446dd74cab94bc2c
Request successful. Place name North Bay, Ontario returned for place_id 446dd74cab94bc2c.
2: place_id 133e76c8ced7070d
Request successful. Place name Frontenac Islands, Ontario returned for place_id 133e76c8ced7070d.
3: place_id 691faf928074005b
Request successful. Place name Midland, Ontario returned for place_id 691faf928074005b.
4: place_id 3f767a84de8478c5
Request successful. Place name East Kootenay F, British Columbia returned for place_id 3f767a84de8478c5.
5: place_id 7c09c2a06f095bcb
Request successful. Place name Northern Bruce Peninsula, Ontario returned for place_id 7c09c2a06f095bcb.
6: place_id 07d9db0d5e482001
Request successful. Place name Toronto Pearson International Airport (YYZ) returned for place_id 07d9db0d5e482001.
7: place_id 4334903d32077699
Request successful. Place name Hudson, Québec returned for place_id 4334903d32077699.
8: place_id 49ebfff28bf5f231
Request successful. Place name Rosthern, Saskatchewan returned for place_id 49ebf

In [None]:
# Pair each retried place_id with the name returned on the second attempt.
# place_id is a Series taken from places_df, so the rows keep their places_df labels.
not_found_places_df = pd.DataFrame(not_found_places)
not_found_places_df

Unnamed: 0,place_id,place_name
120,446dd74cab94bc2c,"North Bay, Ontario"
155,133e76c8ced7070d,"Frontenac Islands, Ontario"
241,691faf928074005b,"Midland, Ontario"
362,3f767a84de8478c5,"East Kootenay F, British Columbia"
483,7c09c2a06f095bcb,"Northern Bruce Peninsula, Ontario"
...,...,...
2982,209726d85549d2d5,"Memramcook, New Brunswick"
2983,2b283eb6bc155115,"Machin, Ontario"
2984,73fe89382cabbe53,"McCartney's Flat 4, British Columbia"
2985,532b2c07da89e14d,"Grimshaw, Alberta"


In [None]:
# Copy the names obtained on the second attempt back into places_df
for retried in not_found_places_df.itertuples(index=False):
    same_place = places_df['place_id'] == retried.place_id
    places_df.loc[same_place, 'place_name'] = retried.place_name

places_df

Unnamed: 0,place_id,place_name
0,3797791ff9c0e4c6,"Toronto, Ontario"
1,38d5974e82ed1a6c,"Ottawa, Ontario"
2,53504716d445dcad,"Calgary, Alberta"
3,6a6d896ba1cb5dc4,"Edmonton, Alberta"
4,1e5cb4d0509db554,"Vancouver, British Columbia"
...,...,...
2982,209726d85549d2d5,"Memramcook, New Brunswick"
2983,2b283eb6bc155115,"Machin, Ontario"
2984,73fe89382cabbe53,"McCartney's Flat 4, British Columbia"
2985,532b2c07da89e14d,"Grimshaw, Alberta"


In [None]:
# Attach the resolved name to every tweet with one vectorised lookup instead
# of building a boolean mask over all tweets once per place_id.
# keep='last' mirrors the old loop, where a later row for the same id won.
place_name_by_id = (
    places_df
    .drop_duplicates('place_id', keep='last')
    .set_index('place_id')['place_name']
)
tweets_df['place_name'] = tweets_df['place_id'].map(place_name_by_id)

tweets_df

Unnamed: 0.1,Unnamed: 0,date_time,tweet_id,author_id,place_id,tweet_text,cleaned_text,hashtags,place_name
0,0,2022-02-24T21:33:19.000Z,1496961461465542656,3240659214,38d5974e82ed1a6c,@JosephConwell7 @Reuters I live here &amp; the...,I live here and these far right bully boys pic...,[],"Ottawa, Ontario"
1,1,2022-02-24T20:28:33.000Z,1496945165365219333,1469167464101515267,0811cf61cd9ea52f,"Winnipeg seniors, wya?!! \nI’m doing Mini sess...","Winnipeg seniors, wya?!! Im doing Mini session...","['winnipeg', 'collegegrad']","Winnipeg, Manitoba"
2,2,2022-02-24T20:25:13.000Z,1496944326416510984,3355188729,3797791ff9c0e4c6,This is no joke. I have gone to several stores...,This is no joke. I have gone to several stores...,[],"Toronto, Ontario"
3,3,2022-02-24T19:44:18.000Z,1496934029542793224,21305650,3797791ff9c0e4c6,@cllrainslie Vaccines are great for the elderl...,Vaccines are great for the elderly and / or th...,[],"Toronto, Ontario"
4,4,2022-02-24T18:53:50.000Z,1496921328103804935,50382485,71bdc845bc7609c7,Be part of the movement from hallway care to h...,Be part of the movement from hallway care to h...,"['Woodstock', 'Healthcare']","Woodstock, Ontario"
...,...,...,...,...,...,...,...,...,...
57134,57134,2018-01-01T14:36:44.000Z,947839007450308608,777511588701937665,13e80e6f3ac67066,@mheavyhead I lived in India for 4 months and ...,I lived in India for 4 months and the amount o...,[],"Maple Ridge, British Columbia"
57135,57135,2018-01-01T06:29:56.000Z,947716501452177408,3164684096,53504716d445dcad,@CMcKerracher @themadsloth Refugees get nowher...,Refugees get nowhere close to that. That lies ...,[],"Calgary, Alberta"
57136,57136,2018-01-01T05:29:42.000Z,947701343443091456,28465428,626695e48d21858b,“Old women find me attractive.”\n-not me,Old women find me attractive. -not me,[],"Brandon, Manitoba"
57137,57137,2018-01-01T03:08:09.000Z,947665721349279745,26161882,5d058f2e9fe1516c,(I forgot to do this for 2016.)\n\nMy favourit...,(I forgot to do this for 2016.) My favourite 2...,[],"Halifax, Nova Scotia"


In [None]:
import re

# The region is whatever follows the last comma of place_name (the whole
# name when there is no comma), with surrounding whitespace removed.
# Done as one vectorised string operation: the previous per-tweet loop
# scanned the entire frame once per row and raised on a missing place_name.
tweets_df['region'] = tweets_df['place_name'].str.rsplit(',', n=1).str[-1].str.strip()

In [None]:
# Keep only the listed columns, in this order (any other column is dropped)
reordered_columns = [
    'tweet_id', 'date_time', 'author_id',
    'place_id', 'place_name', 'region',
    'tweet_text', 'cleaned_text', 'hashtags',
]
tweets_df = tweets_df.loc[:, reordered_columns]
tweets_df

Unnamed: 0,tweet_id,date_time,author_id,place_id,place_name,region,tweet_text,cleaned_text,hashtags
0,1496961461465542656,2022-02-24T21:33:19.000Z,3240659214,38d5974e82ed1a6c,"Ottawa, Ontario",Ontario,@JosephConwell7 @Reuters I live here &amp; the...,I live here and these far right bully boys pic...,[]
1,1496945165365219333,2022-02-24T20:28:33.000Z,1469167464101515267,0811cf61cd9ea52f,"Winnipeg, Manitoba",Manitoba,"Winnipeg seniors, wya?!! \nI’m doing Mini sess...","Winnipeg seniors, wya?!! Im doing Mini session...","['winnipeg', 'collegegrad']"
2,1496944326416510984,2022-02-24T20:25:13.000Z,3355188729,3797791ff9c0e4c6,"Toronto, Ontario",Ontario,This is no joke. I have gone to several stores...,This is no joke. I have gone to several stores...,[]
3,1496934029542793224,2022-02-24T19:44:18.000Z,21305650,3797791ff9c0e4c6,"Toronto, Ontario",Ontario,@cllrainslie Vaccines are great for the elderl...,Vaccines are great for the elderly and / or th...,[]
4,1496921328103804935,2022-02-24T18:53:50.000Z,50382485,71bdc845bc7609c7,"Woodstock, Ontario",Ontario,Be part of the movement from hallway care to h...,Be part of the movement from hallway care to h...,"['Woodstock', 'Healthcare']"
...,...,...,...,...,...,...,...,...,...
57134,947839007450308608,2018-01-01T14:36:44.000Z,777511588701937665,13e80e6f3ac67066,"Maple Ridge, British Columbia",British Columbia,@mheavyhead I lived in India for 4 months and ...,I lived in India for 4 months and the amount o...,[]
57135,947716501452177408,2018-01-01T06:29:56.000Z,3164684096,53504716d445dcad,"Calgary, Alberta",Alberta,@CMcKerracher @themadsloth Refugees get nowher...,Refugees get nowhere close to that. That lies ...,[]
57136,947701343443091456,2018-01-01T05:29:42.000Z,28465428,626695e48d21858b,"Brandon, Manitoba",Manitoba,“Old women find me attractive.”\n-not me,Old women find me attractive. -not me,[]
57137,947665721349279745,2018-01-01T03:08:09.000Z,26161882,5d058f2e9fe1516c,"Halifax, Nova Scotia",Nova Scotia,(I forgot to do this for 2016.)\n\nMy favourit...,(I forgot to do this for 2016.) My favourite 2...,[]


In [None]:
tweets_df['region'].value_counts()[:11]

Ontario                      27150
British Columbia              9161
Alberta                       8178
Québec                        2105
Manitoba                      2053
Nova Scotia                   2004
Saskatchewan                  1476
Newfoundland and Labrador     1001
New Brunswick                  795
Canada                         356
Prince Edward Island           291
Name: region, dtype: int64

In [None]:
# index=False: this file is read back by pd.read_csv at the top of the notebook,
# and writing the index each time adds another 'Unnamed: 0' column on every run.
tweets_df.to_csv('cleaned_elderly_tweets.csv', index=False)

### 3. Exporting to json format for Labelbox labeling

In [None]:
# First 10 tweets, used as a small sample for a trial Labelbox export
labelbox_test = tweets_df[:10]

In [None]:
labelbox_test

Unnamed: 0.1,Unnamed: 0,date_time,tweet_id,author_id,place_id,tweet_text,cleaned_text
0,0,2022-02-24T21:33:19.000Z,1496961461465542656,3240659214,38d5974e82ed1a6c,@JosephConwell7 @Reuters I live here &amp; the...,i live here amp these far right bully boys pic...
1,1,2022-02-24T20:28:33.000Z,1496945165365219333,1469167464101515267,0811cf61cd9ea52f,"Winnipeg seniors, wya?!! \nI’m doing Mini sess...",winnipeg seniors wya im doing mini sessions st...
2,2,2022-02-24T20:25:13.000Z,1496944326416510984,3355188729,3797791ff9c0e4c6,This is no joke. I have gone to several stores...,this is no joke i have gone to several stores ...
3,3,2022-02-24T19:44:18.000Z,1496934029542793224,21305650,3797791ff9c0e4c6,@cllrainslie Vaccines are great for the elderl...,vaccines are great for the elderly and or the...
4,4,2022-02-24T18:53:50.000Z,1496921328103804935,50382485,71bdc845bc7609c7,Be part of the movement from hallway care to h...,be part of the movement from hallway care to h...
5,5,2022-02-24T15:10:38.000Z,1496865157967925268,3130104741,5f102b4a7cc3d42e,@msemilyrushton @CTVNews Old men start wars yp...,old men start wars ypung people fight them
6,6,2022-02-24T14:25:12.000Z,1496853721283797002,633402666,5c43cbdfce4d3247,God save us from old men in suits.,god save us from old men in suits
7,7,2022-02-24T14:19:46.000Z,1496852357539418116,1012293706077138944,4a0cd47b60820d6f,Not impressed with Nova Scotia’s reopening pla...,not impressed with nova scotias reopening plan...
8,8,2022-02-24T13:46:04.000Z,1496843876916727809,770021584681701379,701b7d92002f366d,"@Kalandras_ In a recent one shot, two characte...",in a recent one shot two characters were grump...
9,9,2022-02-24T03:54:06.000Z,1496694903690174467,2151994626,3797791ff9c0e4c6,@oldpappy59 @FrankKhalidUK During the peak of ...,during the peak of the pandemic roman bought a...


In [None]:
labelbox_dict = {}
# Select the text column by name rather than by position: after the column
# reordering of tweets_df the last column is 'hashtags', not 'cleaned_text'.
labelbox_dict['data'] = list(labelbox_test['cleaned_text'])
labelbox_dict['externalId'] = 'tweets.txt'
labelbox_dict

{'data': ['i live here amp these far right bully boys picked on poor elderly amp vulnerable consistently punching down like cowards stealing food from homeless abusing residents of women amp street youth shelters threatening to sexually assault girls smashing windows of lgbqt families',
  'winnipeg seniors wya im doing mini sessions starting march dm me to shoot winnipeg collegegrad',
  'this is no joke i have gone to several stores today to try and buy incontinence products the shelves are almost empty amazon deliveries dates are lengthy is this supply chains truckers blocking the borders or what why must the elderly and disabled suffer humiliation',
  'vaccines are great for the elderly and  or the sick',
  'be part of the movement from hallway care to home care help our aging population age well at home seeking allied health manager in woodstock on healthcare',
  'old men start wars ypung people fight them',
  'god save us from old men in suits',
  'not impressed with nova scotias r

In [None]:
import json

# Serialise the Labelbox payload and write it to disk
with open('labelbox_test.json', 'w') as json_file:
    json_file.write(json.dumps(labelbox_dict))