# Get All Tweets with #PodRevDay

In [1]:
# Get the data: one-off scrape of every #PodRevDay tweet into a JSON Lines file (uncomment to re-run)
# !snscrape --jsonl twitter-hashtag "#podrevday since:2020-01-01 until:2021-10-10" > 'data/jan20_oct21.json'

In [2]:
import pandas as pd
from geotext import GeoText

# Load the scraped tweets. lines=True because the scrape was written with --jsonl,
# i.e. one JSON object per line rather than a single JSON array.
df_tweets = pd.read_json('data/jan20_oct21.json', lines=True)

In [3]:
# Sanity check: (number of tweets, number of columns) that were loaded.
df_tweets.shape

(3163, 21)

In [4]:
# Keep a CSV snapshot of the loaded tweets. It gets its own .csv path on purpose:
# writing CSV to 'data/jan20_oct21.json' would overwrite the raw JSON Lines scrape
# that pd.read_json(..., lines=True) loads, so the notebook could not be re-run.
df_tweets.to_csv('data/jan20_oct21.csv', index=False)

# Get User Data

In [5]:
# Each tweet carries its author's profile as a nested dict in the `user` column;
# look at the first one to see which profile fields are available.
df_tweets.user[0]

{'username': 'thatangellove',
 'displayname': 'Angel Lovecchio',
 'id': 2577871830,
 'description': 'Clarifying complexities so that we realize our power to create the world around us.',
 'rawDescription': 'Clarifying complexities so that we realize our power to create the world around us.',
 'descriptionUrls': [],
 'verified': False,
 'created': '2014-06-20T02:43:31+00:00',
 'followersCount': 56,
 'friendsCount': 259,
 'statusesCount': 138,
 'favouritesCount': 384,
 'listedCount': 0,
 'mediaCount': 22,
 'location': 'TN ',
 'protected': False,
 'linkUrl': None,
 'linkTcourl': None,
 'profileImageUrl': 'https://pbs.twimg.com/profile_images/1380323964300443650/YnkqquJF_normal.jpg',
 'profileBannerUrl': 'https://pbs.twimg.com/profile_banners/2577871830/1611515065',
 'url': 'https://twitter.com/thatangellove'}

In [7]:
# Tweet URLs look like https://twitter.com/<username>/status/<id>, so splitting on '/'
# leaves the author's handle at position 3. This column is the join key used later.
url_parts = df_tweets['url'].str.split('/')
df_tweets['username'] = url_parts.str.get(3)

In [8]:
# Expand the per-tweet user dicts into a table: one row per tweet, one column per profile field.
pod_rev_users = pd.DataFrame(df_tweets['user'].tolist())

In [9]:
# Transposed preview: profile fields as rows, the first five tweets' authors as columns.
pod_rev_users.head().T

Unnamed: 0,0,1,2,3,4
username,thatangellove,thatangellove,chainofbeing,greenhorizonpod,CaseyBroda
displayname,Angel Lovecchio,Angel Lovecchio,Chain of Being | Mythic Science Fiction Podcast,The Green Horizon | Irish Indie Audio Drama 🇮🇪🚀,Casey Broda
id,2577871830,2577871830,836274023180951552,1212088323885301760,533456292
description,Clarifying complexities so that we realize our...,Clarifying complexities so that we realize our...,A glimpse into the future: where gods control ...,That show about Irish people in space. Proud m...,Podcast Editor & Manager. Photographer. Hiker.
rawDescription,Clarifying complexities so that we realize our...,Clarifying complexities so that we realize our...,A glimpse into the future: where gods control ...,That show about Irish people in space. Proud m...,Podcast Editor & Manager. Photographer. Hiker.
descriptionUrls,[],[],[],[],[]
verified,False,False,False,False,False
created,2014-06-20T02:43:31+00:00,2014-06-20T02:43:31+00:00,2017-02-27T17:57:19+00:00,2019-12-31T19:09:21+00:00,2012-03-22T18:35:49+00:00
followersCount,56,56,669,881,415
friendsCount,259,259,1086,670,431


In [10]:
# Number of distinct authors; the table still has one row per tweet at this point.
len(set(pod_rev_users.username))

617

In [11]:
# Collapse to one row per author. drop_duplicates keeps the first row seen for each
# username; the original row index is retained (it is not reset).
pod_rev_users = pod_rev_users.drop_duplicates(subset=['username'])
# Should equal the distinct-username count computed before de-duplicating.
len(pod_rev_users)

617

In [12]:
# Same transposed preview after de-duplication: each column is now a different author.
pod_rev_users.head().T

Unnamed: 0,0,2,3,4,5
username,thatangellove,chainofbeing,greenhorizonpod,CaseyBroda,bl_save
displayname,Angel Lovecchio,Chain of Being | Mythic Science Fiction Podcast,The Green Horizon | Irish Indie Audio Drama 🇮🇪🚀,Casey Broda,BL Can't Save You But...
id,2577871830,836274023180951552,1212088323885301760,533456292,1384179676491390978
description,Clarifying complexities so that we realize our...,A glimpse into the future: where gods control ...,That show about Irish people in space. Proud m...,Podcast Editor & Manager. Photographer. Hiker.,Offical account for 'BL Can't Save You But...'...
rawDescription,Clarifying complexities so that we realize our...,A glimpse into the future: where gods control ...,That show about Irish people in space. Proud m...,Podcast Editor & Manager. Photographer. Hiker.,Offical account for 'BL Can't Save You But...'...
descriptionUrls,[],[],[],[],[]
verified,False,False,False,False,False
created,2014-06-20T02:43:31+00:00,2017-02-27T17:57:19+00:00,2019-12-31T19:09:21+00:00,2012-03-22T18:35:49+00:00,2021-04-19T16:22:40+00:00
followersCount,56,669,881,415,46
friendsCount,259,1086,670,431,40


In [13]:
# List every available profile field (printed, so that the shape below can still be
# the cell's displayed value) before choosing which ones to keep.
print(pod_rev_users.columns)
pod_rev_users.shape

Index(['username', 'displayname', 'id', 'description', 'rawDescription',
       'descriptionUrls', 'verified', 'created', 'followersCount',
       'friendsCount', 'statusesCount', 'favouritesCount', 'listedCount',
       'mediaCount', 'location', 'protected', 'linkUrl', 'linkTcourl',
       'profileImageUrl', 'profileBannerUrl', 'url'],
      dtype='object')


(617, 21)

# Clean User Data

In [14]:
# Keep only the profile fields needed for the geographic analysis.
# .copy() makes df_users an independent frame: the cleaning functions below add and
# overwrite columns in place, and that must neither touch pod_rev_users nor trigger
# pandas' SettingWithCopyWarning.
user_columns = ['id', 'username', 'displayname', 'location', 'created',
                'followersCount', 'friendsCount', 'url', 'verified']
df_users = pod_rev_users.loc[:, user_columns].copy()

In [15]:
def location_extraction(df):
    '''Add ``geotext``, ``city`` and ``country`` columns parsed from ``location``.

    Missing locations are first replaced by the placeholder string "blank".
    Each location is then wrapped in a ``GeoText`` object (stored in
    ``geotext``), whose ``cities`` and ``countries`` attributes become the
    ``city`` and ``country`` columns.

    The frame is modified in place and also returned.
    '''
    locations = df["location"].fillna("blank")
    df.loc[:, "location"] = locations
    df.loc[:, "geotext"] = locations.apply(GeoText)

    # (new column, GeoText attribute it is read from)
    for column, attribute in (("city", "cities"), ("country", "countries")):
        df.loc[:, column] = df["geotext"].apply(
            lambda parsed, attr=attribute: getattr(parsed, attr)
        )

    return df

# Parse every user's free-text location; adds the geotext / city / country columns.
df_users = location_extraction(df_users)

In [16]:
from geonamescache import GeonamesCache
# Build a country-name -> ISO3 lookup table from geonamescache.
gc = GeonamesCache()
countries = gc.get_countries()

# The transpose turns the dict's keys into rows and its fields into columns;
# set_index + reset_index just moves `geonameid` to the first column.
country_info = (
    pd.DataFrame(countries)
    .T
    .set_index('geonameid')
    .reset_index()
)
name_code = country_info[["name", "iso3"]]

us_states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
             "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA",
             "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY",
             "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX",
             "UT", "VT", "VA", "WA", "WV", "WI", "WY", "USA", "United States",
             'Seattle', "Los Angeles", "Houston", "Atlanta", "Pittsburgh"]

us_state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", 
               "District ", "of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", 
               "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", 
               "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", 
               "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", 
               "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", 
               "Wisconsin", "West Virginia", "Wyoming"]

can_prov_abbrev = {'Alberta': 'AB','British Columbia': 'BC','Manitoba': 'MB', 'New Brunswick': 'NB',
                       'Newfoundland and Labrador': 'NL', 'Northwest Territories': 'NT','Nova Scotia': 'NS','Nunavut': 'NU',
                       'Ontario': 'ON','Prince Edward Island': 'PE', 'Quebec': 'QC','Saskatchewan': 'SK','Yukon': 'YT'}

can_prov_names, can_prov_abbr = zip(*can_prov_abbrev.items())

uk = ["England", 'Wales', "Scotland", 'London', "Manchester", "Isle of Wight", "Northern Ireland", "United Kingdom", 'Bailiwick of Guernsey', "UK", "Hoxton", "Jersey"]

india_city = ["Bangalore", "Delhi", "Hyderabad", "Bengaluru"]

german_city = ["Munich", "Berlin", "eisgau","Hamburg", "Dortmund"]

south_africa = ["South Africa", "Durban", "Johannesburg"]

south_korea = ["Republic of Korea"]

uae = ['UAE', 'Dubai', 'Abu Dhabi']


def replacer(area, name, df=None):
    '''Set ``country`` to ``name`` on every row whose ``location`` mentions a term in ``area``.

    Parameters
    ----------
    area : iterable of str
        Search terms. A row matches when its ``location`` contains any of them
        as a case-sensitive substring (not a whole-word match).
    name : str
        Country name written into ``country`` for the matching rows.
    df : pandas.DataFrame, optional
        Frame to update. Defaults to the notebook-level ``df_users``, which is
        what the original two-argument call operates on.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place.
    '''
    import re  # local import so this cell does not depend on the notebook's import cell

    if df is None:
        df = df_users

    # Escape each term so it is matched literally even if it contains regex
    # metacharacters such as '.', '+' or '('.
    terms = [re.escape(str(term)) for term in area]
    if not terms:
        # An empty pattern would match every row and overwrite all countries.
        return df

    # na=False: rows with a missing location simply do not match, instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    matches = df["location"].str.contains("|".join(terms), na=False)
    df.loc[matches, "country"] = name
    return df

def list_to_string(df, sep=""):
    '''Flatten the list-valued ``city`` and ``country`` columns into plain strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city`` and ``country`` columns whose cells are either
        lists of names or already-flattened strings.
    sep : str, optional
        Separator placed between multiple names in one cell. The default ``""``
        keeps the historical output, where two names run together (for example
        "PolandSerbia"); pass ``", "`` for readable values.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place.
    '''
    def _flatten(value):
        # Cells that are already strings are kept whole; joining them would
        # insert `sep` between their individual characters.
        if isinstance(value, str):
            return value
        return sep.join(map(str, value))

    for column in ("city", "country"):
        df[column] = df[column].apply(_flatten)

    return df

In [17]:
def location_cleaner(df):
    '''Fill in ``country`` from known place names, then flatten ``city``/``country`` to strings.

    The rules are applied in order, so when a location matches several rules
    the last matching one wins.

    NOTE: ``replacer`` is called with only (terms, country), so it updates the
    notebook-level ``df_users`` and returns that frame; the ``df`` argument is
    rebound to it on the first rule.
    NOTE(review): rules are substring matches, so e.g. the UK term "Jersey"
    also matches "New Jersey" and overrides the earlier United States rule -
    confirm whether the rule order should change.

    Returns the cleaned frame.
    '''
    rules = [
        (can_prov_names, "Canada"),
        # Must be the tuple of abbreviations: passing the can_prov_abbrev dict
        # iterates its keys, i.e. the province names a second time.
        (can_prov_abbr, "Canada"),
        (us_state_names, "United States"),
        (us_states, "United States"),
        (uk, "United Kingdom"),
        (german_city, "Germany"),
        (south_africa, "South Africa"),
        (india_city, "India"),
        (uae, 'United Arab Emirates'),
        (south_korea, "South Korea"),
    ]
    for area, name in rules:
        df = replacer(area, name)

    # One-off exact-match corrections.
    df.loc[(df.location == "Italia"), "country"] = "Italy"
    df.loc[(df.location == "Belgrade"), "country"] = "Serbia"

    df = list_to_string(df)

    # Only after flattening is `country` a string, so only here can a cell equal
    # "PolandSerbia" (two country names joined without a separator).
    df.loc[(df.country == "PolandSerbia"), "country"] = "Poland"

    return df

In [18]:
# Apply the country rules and flatten the city/country lists to strings.
df_users = location_cleaner(df_users)
# After flattening, a row with two country names reads "PolandSerbia" (joined with no
# separator); resolve that case to Poland.
df_users.loc[(df_users.country == "PolandSerbia"), "country"] = "Poland" 

In [19]:
# Users per inferred country. The empty-string bucket is users for whom no country
# could be determined.
df_users.country.value_counts()

                        290
United States           204
United Kingdom           55
Canada                   26
Germany                   6
Ireland                   4
Australia                 3
Croatia                   3
Spain                     3
Poland                    3
India                     3
New Zealand               3
France                    2
Malaysia                  2
South Africa              2
Sweden                    1
Italy                     1
Nigeria                   1
Ecuador                   1
South Korea               1
United Arab Emirates      1
Czech Republic            1
Vanuatu                   1
Name: country, dtype: int64

In [20]:
# Attach the ISO3 code to each user by matching the inferred country name.
# Left join: users with no (or an unrecognised) country are kept with empty name/iso3.
df_users_full = df_users.merge(
    name_code,
    how='left',
    left_on='country',
    right_on='name',
)

In [21]:
# Preview the users with the matched country name and ISO3 code appended.
df_users_full.head()

Unnamed: 0,id,username,displayname,location,created,followersCount,friendsCount,url,verified,geotext,city,country,name,iso3
0,2577871830,thatangellove,Angel Lovecchio,TN,2014-06-20T02:43:31+00:00,56,259,https://twitter.com/thatangellove,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,United States,United States,USA
1,836274023180951552,chainofbeing,Chain of Being | Mythic Science Fiction Podcast,,2017-02-27T17:57:19+00:00,669,1086,https://twitter.com/chainofbeing,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,,,
2,1212088323885301760,greenhorizonpod,The Green Horizon | Irish Indie Audio Drama 🇮🇪🚀,"Waterford, Ireland 🇮🇪",2019-12-31T19:09:21+00:00,881,670,https://twitter.com/greenhorizonpod,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,Waterford,Ireland,Ireland,IRL
3,533456292,CaseyBroda,Casey Broda,"Currently Seattle, WA",2012-03-22T18:35:49+00:00,415,431,https://twitter.com/CaseyBroda,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,United States,United States,USA
4,1384179676491390978,bl_save,BL Can't Save You But...,"California, USA",2021-04-19T16:22:40+00:00,46,40,https://twitter.com/bl_save,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,United States,United States,USA


## Save the Users

In [22]:
# Persist the cleaned user table. This writes df_users (which has no iso3 column), and
# since index is not disabled the row index is written too, along with the `geotext`
# object column.
# NOTE(review): confirm whether df_users_full (with ISO3 codes) was the intended export.
df_users.to_csv('data/user_data.csv')

# Merge Users' Geographic Data with Tweets

In [23]:
# Attach each author's cleaned profile (including city/country) to every one of their
# tweets. Left join on the handle keeps all tweets; columns present in both frames
# get pandas' _x (tweet) / _y (user) suffixes.
full_df = df_tweets.merge(df_users, how='left', on='username')

full_df.to_csv('data/tweets_users_oct_21.csv', index=False)

In [24]:
# The row count should still equal the number of tweets: a left join onto
# de-duplicated users must not add or drop rows.
full_df.shape

(3163, 33)

In [25]:
# Preview the merged tweet + user table.
full_df.head()

Unnamed: 0,url_x,date,content,renderedContent,id_x,user,outlinks,tcooutlinks,replyCount,retweetCount,...,displayname,location,created,followersCount,friendsCount,url_y,verified,geotext,city,country
0,https://twitter.com/thatangellove/status/14469...,2021-10-09 22:50:07+00:00,TODAY IS #PodRevDay!!!!!!!\n\n 1. Leave a pod...,TODAY IS #PodRevDay!!!!!!!\n\n 1. Leave a pod...,1446971262782386179,"{'username': 'thatangellove', 'displayname': '...",[https://www.podchaser.com/podcasts/clearing-c...,[https://t.co/UnsE0dnB6B],0,0,...,Angel Lovecchio,TN,2014-06-20T02:43:31+00:00,56,259,https://twitter.com/thatangellove,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,United States
1,https://twitter.com/thatangellove/status/14469...,2021-10-09 22:49:05+00:00,TODAY IS #PodRevDay!!!!!!!\n\n 1. Leave a pod...,TODAY IS #PodRevDay!!!!!!!\n\n 1. Leave a pod...,1446971005432483852,"{'username': 'thatangellove', 'displayname': '...",[https://www.podchaser.com/podcasts/curious-id...,[https://t.co/rg30RyvLYi],0,0,...,Angel Lovecchio,TN,2014-06-20T02:43:31+00:00,56,259,https://twitter.com/thatangellove,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,United States
2,https://twitter.com/chainofbeing/status/144691...,2021-10-09 19:10:55+00:00,"Hey, guess what?\nYou should drop a review for...","Hey, guess what?\nYou should drop a review for...",1446916099677827074,"{'username': 'chainofbeing', 'displayname': 'C...",[],[],0,3,...,Chain of Being | Mythic Science Fiction Podcast,,2017-02-27T17:57:19+00:00,669,1086,https://twitter.com/chainofbeing,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,
3,https://twitter.com/greenhorizonpod/status/144...,2021-10-09 19:03:17+00:00,Am I too late for #PodRevDay ? Yes...Most defi...,Am I too late for #PodRevDay ? Yes...Most defi...,1446914179768664067,"{'username': 'greenhorizonpod', 'displayname':...",[https://www.podchaser.com/podcasts/the-green-...,[https://t.co/vFhNhYDvp9],0,4,...,The Green Horizon | Irish Indie Audio Drama 🇮🇪🚀,"Waterford, Ireland 🇮🇪",2019-12-31T19:09:21+00:00,881,670,https://twitter.com/greenhorizonpod,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,Waterford,Ireland
4,https://twitter.com/CaseyBroda/status/14468808...,2021-10-09 16:50:48+00:00,Yesterday was #PodRevDay but this morning I li...,Yesterday was #PodRevDay but this morning I li...,1446880840869289984,"{'username': 'CaseyBroda', 'displayname': 'Cas...",[https://open.spotify.com/episode/1rayJEyhthua...,[https://t.co/WNzDjoC2bH],0,2,...,Casey Broda,"Currently Seattle, WA",2012-03-22T18:35:49+00:00,415,431,https://twitter.com/CaseyBroda,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,United States


In [1]:
import pandas as pd 

In [24]:
# Reload the merged tweet + user table from the CSV written by the merge step, so the
# cells from here on can run without the in-memory frames.
df = pd.read_csv('data/tweets_users_oct_21.csv')

In [25]:
# Tweets per author, repeated on every one of that author's rows: transform('count')
# returns a value for each input row instead of one row per group.
df['count'] = (
    df.groupby('username')['username']
    .transform('count')
)

In [26]:
# Ascending sort by tweet count (displayed only, not assigned): the least active
# authors come first and the most active ones last.
df.sort_values(by='count')

Unnamed: 0,url_x,date,content,renderedContent,id_x,user,outlinks,tcooutlinks,replyCount,retweetCount,...,location,created,followersCount,friendsCount,url_y,verified,geotext,city,country,count
1581,https://twitter.com/BookingPod/status/13587979...,2021-02-08 15:20:56+00:00,So cool! \nHappy #PodRevDay 🥳🥳🥳 !!!!! https://...,So cool! \nHappy #PodRevDay 🥳🥳🥳 !!!!! twitter....,1358797970225979397,"{'username': 'BookingPod', 'displayname': 'Pod...",['https://twitter.com/Podchaser/status/1358772...,['https://t.co/uaTiR09KgB'],1,0,...,,2020-07-29T14:04:50+00:00,289,407,https://twitter.com/BookingPod,False,<geotext.geotext.GeoText object at 0x7f181a08e...,,,1
1727,https://twitter.com/PodRealms/status/133642444...,2020-12-08 21:36:32+00:00,"Today is #PodRevDay, praise is the currency th...","Today is #PodRevDay, praise is the currency th...",1336424446756786176,"{'username': 'PodRealms', 'displayname': 'Litt...",[],[],2,4,...,United States,2018-08-08T01:30:46+00:00,2101,1757,https://twitter.com/PodRealms,False,<geotext.geotext.GeoText object at 0x7f181a08e...,,United States,1
1730,https://twitter.com/TedHudson_/status/13364089...,2020-12-08 20:34:58+00:00,@GreenBenchStory I've left my review for @gree...,@GreenBenchStory I've left my review for @gree...,1336408951739584517,"{'username': 'TedHudson_', 'displayname': 'Ted...",[],[],0,2,...,"Hamilton, Ontario",2010-08-20T16:55:08+00:00,3757,4993,https://twitter.com/TedHudson_,False,<geotext.geotext.GeoText object at 0x7f181a08e...,HamiltonOntario,Canada,1
338,https://twitter.com/BrettFishA/status/14353373...,2021-09-07 20:21:13+00:00,"September 8th is Podcast Review Day, #podrevda...","September 8th is Podcast Review Day, #podrevda...",1435337379615756296,"{'username': 'BrettFishA', 'displayname': 'bre...",['http://www.podrevday.com'],['https://t.co/GIzXBOm0iP'],1,4,...,"Cape Town, South Africa",2009-04-25T15:01:42+00:00,7537,8064,https://twitter.com/BrettFishA,False,<geotext.geotext.GeoText object at 0x7f1819e98...,Cape Town,South Africa,1
337,https://twitter.com/notaladypodcast/status/143...,2021-09-07 20:25:20+00:00,#PodRevDay is coming up! Which shows will you ...,#PodRevDay is coming up! Which shows will you ...,1435338415906557953,"{'username': 'notaladypodcast', 'displayname':...",['https://twitter.com/PodRevDay/status/1435330...,['https://t.co/8jWY0N48oN'],1,1,...,,2020-08-04T23:42:51+00:00,145,110,https://twitter.com/notaladypodcast,False,<geotext.geotext.GeoText object at 0x7f1819e98...,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100,https://twitter.com/PodRevDay/status/130333134...,2020-09-08 13:56:21+00:00,@themenasaur @SeenNotHeardPod @Podchaser Now t...,@themenasaur @SeenNotHeardPod @Podchaser Now t...,1303331342319587332,"{'username': 'PodRevDay', 'displayname': 'Podc...",[],[],0,0,...,Republic of Croatia,2020-05-07T19:40:12+00:00,1780,1905,https://twitter.com/PodRevDay,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,Croatia,421
2101,https://twitter.com/PodRevDay/status/130333119...,2020-09-08 13:55:46+00:00,"@DanRMorris hi @danrmo, this sounds great. Wha...","@DanRMorris hi @danrmo, this sounds great. Wha...",1303331197288960000,"{'username': 'PodRevDay', 'displayname': 'Podc...",[],[],0,0,...,Republic of Croatia,2020-05-07T19:40:12+00:00,1780,1905,https://twitter.com/PodRevDay,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,Croatia,421
2104,https://twitter.com/PodRevDay/status/130332966...,2020-09-08 13:49:41+00:00,P\nO\nD\nR\nE\nV\nD\nA\nY\nis \nhere! \n1. wri...,P\nO\nD\nR\nE\nV\nD\nA\nY\nis \nhere! \n1. wri...,1303329664761556993,"{'username': 'PodRevDay', 'displayname': 'Podc...",['http://www.podrevday.com'],['https://t.co/oet2vmUjAb'],4,10,...,Republic of Croatia,2020-05-07T19:40:12+00:00,1780,1905,https://twitter.com/PodRevDay,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,Croatia,421
117,https://twitter.com/PodRevDay/status/144647716...,2021-10-08 14:06:46+00:00,@Podchaser @Podchaser the email you sent out t...,@Podchaser @Podchaser the email you sent out t...,1446477169719451648,"{'username': 'PodRevDay', 'displayname': 'Podc...",['https://www.youtube.com/watch?v=VF-r5TtlT9w'],['https://t.co/5dzDQqLD4d'],0,0,...,Republic of Croatia,2020-05-07T19:40:12+00:00,1780,1905,https://twitter.com/PodRevDay,False,<geotext.geotext.GeoText object at 0x7f1819ce0...,,Croatia,421


In [27]:
# List the merged columns to pick the user-level ones; _x columns come from the tweet
# table and _y columns from the user table.
df.columns

Index(['url_x', 'date', 'content', 'renderedContent', 'id_x', 'user',
       'outlinks', 'tcooutlinks', 'replyCount', 'retweetCount', 'likeCount',
       'quoteCount', 'conversationId', 'lang', 'source', 'sourceUrl',
       'sourceLabel', 'media', 'retweetedTweet', 'quotedTweet',
       'mentionedUsers', 'username', 'id_y', 'displayname', 'location',
       'created', 'followersCount', 'friendsCount', 'url_y', 'verified',
       'geotext', 'city', 'country', 'count'],
      dtype='object')

In [29]:
# User-level columns only; the `geotext` object column and all tweet columns are left out.
keep = [
    'username', 'id_y', 'displayname', 'location',
    'created', 'followersCount', 'friendsCount', 'url_y', 'verified',
    'city', 'country', 'count',
]

# These columns are identical on every tweet by the same author, so dropping duplicate
# rows leaves one row per author. Export sorted by tweet count, ascending.
(
    df[keep]
    .drop_duplicates()
    .sort_values(by='count')
    .to_csv('data/user_data_count_10_21.csv', index=False)
)