# Data Collection

In [11]:
#Necessary imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import tweepy
from datetime import date
import pickle 
import time

In [9]:
#Get twitter credentials from AppCred.py.
#You must have your own credentials stored in working dir
from AppCred import API_KEY, API_SECRET
from AppCred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET

## Load MP data

In [5]:
#Filter by Australian Parlamentarians
#Download the file full_member_info at http://twitterpoliticians.org/download
mp_df = pd.read_csv("full_member_info.csv")
mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid']].copy()
mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
filtered_df =  mp_df.loc[mp_df["m.uid"] != "\\N"]

## Get twitter handles

In [None]:
#Get the twitter handle of each politicians
#and add it to the df
def get_handle(uid):
    try:
        handle = api.get_user(uid).screen_name
    except:
        handle = np.nan
    return handle
            
filtered_df["twitter_handle"] = filtered_df["m.uid"].apply(lambda x: get_handle(x))

In [None]:
filtered_df.to_csv("australian_mps") #Create csv

## Get Tweets

Having downloaded the csv with australians mps you can start from here.

In [10]:
filtered_df = pd.read_csv("data/australian_mps.csv")
#Get a list of unique twitter handles
handles = filtered_df["twitter_handle"].unique().tolist()

In [38]:
#Remove private and "nan"
handles.remove("JohnAlexanderMP")
handles.remove(np.nan)

In [40]:
class TweetCollector:
    
    def __init__(self):
        self.api = self.connect_api()
    

    def connect_api(self):
        """
        Connect to the API upon initalizing that class. You need to have
        your own credentials imported
        """
        auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        api = tweepy.API(auth, wait_on_rate_limit= True, 
                         wait_on_rate_limit_notify=True, 
                         retry_count = 10, retry_delay = 5, 
                         retry_errors= set([401, 404, 500, 503]))
        
        return api
        
    def get_tweets(self, screen_name):
        """
        Given a screen name this method tries to fetch the last 3200 tweets (maximum allowed).
        """
        #initialize a list to hold all the tweepy Tweets
        all_tweets = []  
    
        #make initial request for most recent tweets
        new_tweets = self.api.user_timeline(screen_name = screen_name,
                                            count=200, tweet_mode = "extended")
    
        #save most recent tweets
        all_tweets.extend(new_tweets)
    
        #save the id of the oldest tweet less one
        oldest = all_tweets[-1].id - 1
            
            
        #keep grabbing tweets until there are no tweets left to grab
        while len(new_tweets) > 0:
            try:
                #all subsiquent requests use the max_id param to prevent duplicates
                new_tweets = self.api.user_timeline(screen_name = screen_name,count=200,
                                                    max_id=oldest, tweet_mode = "extended")

                #save most recent tweets
                all_tweets.extend(new_tweets)

                #update the id of the oldest tweet less one
                oldest = all_tweets[-1].id - 1
                    
            except tweepy.TweepError as e:
                print(e.reason)
                time.sleep(60)
                continue
                
            
        return all_tweets
    
    
    def pickle_dump(self, screen_names, dump = True):
        """
        Given a list of screen names, this method returns a dictionary
        containing all the fetchable tweets from the list of users.
        Dumps everything as a pickle file locally
        """
        
        if not isinstance(screen_names, list):
            screen_names = [screen_names]
        
        all_tweets = {}
        
        for screen_name in tqdm(screen_names):
            try:
                tweets = self.get_tweets(screen_name)
                all_tweets[screen_name] = tweets
            except:
                print(f"could not get {screen_name}")
                continue
         
        #If True dumps all the tweets in a pickle file
        if dump:
            with open(f'pickled_tweets_{str(date.today())}.data', 'wb') as f:
                # store the data as binary data stream
                pickle.dump(all_tweets, f)
                
        return all_tweets


In [41]:
#Initiate collector object
collector = TweetCollector()
#Dump tweets from 3 MPs
tweets = collector.pickle_dump(handles)

 99%|█████████▉| 142/143 [1:27:50<00:50, 50.57s/it]

could not get TrevorEvansBne


100%|██████████| 143/143 [1:27:55<00:00, 36.89s/it]


In [158]:
tweets["AlanTudgeMP"][40].entities#["urls"][0]["expanded_url"]

{'hashtags': [{'text': 'TerryYoungMP', 'indices': [19, 32]}],
 'symbols': [],
 'user_mentions': [],
 'urls': [],
 'media': [{'id': 1384001422681722888,
   'id_str': '1384001422681722888',
   'indices': [190, 213],
   'media_url': 'http://pbs.twimg.com/media/EzT19g-UUAgoU9m.jpg',
   'media_url_https': 'https://pbs.twimg.com/media/EzT19g-UUAgoU9m.jpg',
   'url': 'https://t.co/BsVZaTmNyW',
   'display_url': 'pic.twitter.com/BsVZaTmNyW',
   'expanded_url': 'https://twitter.com/AlanTudgeMP/status/1384001428146982923/photo/1',
   'type': 'photo',
   'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
    'large': {'w': 1024, 'h': 768, 'resize': 'fit'},
    'medium': {'w': 1024, 'h': 768, 'resize': 'fit'},
    'small': {'w': 680, 'h': 510, 'resize': 'fit'}}}]}

In [29]:
#Tweets is now a dictionary where the keys are politicians and values a list of all their tweets
print(tweets.keys())
#Amount of tweets first MP
print(f"Amount of tweets {len(tweets['AlanTudgeMP'])}")
#To access the first tweet object of the first MP
#tweets["AlanTudgeMP"][0].user.followers

dict_keys(['AlanTudgeMP'])
Amount of tweets 2848


In [160]:
class BuildTweetDF:
    """
    This class builds a Pandas dataframe using a pickle dump 
    of Tweepy tweet objects as collected by the TweetCollector
    """
    
    def __init__(self, pickle_dump):
        self.all_tweets = self.pickle_open(pickle_dump)
    
    def pickle_open(self, pickle_dump):
        with open(pickle_dump, 'rb') as f:
        # read the data as binary data stream
            all_tweets = pickle.load(f)
        
        return all_tweets
    
    def get_df(self):
        
        final_df_lst = []
        
        for politician, tweets in self.all_tweets.items():
            
            #Empty list for df. More things can be added later
            screen_name = []
            created_at = []
            full_text = []
            favorite_count = []
            retweet_count = []
            tweet_id = []
            in_reply_to_screen_name = []
            hashtags = []
            user_mentions = []
            urls = []
            image = []
            
            for tweet in tweets:
                
                screen_name.append(tweet.user.screen_name)
                created_at.append(tweet.created_at)
                full_text.append(tweet.full_text)
                favorite_count.append(tweet.favorite_count)
                retweet_count.append(tweet.retweet_count)
                tweet_id.append(tweet.id)
                in_reply_to_screen_name.append(tweet.in_reply_to_screen_name)
                
                user_mentions.append([i["screen_name"] for i in tweet.entities["user_mentions"]])
                hashtags.append([i["text"] for i in tweet.entities["hashtags"]])
                
                try:
                    urls.append(tweet.entities["urls"][0]["expanded_url"])
                except:
                    urls.append(np.nan)    
                try:
                    image.append(tweet.entities["media"][3]["media_url"])
                except:
                    image.append(np.nan)
                            
                
            df = pd.DataFrame({"screen_name":screen_name,
                               "tweet_id":tweet_id,
                               "created_at":created_at,
                               "full_text":full_text,
                               "favorite_count":favorite_count,
                               "retweet_count":retweet_count,
                               "in_reply_to_screen_name":in_reply_to_screen_name,
                               "hashtags":hashtags,
                               "user_mentions":user_mentions,
                               "url":urls,
                               "image_url":image})
            
            #Append politican df to list of all dfs
            final_df_lst.append(df)
            
        #Concat to one final df
        final_df = pd.concat(final_df_lst).reset_index(drop=True)
        
        return final_df
                
                
            
            

In [161]:
#Load a pickle dump
build_df = BuildTweetDF("pickled_tweets_2021-05-04.data")

In [162]:
tweet_df = build_df.get_df()
tweet_df

Unnamed: 0,screen_name,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url
0,AlanTudgeMP,1389485615918370816,2021-05-04 07:42:40,Research commercialisation in action. @QUT wor...,25,5,,[],"[QUT, TrevorEvansBne]",,
1,AlanTudgeMP,1389434288811036675,2021-05-04 04:18:43,Hearing first hand how our extra $1.7b boost t...,7,0,,[],"[GoodStartel, BertVanManen]",,
2,AlanTudgeMP,1389430043437244420,2021-05-04 04:01:51,RT @CISOZ: [NEW EDUCATION EVENT]\nTuesday 18 M...,0,5,,[],"[CISOZ, AlanTudgeMP]",,
3,AlanTudgeMP,1389372057041866754,2021-05-04 00:11:26,"Great to meet the kids, parents and teachers a...",5,1,,[],"[Kingscc, BertVanManen]",https://ministers.dese.gov.au/tudge/new-facili...,
4,AlanTudgeMP,1389001683368169474,2021-05-02 23:39:42,RT @pmc_gov_au: Media Release @JoshFrydenberg ...,0,4,,[],"[pmc_gov_au, JoshFrydenberg, MarisePayne, Alan...",,
...,...,...,...,...,...,...,...,...,...,...,...
335964,P_Thompson88,930256196220502016,2017-11-14 02:08:55,A #LivedExperiencedVeteran voice at the macro ...,0,0,,"[LivedExperiencedVeteran, SuicidePrevention, V...",[],,
335965,P_Thompson88,930245044598063104,2017-11-14 01:24:36,#NewProfilePic https://t.co/jVWltJHHRD,1,0,,[NewProfilePic],[],,
335966,P_Thompson88,929899901919444992,2017-11-13 02:33:07,Very honoured to be named the 2018 QLD Young A...,13,6,,"[SuicidePrevention, veteransuicideprevention, ...",[],,
335967,P_Thompson88,929899276372545537,2017-11-13 02:30:38,RT @ausoftheyear: A young veteran’s champion d...,0,5,,[QLD],[ausoftheyear],,


In [163]:
#Subset bushfire tweets
bf_tweets = tweet_df.loc[(tweet_df["full_text"].str.contains("burnt|Koala|koala|fire|black summer|bushfire|Bushfire|Bush fire|bush fire|bush-fire|Bush-fire")) & 
             (tweet_df["created_at"] > "2019-06-01") & (tweet_df["created_at"] < "2020-04-01")].reset_index()

In [164]:
tweet_df.loc[(tweet_df["full_text"].str.contains("koala|Koala")) & 
            (tweet_df["created_at"] > "2019-06-01") & (tweet_df["created_at"] < "2020-04-01")]

Unnamed: 0,screen_name,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url
6350,AndrewLamingMP,1197275047406080001,2019-11-20 22:06:30,Supports Koala hospital. Rips off People Hospi...,0,0,,[],[],https://www.facebook.com/story.php?story_fbid=...,
6470,AndrewLamingMP,1181829729675923456,2019-10-09 07:12:19,Peter Switzer reckons Councillors who oppose e...,0,1,,[],[],https://www.theaustralian.com.au/business/prop...,
27081,GregHuntMP,1216566236089221121,2020-01-13 03:42:48,RT @JoshFrydenberg: Great to be at @HospitalKo...,0,58,,[],"[JoshFrydenberg, HospitalKoala, sussanley, Pat...",,
36665,JoshFrydenberg,1216637740353110016,2020-01-13 08:26:55,"With around 8m hectares burnt, the #bushfires ...",57,17,,[bushfires],"[HospitalKoala, sussanley, PatConaghanMP, TSCo...",,
36666,JoshFrydenberg,1216538757769252864,2020-01-13 01:53:36,Great to be at @HospitalKoala in Port Macquari...,140,58,,[bushfires],"[HospitalKoala, sussanley, PatConaghanMP]",,
...,...,...,...,...,...,...,...,...,...,...,...
311730,DarrenChesterMP,1217632570260643841,2020-01-16 02:20:01,Not just helping humans... the crews on the gr...,80,18,,"[YourADF, OpBushfireAssist, lovegippsland, TYFYS]","[DeptDefence, AustralianArmy]",,
324032,MakeMayoMatter,1221014815092568064,2020-01-25 10:19:51,So sad to read Sam Mitchell has experienced a ...,23,7,,"[istandwithSam, KIfires]",[],https://apple.news/ArrmGc_O4TriSjk1YP2xOnw,
324042,MakeMayoMatter,1216690441367605254,2020-01-13 11:56:20,RT @wwf_uk: Wildlife rescuer Simon Adamczyk ca...,0,287,,[],[wwf_uk],,
324057,MakeMayoMatter,1213402604450934791,2020-01-04 10:11:39,RT @ElspethHussey7: People are being asked not...,0,157,,[],[ElspethHussey7],,


In [165]:
bf_tweets.sample(10)

Unnamed: 0,index,screen_name,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url
1357,143061,emma_husar,1208251468525928448,2019-12-21 05:02:52,"@RossThomas007 I never joined my local, althou...",2,0,RossThomas007,[],[RossThomas007],,
205,42437,karenandrewsmp,1217582244119736320,2020-01-15 23:00:03,Scientists hold bushfires meeting with Federal...,30,4,,[],[],https://www.abc.net.au/radio/programs/pm/scien...,
1017,106748,AlboMP,1203153500756070401,2019-12-07 03:25:22,"We can see, smell and feel the changing climat...",2598,707,,[],[],,
230,58354,Melissa4Durack,1215441502437953536,2020-01-10 01:13:30,A medical team from HMAS Adelaide provided fir...,2,0,Melissa4Durack,[yourADF],[],https://news.defence.gov.au/national/defence-b...,
1517,164106,JoanneRyanLalor,1212824145597095936,2020-01-02 19:53:04,RT @abcnews: 'It's resonated across the world'...,0,33,,[],[abcnews],https://www.abc.net.au/news/2020-01-03/bushfir...,
1436,155089,JasonClareMP,1220592550022545414,2020-01-24 06:21:56,A big thank you to Mayor Liz Innes and the tea...,5,2,,[auspol],[GregWarren_],,
2932,327711,AdamBandt,1204677078529921025,2019-12-11 08:19:31,RT @simonahac: shame on @siemens for helping t...,0,277,,[],"[simonahac, Siemens]",,
1557,167654,Josh4Freo,1215571193194926081,2020-01-10 09:48:51,Great work - and nice to know that some WA sea...,4,2,,[saveaustralianshipping],[],https://twitter.com/VictoriaPolice/status/1215...,
2960,327909,AdamBandt,1197739495690342401,2019-11-22 04:52:03,RT @climatecouncil: This is not normal. \n\nCl...,0,96,,[],[climatecouncil],,
2784,318511,M_McCormackMP,1214360513900974083,2020-01-07 01:38:02,RT @sarahcawte: Deputy Prime Minister @M_McCor...,0,4,,[],"[sarahcawte, M_McCormackMP, 9NewsRiverina, 9Ne...",,


In [157]:
bf_tweets["full_text"][34]

'The little known Kangaroo Island fire has been a catastrophe. Redlands is 100% behind the KI community- and your help, helps us, help them. https://t.co/vCTObuerPI'