# Data Collection

In [47]:
#Necessary imports
import json
import os
import pickle
import time
from datetime import date

import numpy as np
import pandas as pd
import tweepy
from tqdm import tqdm

In [2]:
#Get twitter credentials from AppCred.py.
#You must have your own credentials stored in working dir
from AppCred import API_KEY, API_SECRET
from AppCred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET

## Load MP data

In [5]:
#Filter by Australian parliamentarians
#Download the file full_member_info at http://twitterpoliticians.org/download
mp_df = pd.read_csv("full_member_info.csv")
mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid']].copy()
mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
#Drop rows whose uid is the literal "\N" placeholder (presumably the export's
#marker for a missing Twitter id). The explicit .copy() makes filtered_df an
#independent frame, so adding columns to it later does not raise
#pandas' SettingWithCopyWarning.
filtered_df = mp_df.loc[mp_df["m.uid"] != "\\N"].copy()

## Get twitter handles

In [None]:
#Get the Twitter handle of each politician
#and add it to the df

#Authenticated client used for the lookups. It is created in this cell so the
#cell does not depend on an `api` variable left over from another session.
_auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
_auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(_auth, wait_on_rate_limit=True)

def get_handle(uid):
    """
    Return the screen name belonging to a Twitter user id.

    uid: the user id to look up.
    Returns the screen name, or NaN when the API lookup raises a TweepError.
    """
    try:
        handle = api.get_user(uid).screen_name
    except tweepy.TweepError:
        #Only API failures map to NaN. Programming errors (for example a
        #missing `api` object) must surface instead of silently producing
        #NaN for every row.
        handle = np.nan
    return handle

#assign() returns a new frame, so no column is written into a filtered slice
filtered_df = filtered_df.assign(twitter_handle=filtered_df["m.uid"].apply(get_handle))

In [None]:
#Create the csv at the path that is read back later ("data/australian_mps.csv").
#index=False keeps the row index out of the file so it does not come back
#as an extra "Unnamed: 0" column.
os.makedirs("data", exist_ok=True)
filtered_df.to_csv("data/australian_mps.csv", index=False)

## Get Tweets

If you have already downloaded the CSV of Australian MPs (`data/australian_mps.csv`), you can start from here.

In [3]:
#Read the saved MP table back from disk
filtered_df = pd.read_csv("data/australian_mps.csv")
#Collect every distinct twitter handle once, as a plain Python list
handles = pd.unique(filtered_df["twitter_handle"]).tolist()

In [4]:
#Remove the private account and missing ("nan") handles.
#Filtering instead of list.remove keeps this cell safe to re-run (remove raises
#ValueError once the entry is gone), and pd.notna also catches NaN values
#that are not the np.nan object itself.
handles = [h for h in handles if pd.notna(h) and h != "JohnAlexanderMP"]

In [32]:
class TweetCollector:
    """
    Downloads user timelines through the Twitter API (via tweepy) and
    optionally stores them in a local pickle file.
    """

    def __init__(self):
        self.api = self.connect_api()

    def connect_api(self):
        """
        Connect to the API upon initializing the class. You need to have
        your own credentials imported (API_KEY, API_SECRET, ACCESS_TOKEN,
        ACCESS_TOKEN_SECRET).

        Returns the authenticated tweepy.API client.
        """
        auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        api = tweepy.API(auth, wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True,
                         retry_count=10, retry_delay=5,
                         retry_errors={401, 404, 500, 503})

        return api

    def get_tweets(self, screen_name, max_retries=5, retry_wait=60):
        """
        Given a screen name this method tries to fetch the last 3200 tweets (maximum allowed).

        screen_name: account whose timeline is downloaded.
        max_retries: number of consecutive failed page requests tolerated
            before paging stops.
        retry_wait: seconds to sleep after a failed page request.

        Returns a list of tweepy Status objects, newest first. The list is
        empty when the account has no tweets, and partial when paging had to
        be abandoned after repeated errors. An error on the very first
        request is not retried here and propagates to the caller.
        """
        #initialize a list to hold all the tweepy Tweets
        all_tweets = []

        #make initial request for most recent tweets
        new_tweets = self.api.user_timeline(screen_name=screen_name,
                                            count=200, tweet_mode="extended")
        all_tweets.extend(new_tweets)

        failures = 0

        #keep grabbing tweets until there are no tweets left to grab.
        #An empty first page skips the loop entirely, so all_tweets[-1]
        #is only evaluated when at least one tweet has been collected.
        while len(new_tweets) > 0:
            #id of the oldest tweet so far, less one: used as max_id so the
            #next page does not repeat tweets we already have
            oldest = all_tweets[-1].id - 1
            try:
                new_tweets = self.api.user_timeline(screen_name=screen_name, count=200,
                                                    max_id=oldest, tweet_mode="extended")
            except tweepy.TweepError as e:
                failures += 1
                print(e.reason)
                if failures > max_retries:
                    #a persistent error would otherwise keep this loop running forever
                    print(f"giving up on {screen_name} after {failures} failed requests; "
                          f"keeping {len(all_tweets)} tweets")
                    break
                time.sleep(retry_wait)
                continue

            failures = 0
            #save the page just fetched
            all_tweets.extend(new_tweets)

        return all_tweets

    def pickle_dump(self, screen_names, dump=True):
        """
        Given a list of screen names, this method returns a dictionary
        containing all the fetchable tweets from the list of users.
        Dumps everything as a pickle file locally.

        screen_names: a single screen name or a list/tuple/set of them.
        dump: when True, the dictionary is also written to
            'pickled_tweets_<today>.data' in the working directory.

        Returns a dict mapping screen name -> list of tweets. Accounts whose
        download failed are reported on stdout and left out of the dict.
        """
        if isinstance(screen_names, (list, tuple, set)):
            screen_names = list(screen_names)
        else:
            screen_names = [screen_names]

        all_tweets = {}

        for screen_name in tqdm(screen_names):
            try:
                all_tweets[screen_name] = self.get_tweets(screen_name)
            except Exception as exc:
                #Exception (not a bare except) so Ctrl-C still interrupts the crawl
                print(f"could not get {screen_name}: {exc}")
                continue

        #If True dumps all the tweets in a pickle file
        if dump:
            with open(f'pickled_tweets_{str(date.today())}.data', 'wb') as f:
                # store the data as binary data stream
                pickle.dump(all_tweets, f)

        return all_tweets


In [33]:
#Initiate collector object
collector = TweetCollector()
#Dump tweets for a single MP (the first handle); pass a list of handles to fetch several
tweets = collector.pickle_dump(handles[0])

  0%|          | 0/1 [00:00<?, ?it/s]

Failed to send request: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 1/1 [01:35<00:00, 95.33s/it]


In [38]:
#First collected tweet object for this MP
#NOTE(review): the recorded output of this cell is "TypeError: 'Status' object is not iterable"
tweets["AlanTudgeMP"][0]

TypeError: 'Status' object is not iterable

In [53]:
#Raw JSON payload (a dict) of the first collected tweet
tweets["AlanTudgeMP"][0]._json

{'created_at': 'Mon Jun 07 11:03:25 +0000 2021',
 'id': 1401857321416814599,
 'id_str': '1401857321416814599',
 'full_text': 'RT @RurbsOz: “If we seek to learn from Singapore’s world-leading practices in the teaching of mathematics, then we also need to adopt the S…',
 'truncated': False,
 'display_text_range': [0, 140],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'RurbsOz',
    'name': 'Rebecca Urban',
    'id': 702210398,
    'id_str': '702210398',
    'indices': [3, 11]}],
  'urls': []},
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 185932331,
  'id_str': '185932331',
  'name': 'Alan Tudge',
  'screen_name': 'AlanTudgeMP',
  'location': 'Authorised by A Tudge, Liberal Party, Wantirna South, Victoria',
  'description

In [25]:
#Creation timestamp of the third collected tweet (a datetime object)
tweets["AlanTudgeMP"][2].created_at

datetime.datetime(2021, 6, 7, 3, 10, 37)

In [10]:
class BuildTweetDF:
    """
    This class builds a Pandas dataframe using a pickle dump
    of Tweepy tweet objects as collected by the TweetCollector
    """

    #Column order of the dataframe returned by get_df
    _COLUMNS = ["screen_name", "user_id", "tweet_id", "created_at",
                "full_text", "favorite_count", "retweet_count",
                "in_reply_to_screen_name", "hashtags", "user_mentions",
                "url", "image_url"]

    def __init__(self, pickle_dump):
        #pickle_dump: path of the pickle file to load
        self.all_tweets = self.pickle_open(pickle_dump)

    def pickle_open(self, pickle_dump):
        """
        Load and return the object stored in the pickle file at `pickle_dump`.
        """
        #NOTE: unpickling can execute arbitrary code; only open dumps you
        #created yourself.
        with open(pickle_dump, 'rb') as f:
            # read the data as binary data stream
            all_tweets = pickle.load(f)

        return all_tweets

    @staticmethod
    def _first_entity_value(entities, kind, field):
        """
        Return entities[kind][0][field], or NaN when that entity kind is
        absent, empty, or lacks the field.
        """
        try:
            return entities[kind][0][field]
        except (KeyError, IndexError, TypeError):
            return np.nan

    def get_df(self):
        """
        Flatten the loaded tweets into one dataframe with a row per tweet.

        Returns a dataframe with the columns listed in _COLUMNS. hashtags and
        user_mentions hold lists of strings; url and image_url hold the first
        expanded url / media url of the tweet, or NaN when there is none.
        An empty dump yields an empty dataframe with the same columns.
        """
        #One dict per tweet; more fields can be added later
        records = []

        for tweets in self.all_tweets.values():
            for tweet in tweets:
                entities = tweet.entities
                records.append({
                    "screen_name": tweet.user.screen_name,
                    "user_id": tweet.user.id,
                    "tweet_id": tweet.id,
                    "created_at": tweet.created_at,
                    "full_text": tweet.full_text,
                    "favorite_count": tweet.favorite_count,
                    "retweet_count": tweet.retweet_count,
                    "in_reply_to_screen_name": tweet.in_reply_to_screen_name,
                    "hashtags": [i["text"] for i in entities["hashtags"]],
                    "user_mentions": [i["screen_name"] for i in entities["user_mentions"]],
                    "url": self._first_entity_value(entities, "urls", "expanded_url"),
                    "image_url": self._first_entity_value(entities, "media", "media_url"),
                })

        #Building the frame once from all records also covers the case of no
        #tweets at all, where concatenating per-politician frames would fail
        final_df = pd.DataFrame(records, columns=self._COLUMNS)

        return final_df
                
                
            
            

In [11]:
#Load a pickle dump of collected tweets
#NOTE(review): TweetCollector.pickle_dump writes its file to the working directory,
#while this path points into data/ - presumably the dump was moved there by hand; confirm
build_df = BuildTweetDF("data/pickled_tweets_2021-05-04.data")

In [12]:
#Build one dataframe with a row per collected tweet
tweet_df = build_df.get_df()

In [9]:
#image_url already holds the media url string (or NaN) extracted by get_df,
#so the first row is read directly; a further [0]["media_url"] lookup would
#fail on a string value
tweet_df["image_url"][0]

'http://pbs.twimg.com/media/E0hxzTAVgAQS3dT.jpg'

In [225]:
#Save all tweets as CSV (the file name carries no extension)
#Note: the list-valued columns (hashtags, user_mentions) are written as their string representation
tweet_df.to_csv("mp_tweets")

In [14]:
#Subset bushfire tweets: the text must match one of the keywords and the tweet
#must be dated strictly between 2019-06-01 and 2020-04-01.
#reset_index() keeps the original row labels in an "index" column.
bf_tweets = (
    tweet_df[
        tweet_df["full_text"].str.contains("burnt|Koala|koala|fire|black summer|bushfire|Bushfire|Bush fire|bush fire|bush-fire|Bush-fire")
        & (tweet_df["created_at"] > "2019-06-01")
        & (tweet_df["created_at"] < "2020-04-01")
    ]
    .reset_index()
)

In [19]:
#Bushfire tweets that carry an image url; tail() keeps only the last five of them
#NOTE(review): confirm that exporting just five rows is intended
image_tweets = bf_tweets[bf_tweets["image_url"].notnull()].tail()
image_tweets.to_csv("image_tweets")

In [164]:
#Tweets mentioning koalas, dated strictly between 2019-06-01 and 2020-04-01
tweet_df[
    tweet_df["full_text"].str.contains("koala|Koala")
    & (tweet_df["created_at"] > "2019-06-01")
    & (tweet_df["created_at"] < "2020-04-01")
]

Unnamed: 0,screen_name,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url
6350,AndrewLamingMP,1197275047406080001,2019-11-20 22:06:30,Supports Koala hospital. Rips off People Hospi...,0,0,,[],[],https://www.facebook.com/story.php?story_fbid=...,
6470,AndrewLamingMP,1181829729675923456,2019-10-09 07:12:19,Peter Switzer reckons Councillors who oppose e...,0,1,,[],[],https://www.theaustralian.com.au/business/prop...,
27081,GregHuntMP,1216566236089221121,2020-01-13 03:42:48,RT @JoshFrydenberg: Great to be at @HospitalKo...,0,58,,[],"[JoshFrydenberg, HospitalKoala, sussanley, Pat...",,
36665,JoshFrydenberg,1216637740353110016,2020-01-13 08:26:55,"With around 8m hectares burnt, the #bushfires ...",57,17,,[bushfires],"[HospitalKoala, sussanley, PatConaghanMP, TSCo...",,
36666,JoshFrydenberg,1216538757769252864,2020-01-13 01:53:36,Great to be at @HospitalKoala in Port Macquari...,140,58,,[bushfires],"[HospitalKoala, sussanley, PatConaghanMP]",,
...,...,...,...,...,...,...,...,...,...,...,...
311730,DarrenChesterMP,1217632570260643841,2020-01-16 02:20:01,Not just helping humans... the crews on the gr...,80,18,,"[YourADF, OpBushfireAssist, lovegippsland, TYFYS]","[DeptDefence, AustralianArmy]",,
324032,MakeMayoMatter,1221014815092568064,2020-01-25 10:19:51,So sad to read Sam Mitchell has experienced a ...,23,7,,"[istandwithSam, KIfires]",[],https://apple.news/ArrmGc_O4TriSjk1YP2xOnw,
324042,MakeMayoMatter,1216690441367605254,2020-01-13 11:56:20,RT @wwf_uk: Wildlife rescuer Simon Adamczyk ca...,0,287,,[],[wwf_uk],,
324057,MakeMayoMatter,1213402604450934791,2020-01-04 10:11:39,RT @ElspethHussey7: People are being asked not...,0,157,,[],[ElspethHussey7],,


In [171]:
#Draw 30 bushfire tweets for manual inspection. The fixed random_state makes
#the draw reproducible, so row positions looked up afterwards refer to the
#same tweets on every run.
sample = bf_tweets.sample(30, random_state=42).reset_index()

In [180]:
#Display the sampled tweets
sample

Unnamed: 0,level_0,index,screen_name,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url
0,1873,214423,MikeKellyofEM,1215833448964714496,2020-01-11 03:10:57,Bushfire states face $451m disaster funding cu...,6,10,,[],[],https://www.afr.com/politics/federal/bushfire-...,
1,180,36707,JoshFrydenberg,1212849592963723264,2020-01-02 21:34:11,Joining @LaTrioli this morning on @abcmelbourn...,12,2,,[],"[LaTrioli, abcmelbourne]",,
2,81,27075,GregHuntMP,1216878511077519360,2020-01-14 00:23:40,RT @stuartrobertmp: SERVICE UPDATE: Services A...,0,6,,"[bushfires, YourADF]",[stuartrobertmp],,
3,73,27064,GregHuntMP,1217947303496912900,2020-01-16 23:10:40,RT @stuartrobertmp: The fastest and easiest wa...,0,2,,[bushfire],[stuartrobertmp],,
4,2738,318387,M_McCormackMP,1233266258864267264,2020-02-28 05:42:43,It’s @ThankYouFireys Day. \n\nTerrific to meet...,11,3,,[],[ThankYouFireys],,
5,379,73069,ScottMorrisonMP,1193667545129050112,2019-11-10 23:11:34,I welcome the announcement a state of emergenc...,391,118,,[],[],https://twitter.com/NSWRFS/status/119365419107...,
6,840,102156,DaveSharma,1219210010665746433,2020-01-20 10:48:13,👉New initiatives to help support small busines...,3,0,,[],[],https://www.pm.gov.au/media/immediate-small-bu...,
7,2023,223538,MThistlethwaite,1213712234989228033,2020-01-05 06:42:01,Food and water loaded onto the second truck lo...,25,7,,[],[],,
8,1982,214927,MikeKellyofEM,1197404942736084995,2019-11-21 06:42:39,Bushfires burn earlier across Australia https:...,2,3,,[],[canberratimes],https://www.canberratimes.com.au/story/6504694...,
9,1091,116646,AnneWerriwa,1213236864104067072,2020-01-03 23:13:03,RT @AlboMP: This is something that we don’t ta...,0,1175,,[],[AlboMP],,


In [210]:
#Full text of the sampled tweet at row label 28
sample["full_text"][28]

'RT @MurrayWatt: 7 years in office and this government thinks it’s time for a bushfire/climate change roundtable.'