# Data Collection

In [11]:
#Necessary imports
import pickle
import time
from datetime import date
from pathlib import Path

import numpy as np
import pandas as pd
import tweepy
from tqdm import tqdm

In [9]:
#Get twitter credentials from AppCred.py.
#You must have your own credentials stored in working dir
from AppCred import API_KEY, API_SECRET
from AppCred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET

## Load MP data

In [5]:
#Keep only Australian parliamentarians.
#The input file full_member_info is available at http://twitterpoliticians.org/download
mp_df = (
    pd.read_csv("full_member_info.csv")
    .loc[:, ['p.country', 'm.name', 'p.party', 'm.uid']]
    .loc[lambda members: members["p.country"] == "Australia"]
)
#Drop rows whose user id is the literal string \N (presumably a missing-id marker)
filtered_df = mp_df[mp_df["m.uid"] != "\\N"]

## Get twitter handles

In [None]:
#Get the Twitter handle of each politician and add it to the df.
#The API client is created here so that this cell also works on a fresh kernel.
auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)


def get_handle(uid):
    """
    Return the screen name of the Twitter account with user id `uid`.

    Returns NaN when the API lookup fails with a TweepError. Any other
    error (for example a programming mistake) is allowed to propagate
    instead of being silently turned into NaN.
    """
    try:
        return api.get_user(uid).screen_name
    except tweepy.TweepError:
        return np.nan


#assign() returns a new frame, so no SettingWithCopyWarning is raised on the filtered slice
filtered_df = filtered_df.assign(twitter_handle=filtered_df["m.uid"].apply(get_handle))

In [None]:
#Save the MP table to the path that the "Get Tweets" section reads it back from.
#index=False keeps the row index out of the file, so reloading does not add an "Unnamed: 0" column.
Path("data").mkdir(exist_ok=True)
filtered_df.to_csv("data/australian_mps.csv", index=False)

## Get Tweets

If you have already downloaded the CSV of Australian MPs (`data/australian_mps.csv`), you can start from here.

In [10]:
#Reload the saved MP table so this section can run on its own
filtered_df = pd.read_csv("data/australian_mps.csv")
#Get a list of unique Twitter handles.
#Failed lookups are stored as NaN; drop them so only real screen names are queried.
handles = filtered_df["twitter_handle"].dropna().unique().tolist()

In [22]:
class TweetCollector:
    """
    Collects user timelines through the Twitter API using Tweepy.

    The credentials API_KEY, API_SECRET, ACCESS_TOKEN and ACCESS_TOKEN_SECRET
    must already be available in the notebook namespace.
    """

    def __init__(self):
        self.api = self.connect_api()

    def connect_api(self):
        """
        Create the authenticated Tweepy client used by this collector.

        Called once when the class is initialised. The client waits when the
        rate limit is hit and retries requests that fail with one of the
        listed HTTP status codes.
        """
        auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        api = tweepy.API(auth, wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True,
                         retry_count=10, retry_delay=5,
                         retry_errors={401, 404, 500, 503})

        return api

    def get_tweets(self, screen_name, max_retries=5, retry_wait=60):
        """
        Fetch the timeline of `screen_name`, newest tweet first.

        Pages of up to 200 tweets are requested until an empty page comes
        back (the original notebook notes a maximum of 3200 tweets).

        Parameters
        ----------
        screen_name : handle of the account to fetch.
        max_retries : number of consecutive failed page requests tolerated
            before giving up and returning what was collected so far.
        retry_wait : seconds to sleep after a failed page request.

        Returns
        -------
        list of Tweepy tweet objects; empty if the account has no tweets.

        Raises
        ------
        tweepy.TweepError if the very first request fails.
        """
        #The first request is not retried: a failure here propagates to the caller
        new_tweets = self.api.user_timeline(screen_name=screen_name,
                                            count=200, tweet_mode="extended")
        all_tweets = list(new_tweets)

        failures = 0
        #An empty page (including an empty first page) ends the loop
        while len(new_tweets) > 0:
            #max_id is inclusive, so ask for ids strictly below the oldest one seen
            oldest = all_tweets[-1].id - 1
            try:
                new_tweets = self.api.user_timeline(screen_name=screen_name, count=200,
                                                    max_id=oldest, tweet_mode="extended")
            except tweepy.TweepError as e:
                failures += 1
                print(e.reason)
                if failures >= max_retries:
                    #Give up instead of retrying forever; keep what was collected
                    print(f"giving up on {screen_name} after {failures} failed requests")
                    break
                time.sleep(retry_wait)
                continue

            failures = 0
            all_tweets.extend(new_tweets)

        return all_tweets

    def pickle_dump(self, screen_names, dump=True):
        """
        Collect the tweets of one or several accounts.

        Parameters
        ----------
        screen_names : a single screen name, or a list/tuple/set of them.
        dump : if True, the result is also written to a local pickle file
            named pickled_tweets_<today's date>.data.

        Returns
        -------
        dict mapping each screen name that could be fetched to the list of
        its tweets. Accounts that raise an error are reported and skipped.
        """
        if isinstance(screen_names, (list, tuple, set)):
            screen_names = list(screen_names)
        else:
            screen_names = [screen_names]

        all_tweets = {}

        for screen_name in tqdm(screen_names):
            try:
                all_tweets[screen_name] = self.get_tweets(screen_name)
            except Exception as exc:
                #Exception (not a bare except) so that Ctrl-C still interrupts the run
                print(f"could not get {screen_name}: {exc}")

        #If True dumps all the tweets in a pickle file
        if dump:
            with open(f'pickled_tweets_{str(date.today())}.data', 'wb') as f:
                # store the data as binary data stream
                pickle.dump(all_tweets, f)

        return all_tweets


In [23]:
#Initiate collector object
collector = TweetCollector()
#Dump tweets from 3 MPs
tweets = collector.pickle_dump(handles[0])

100%|██████████| 1/1 [00:38<00:00, 38.68s/it]


In [24]:
#Show the full text of the tweet at index 100 for AlanTudgeMP
tweets["AlanTudgeMP"][100].full_text

'We must never forget the tragedy of the Holocaust and the evil that underpinned it. Every student should learn about it. The Morrison Govt is providing $2m towards the Tas Holocaust Education &amp; Interpretation Centre. #EliseArcherMP read more: https://t.co/7WVKCTuX4J'

In [29]:
#`tweets` is a dictionary: the keys are politicians, the values are lists of all their tweets
print(tweets.keys())
#Number of tweets collected for the first MP
print("Amount of tweets {}".format(len(tweets['AlanTudgeMP'])))
#Individual tweet objects are reachable by index, e.g. tweets["AlanTudgeMP"][0]

dict_keys(['AlanTudgeMP'])
Amount of tweets 2848


In [25]:
class BuildTweetDF:
    """
    Builds a Pandas dataframe from a pickle dump of Tweepy tweet
    objects as collected by the TweetCollector.

    The dump is expected to be a dictionary mapping each politician to
    the list of their tweet objects.
    """

    #Output columns, in the order they appear in the resulting dataframe
    _COLUMNS = ["screen_name", "tweet_id", "created_at", "full_text",
                "favorite_count", "retweet_count", "in_reply_to_screen_name"]

    def __init__(self, pickle_dump):
        self.all_tweets = self.pickle_open(pickle_dump)

    def pickle_open(self, pickle_dump):
        """
        Load and return the object stored in the pickle file `pickle_dump`.
        """
        #NOTE: pickle.load can execute arbitrary code; only open dumps you created yourself
        with open(pickle_dump, 'rb') as f:
            # read the data as binary data stream
            all_tweets = pickle.load(f)

        return all_tweets

    def get_df(self):
        """
        Return one dataframe with a row per tweet across all politicians.

        Rows keep the order of the dump (politician by politician, tweets
        in stored order) and are indexed 0..n-1. If the dump holds no
        tweets, an empty dataframe with the expected columns is returned.
        More fields can be added later by extending the record below and
        _COLUMNS.
        """
        records = [
            {
                "screen_name": tweet.user.screen_name,
                "tweet_id": tweet.id,
                "created_at": tweet.created_at,
                "full_text": tweet.full_text,
                "favorite_count": tweet.favorite_count,
                "retweet_count": tweet.retweet_count,
                "in_reply_to_screen_name": tweet.in_reply_to_screen_name,
            }
            for tweets in self.all_tweets.values()
            for tweet in tweets
        ]

        #Passing columns explicitly fixes the order and keeps the schema when records is empty
        return pd.DataFrame(records, columns=self._COLUMNS)
                
                
            
            

In [27]:
#Load a pickle test dump of 3 Australian MPs
#Open a pickled test dump of Australian MPs' tweets
build_df = BuildTweetDF(pickle_dump="pickled_tweets_2021-05-04.data")

In [28]:
#Flatten the loaded tweets into a single dataframe and display it
tweet_df = build_df.get_df()
tweet_df

Unnamed: 0,screen_name,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name
0,AlanTudgeMP,1389485615918370816,2021-05-04 07:42:40,Research commercialisation in action. @QUT wor...,25,5,
1,AlanTudgeMP,1389434288811036675,2021-05-04 04:18:43,Hearing first hand how our extra $1.7b boost t...,7,0,
2,AlanTudgeMP,1389430043437244420,2021-05-04 04:01:51,RT @CISOZ: [NEW EDUCATION EVENT]\nTuesday 18 M...,0,5,
3,AlanTudgeMP,1389372057041866754,2021-05-04 00:11:26,"Great to meet the kids, parents and teachers a...",5,1,
4,AlanTudgeMP,1389001683368169474,2021-05-02 23:39:42,RT @pmc_gov_au: Media Release @JoshFrydenberg ...,0,4,
...,...,...,...,...,...,...,...
2843,AlanTudgeMP,142741721963438080,2011-12-02 23:07:33,Van Onselen: It is not the media that needs to...,0,0,
2844,AlanTudgeMP,142022433539489792,2011-11-30 23:29:21,PM has early-learning trifecta: forcing 3 y.o....,0,2,
2845,AlanTudgeMP,141285577499742208,2011-11-28 22:41:21,I argue for principals to have power over teac...,0,0,
2846,AlanTudgeMP,4731072050888704,2010-11-17 03:02:30,Looking forward to speaking in opposition to L...,0,0,
