# Data Collection

In [13]:
#Necessary imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import tweepy
from datetime import date
import pickle 

In [3]:
#Get twitter credentials from AppCred.py.
#You must have your own credentials stored in working dir
from AppCred import API_KEY, API_SECRET
from AppCred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET

## Load MP data

In [5]:
#Filter by Australian Parlamentarians
#Download the file full_member_info at http://twitterpoliticians.org/download
mp_df = pd.read_csv("full_member_info.csv")
mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid']].copy()
mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
filtered_df =  mp_df.loc[mp_df["m.uid"] != "\\N"]

## Get twitter handles

In [None]:
#Get the twitter handle of each politicians
#and add it to the df
def get_handle(uid):
    try:
        handle = api.get_user(uid).screen_name
    except:
        handle = np.nan
    return handle
            
filtered_df["twitter_handle"] = filtered_df["m.uid"].apply(lambda x: get_handle(x))

In [None]:
filtered_df.to_csv("australian_mps") #Create csv

## Get Tweets

Having downloaded the csv with australians mps you can start from here.

In [7]:
filtered_df = pd.read_csv("australian_mps.csv")
#Get a list of unique twitter handles
handles = filtered_df["twitter_handle"].unique().tolist()

In [14]:
class TweetCollector:
    
    def __init__(self):
        self.api = self.connect_api()
    

    def connect_api(self):
        """
        Connect to the API upon initalizing that class. You need to have
        your own credentials imported
        """
        auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        api = tweepy.API(auth, wait_on_rate_limit= True, 
                         wait_on_rate_limit_notify=True, 
                         retry_count = 10, retry_delay = 5, 
                         retry_errors= set([401, 404, 500, 503]))
        
        return api
        
    def get_tweets(self, screen_name):
        """
        Given a screen name this method tries to fetch the last 3200 tweets (maximum allowed).
        """
        try:
            #initialize a list to hold all the tweepy Tweets
            all_tweets = []  
    
            #make initial request for most recent tweets
            new_tweets = self.api.user_timeline(screen_name = screen_name,
                                           count=200, tweet_mode = "extended")
    
            #save most recent tweets
            all_tweets.extend(new_tweets)
    
            #save the id of the oldest tweet less one
            oldest = all_tweets[-1].id - 1
            
            print("-"*66)
            print(f"Fetching {screen_name} Tweets")
            print("-"*66)
            
            #keep grabbing tweets until there are no tweets left to grab
            while len(new_tweets) > 0:
                print(f"getting tweets before {oldest}")
        
                #all subsiquent requests use the max_id param to prevent duplicates
                new_tweets = self.api.user_timeline(screen_name = screen_name,count=200,
                                                    max_id=oldest, tweet_mode = "extended")
        
                #save most recent tweets
                all_tweets.extend(new_tweets)
        
                #update the id of the oldest tweet less one
                oldest = all_tweets[-1].id - 1
        
                print(f"...{len(all_tweets)} tweets downloaded so far")
            
        except:
            print(f"could not get tweets from {screen_name}")
            raise
            
        return all_tweets
    
    
    def pickle_dump(self, screen_names, dump = True):
        """
        Given a list of screen names, this method returns a dictionary
        containing all the fetchable tweets from the list of users.
        Dumps everything as a pickle file locally
        """
        
        if not isinstance(screen_names, list):
            screen_names = [screen_names]
        
        all_tweets = {}
        
        for screen_name in tqdm(screen_names):
            
            tweets = self.get_tweets(screen_name)
            all_tweets[screen_name] = tweets
         
        #If True dumps all the tweets in a pickle file
        if dump:
            with open(f'pickled_tweets_{str(date.today())}.data', 'wb') as f:
                # store the data as binary data stream
                pickle.dump(all_tweets, f)
                
        return all_tweets


In [15]:
#Initiate collector object
collector = TweetCollector()
#Dump tweets from 3 MPs
tweets = collector.pickle_dump(handles[0])

  0%|          | 0/1 [00:00<?, ?it/s]

------------------------------------------------------------------
Fetching AlanTudgeMP Tweets
------------------------------------------------------------------
getting tweets before 1325682029078351871
...400 tweets downloaded so far
getting tweets before 1303897535686164479
...600 tweets downloaded so far
getting tweets before 1276370497404432383
...800 tweets downloaded so far
getting tweets before 1247662602437586943
...1000 tweets downloaded so far
getting tweets before 1169527441733210111
...1200 tweets downloaded so far
getting tweets before 1101255660774125567
...1400 tweets downloaded so far
getting tweets before 988292030852825088
...1600 tweets downloaded so far
getting tweets before 838191483823890431
...1800 tweets downloaded so far
getting tweets before 676158195673395199
...1997 tweets downloaded so far
getting tweets before 633008236493336575
...2195 tweets downloaded so far
getting tweets before 539532580828086271
...2395 tweets downloaded so far
getting tweets before

100%|██████████| 1/1 [00:36<00:00, 36.74s/it]

...2844 tweets downloaded so far





In [32]:
tweets["AlanTudgeMP"][100].full_text

"The PM's Spelling Bee is a great initiative to educate, build confidence and get kids excited about spelling. It's also a lot of fun!\nhttps://t.co/U5Vu8xwNFb"

In [40]:
#Tweets is now a dictionary where the keys are politicians and values a list of all their tweets
print(tweets.keys())
#Amount of tweets first MP
print(f"Amount of tweets {len(tweets['AlanTudgeMP'])}")
#To access the first tweet object of the first MP
tweets["AlanTudgeMP"][0].user.followers

dict_keys(['AlanTudgeMP'])
Amount of tweets 2844


<bound method User.followers of User(_api=<tweepy.api.API object at 0x7fde199e05e0>, _json={'id': 185932331, 'id_str': '185932331', 'name': 'Alan Tudge', 'screen_name': 'AlanTudgeMP', 'location': 'Authorised by A Tudge, Liberal Party, Wantirna South, Victoria', 'description': 'Federal Member for Aston, Minister for Education and Youth', 'url': 'https://t.co/QSlbuhwDsP', 'entities': {'url': {'urls': [{'url': 'https://t.co/QSlbuhwDsP', 'expanded_url': 'http://www.alantudge.com.au', 'display_url': 'alantudge.com.au', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 21460, 'friends_count': 441, 'listed_count': 329, 'created_at': 'Thu Sep 02 04:56:34 +0000 2010', 'favourites_count': 362, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': True, 'statuses_count': 2850, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_ima

In [185]:
class BuildTweetDF:
    """
    This class builds a Pandas dataframe using a pickle dump 
    of Tweepy tweet objects as collected by the TweetCollector
    """
    
    def __init__(self, pickle_dump):
        self.all_tweets = self.pickle_open(pickle_dump)
    
    def pickle_open(self, pickle_dump):
        with open(pickle_dump, 'rb') as f:
        # read the data as binary data stream
            all_tweets = pickle.load(f)
        
        return all_tweets
    
    def get_df(self):
        
        final_df_lst = []
        
        for politician, tweets in self.all_tweets.items():
            
            #Empty list for df. More things can be added later
            screen_name = []
            created_at = []
            full_text = []
            favorite_count = []
            retweet_count = []
            tweet_id = []
            in_reply_to_screen_name = []
            
            for tweet in tweets:
                
                screen_name.append(tweet.user.screen_name)
                created_at.append(tweet.created_at)
                full_text.append(tweet.full_text)
                favorite_count.append(tweet.favorite_count)
                retweet_count.append(tweet.retweet_count)
                tweet_id.append(tweet.id)
                in_reply_to_screen_name.append(tweet.in_reply_to_screen_name)
                
            df = pd.DataFrame({"screen_name":screen_name,
                               "tweet_id":tweet_id,
                               "created_at":created_at,
                               "full_text":full_text,
                               "favorite_count":favorite_count,
                               "retweet_count":retweet_count,
                               "in_reply_to_screen_name":in_reply_to_screen_name})
            
            #Append politican df to list of all dfs
            final_df_lst.append(df)
            
        #Concat to one final df
        final_df = pd.concat(final_df_lst).reset_index(drop=True)
        
        return final_df
                
                
            
            

In [186]:
#Load a pickle test dump of 3 Australian MPs
build_df = BuildTweetDF("tweets_2021-04-29.data")

In [187]:
tweet_df = build_df.get_df()
tweet_df

Unnamed: 0,screen_name,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name
0,AlanTudgeMP,1387521079627771907,2021-04-28 21:36:18,Draft national curriculum is out for public co...,31,10,
1,AlanTudgeMP,1387356919430406147,2021-04-28 10:44:00,A pleasure to be part of it. A great initiati...,14,4,
2,AlanTudgeMP,1387179682492080134,2021-04-27 22:59:43,We need more of these faster pathways into tea...,20,2,AlanTudgeMP
3,AlanTudgeMP,1387178951127175168,2021-04-27 22:56:49,Interesting examples of mid career people swit...,15,3,
4,AlanTudgeMP,1386938376071245828,2021-04-27 07:00:51,We will consider Victoria’s quarantine proposa...,116,20,
...,...,...,...,...,...,...,...
9211,AndrewLamingMP,869370149572231168,2017-05-30 01:49:30,The PM getting down and dirty with the real be...,0,0,
9212,AndrewLamingMP,869327552384454657,2017-05-29 23:00:14,No shred of decency. Shorten flips on Medicare...,0,0,
9213,AndrewLamingMP,869327375414120448,2017-05-29 22:59:32,Not a shred of decency. Shorten flips on Medic...,0,0,
9214,AndrewLamingMP,869324477888253954,2017-05-29 22:48:01,Tennis great. Grand Slam winner. Opposes gay m...,1,0,
