# 1.data_collection

In this notebook we collect the tweets of Australian MPs (up to the latest 3200 tweets per account, the maximum the API allows). To do this, we rely on a list of MPs from a curated database that can be downloaded here: http://twitterpoliticians.org/download

In [47]:
#Necessary imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import tweepy
from datetime import date
import pickle 
import time
import json

In [2]:
#Get twitter credentials from AppCred.py.
#You must have your own credentials stored in working dir
from AppCred import API_KEY, API_SECRET
from AppCred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET

## Load MP data and get twitter handles

As a first step we use tweepy to get the twitter "screen name" of each MP using the user id given by the Twitter politicians data.

In [5]:
#Filter by Australian parliamentarians
#Download the file full_member_info at http://twitterpoliticians.org/download
mp_df = pd.read_csv("full_member_info.csv")
mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid']].copy()
mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
#Drop the rows whose m.uid is the literal string "\N"
#(presumably the export's marker for a missing Twitter user id - TODO confirm).
#.copy() makes filtered_df an independent frame, so adding columns to it
#later does not trigger pandas' SettingWithCopyWarning.
filtered_df =  mp_df.loc[mp_df["m.uid"] != "\\N"].copy()

In [None]:
#Get the twitter handle of each politician
#and add it to the df

#Authenticated client used for the user look-ups below. It is created in this
#cell so that `api` exists when the notebook is run top to bottom on a fresh kernel.
auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

def get_handle(uid):
    """
    Return the screen name of the Twitter user with id `uid`,
    or NaN when the API look-up fails.
    """
    try:
        handle = api.get_user(uid).screen_name
    except tweepy.TweepError:
        #Only API errors are mapped to NaN. Anything else (e.g. a NameError
        #caused by a missing client) surfaces instead of silently producing NaN.
        handle = np.nan
    return handle

#assign() returns a new frame, so no column is written into a slice of mp_df
filtered_df = filtered_df.assign(twitter_handle=filtered_df["m.uid"].apply(get_handle))

## Get Tweets

Next we define a class that takes a list of screen names and tries to get the latest 3200 tweets of each account. It then dumps everything locally as a pickled dictionary that maps each screen name to the list of its full tweepy tweet objects. We store everything so that we do not have to call the API again later in case we need more data.

In [3]:
#Distinct twitter handles as a plain Python list
handle_column = filtered_df["twitter_handle"]
handles = handle_column.unique().tolist()

In [4]:
#Remove private accounts and missing ("nan") handles.
#Filtering (instead of list.remove) keeps this cell re-runnable and does not
#raise ValueError when one of the entries is not in the list.
private_handles = {"JohnAlexanderMP"}
handles = [h for h in handles if isinstance(h, str) and h not in private_handles]

In [32]:
class TweetCollector:
    """
    Downloads user timelines through the Twitter API (via tweepy) and
    optionally pickles them to a local file.
    """

    def __init__(self):
        self.api = self.connect_api()

    def connect_api(self):
        """
        Build and return an authenticated tweepy API client. Called once when
        the class is instantiated. You need to have your own credentials
        imported (API_KEY, API_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET).
        """
        auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        api = tweepy.API(auth, wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True,
                         retry_count=10, retry_delay=5,
                         retry_errors={401, 404, 500, 503})

        return api

    def get_tweets(self, screen_name, max_retries=5):
        """
        Given a screen name, this method tries to fetch the last 3200 tweets (maximum allowed).

        Parameters
        ----------
        screen_name : str
            Twitter handle whose timeline is downloaded.
        max_retries : int, optional
            Number of consecutive failed page requests that are retried
            (after a 60 second pause each) before the method gives up and
            returns the tweets collected so far.

        Returns
        -------
        list
            The tweepy tweet objects in the order the API returned them.
            Empty when the account has no tweets.

        Raises
        ------
        tweepy.TweepError
            If the very first request fails (e.g. the account is not accessible).
        """
        #make initial request for most recent tweets; errors propagate to the caller
        new_tweets = self.api.user_timeline(screen_name=screen_name,
                                            count=200, tweet_mode="extended")

        #list holding all the tweepy tweets, seeded with the first page
        all_tweets = list(new_tweets)

        failed_attempts = 0

        #keep grabbing tweets until a page comes back empty
        #(an empty first page skips the loop entirely)
        while len(new_tweets) > 0:
            #id of the oldest tweet so far, less one: max_id is inclusive,
            #so this prevents fetching the same tweet twice
            oldest = all_tweets[-1].id - 1

            try:
                new_tweets = self.api.user_timeline(screen_name=screen_name, count=200,
                                                    max_id=oldest, tweet_mode="extended")
            except tweepy.TweepError as e:
                print(e.reason)
                failed_attempts += 1
                if failed_attempts > max_retries:
                    #a persistent error must not keep the loop spinning forever
                    print(f"giving up on {screen_name} after {max_retries} retries, "
                          f"keeping {len(all_tweets)} tweets")
                    break
                time.sleep(60)
                continue

            #successful request: reset the failure counter and store the page
            failed_attempts = 0
            all_tweets.extend(new_tweets)

        return all_tweets

    def pickle_dump(self, screen_names, dump=True):
        """
        Given a list of screen names, this method returns a dictionary
        {screen_name: list of tweets} containing all the fetchable tweets
        from the list of users. A single screen name is accepted as well.

        If `dump` is True the dictionary is also written to
        'pickled_tweets_<today>.data' in the working directory.
        Users whose tweets could not be fetched are reported and skipped.
        """
        #a single name (any non-collection value) is treated as a one-element list
        if not isinstance(screen_names, (list, tuple, set)):
            screen_names = [screen_names]

        all_tweets = {}

        for screen_name in tqdm(screen_names):
            try:
                all_tweets[screen_name] = self.get_tweets(screen_name)
            except Exception as e:
                #`Exception` (not a bare except) so that Ctrl-C / kernel interrupt
                #still stops the collection
                print(f"could not get {screen_name}")
                print(f"  reason: {e!r}")
                continue

        #If True dumps all the tweets in a pickle file
        if dump:
            with open(f'pickled_tweets_{date.today()}.data', 'wb') as f:
                # store the data as binary data stream
                pickle.dump(all_tweets, f)

        return all_tweets


In [None]:
#Build the collector; instantiating it connects to the Twitter API
collector = TweetCollector()
#Download the tweets of all MPs and pickle them locally
tweets = collector.pickle_dump(screen_names=handles, dump=True)

## Build data-frame

Lastly, we create a class that builds a data-frame of tweets and their relevant variables from the locally dumped dictionary of tweepy tweet objects.

In [70]:
class BuildTweetDF:
    """
    This class builds a Pandas dataframe using a pickle dump
    of Tweepy tweet objects as collected by the TweetCollector
    """

    #Column order of the dataframe returned by get_df. More things can be added later
    COLUMNS = ("screen_name", "user_id", "tweet_id", "created_at", "full_text",
               "favorite_count", "retweet_count", "retweet_name",
               "in_reply_to_screen_name", "hashtags", "user_mentions",
               "url", "image_url")

    def __init__(self, pickle_dump):
        #pickle_dump: path of the pickle file to load
        self.all_tweets = self.pickle_open(pickle_dump)

    def pickle_open(self, pickle_dump):
        """
        Load and return the object stored in the pickle file at `pickle_dump`.
        """
        #NOTE(security): pickle.load can execute arbitrary code contained in the
        #file - only open dumps that you created yourself.
        with open(pickle_dump, 'rb') as f:
            # read the data as binary data stream
            all_tweets = pickle.load(f)

        return all_tweets

    @staticmethod
    def _tweet_record(tweet):
        """
        Extract the variables of interest from one tweet as a dict keyed by column name.
        """
        entities = tweet.entities

        #Only retweets carry `retweeted_status`; NaN marks "not a retweet"
        retweeted = getattr(tweet, "retweeted_status", None)
        retweet_author = getattr(retweeted, "author", None)
        retweet_name = getattr(retweet_author, "screen_name", np.nan)

        #Only the first url / media item is kept; NaN when there is none
        urls = entities.get("urls") or []
        media = entities.get("media") or []
        first_url = urls[0].get("expanded_url", np.nan) if urls else np.nan
        first_image = media[0].get("media_url", np.nan) if media else np.nan

        return {
            "screen_name": tweet.user.screen_name,
            "user_id": tweet.user.id,
            "tweet_id": tweet.id,
            "created_at": tweet.created_at,
            "full_text": tweet.full_text,
            "favorite_count": tweet.favorite_count,
            "retweet_count": tweet.retweet_count,
            "retweet_name": retweet_name,
            "in_reply_to_screen_name": tweet.in_reply_to_screen_name,
            "hashtags": [i["text"] for i in entities["hashtags"]],
            "user_mentions": [i["screen_name"] for i in entities["user_mentions"]],
            "url": first_url,
            "image_url": first_image,
        }

    def get_df(self):
        """
        Return one dataframe with a row per tweet, for all politicians in the
        dump, in dump order and with a fresh 0..n-1 index. An empty dump
        yields an empty dataframe that still has all the columns.
        """
        records = [
            self._tweet_record(tweet)
            for tweets in self.all_tweets.values()
            for tweet in tweets
        ]

        #Building the frame once from records avoids concatenating per-politician
        #frames, which fails when there is nothing to concatenate
        final_df = pd.DataFrame(records, columns=list(self.COLUMNS))

        return final_df
                
                
            
            

In [71]:
#Read back a previously pickled tweet dump
dump_path = "data/pickled_tweets_2021-05-04.data"
build_df = BuildTweetDF(dump_path)

In [72]:
tweet_df = build_df.get_df()
#tweet_df now holds one row per tweet from the loaded dump

In [75]:
#Write the tweet dataframe to a CSV file
csv_path = "data/mp_tweets.csv"
tweet_df.to_csv(csv_path)