# Data Collection

In [3]:
#Necessary imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import tweepy

In [4]:
#Get twitter credentials from AppCred.py.
#You must have your own credentials stored in working dir
from AppCred import API_KEY, API_SECRET
from AppCred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET

## Load MP data

In [None]:
#Keep only the Australian parliamentarians that have a Twitter user id
#Download the file full_member_info at http://twitterpoliticians.org/download
mp_df = pd.read_csv("data/full_member_info.csv")
mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid']].copy()
mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
#Rows whose uid is the literal "\N" are dropped (presumably the export's
#marker for a missing id - confirm against the dataset documentation).
#.copy() makes filtered_df an independent frame, so adding a column to it
#later does not raise SettingWithCopyWarning.
filtered_df = mp_df.loc[mp_df["m.uid"] != "\\N"].copy()

## Get twitter handles

In [None]:
#Get the twitter handle of each politician and add it to the df.
#The API client is created in this cell so that it also works on a fresh
#kernel (no other cell defines a module-level `api`).
auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

def get_handle(uid):
    """
    Look up the screen name that belongs to a Twitter user id.

    Returns np.nan when the API request fails, so one bad id does not stop
    the whole lookup. Only tweepy errors are caught: programming errors
    (e.g. a missing `api` object) are no longer silently turned into NaN.
    """
    try:
        return api.get_user(uid).screen_name
    except tweepy.TweepError:
        return np.nan

#assign() returns a new frame, which avoids SettingWithCopyWarning when
#filtered_df is a slice of another DataFrame
filtered_df = filtered_df.assign(twitter_handle=filtered_df["m.uid"].apply(get_handle))

In [None]:
#Create csv; the row index is left out so that reading the file back does
#not add a spurious "Unnamed: 0" column
filtered_df.to_csv("australian_mps", index=False)

## Get Tweets

If you have already created the `australian_mps` CSV, you can start from here (after running the import cells at the top).

In [8]:
#Reload the saved MPs table
filtered_df = pd.read_csv("australian_mps")
#Collect every distinct twitter handle once, as a plain Python list
handles = pd.unique(filtered_df["twitter_handle"]).tolist()

In [5]:
class TweetCollector:
    """
    Downloads user timelines through a tweepy API client and arranges them
    in pandas DataFrames.

    The credentials (API_KEY, API_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    are read from module-level names, so they must be imported before the
    class is instantiated.
    """

    #Number of tweets requested per call to user_timeline
    _PAGE_SIZE = 200

    def __init__(self):
        self.api = self.connect_api()

    def connect_api(self):
        """
        Build and return the authenticated tweepy API client.

        Called once when the class is instantiated. You need to have your
        own credentials imported.
        """
        auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return tweepy.API(auth, wait_on_rate_limit=True,
                          wait_on_rate_limit_notify=True,
                          retry_count=10, retry_delay=5,
                          retry_errors={401, 404, 500, 503})

    def get_tweets(self, screen_name):
        """
        Fetch the timeline of `screen_name`, newest tweet first.

        The timeline is requested page by page until the API returns an
        empty page (up to the last 3200 tweets, the maximum allowed).

        Returns a list of tweepy tweet objects; the list is empty when the
        account has no tweets. Any error raised while fetching is reported
        on stdout and then re-raised.
        """
        all_tweets = []

        print("-"*66)
        print(f"Fetching {screen_name} Tweets")
        print("-"*66)

        try:
            #initial request for the most recent tweets
            new_tweets = self.api.user_timeline(screen_name=screen_name,
                                                count=self._PAGE_SIZE,
                                                tweet_mode="extended")
            all_tweets.extend(new_tweets)

            #keep paging until a request comes back empty; the loop body is
            #only entered with at least one stored tweet, so all_tweets[-1]
            #is always valid (an empty timeline simply skips the loop)
            while len(new_tweets) > 0:
                #id of the oldest tweet less one: used as max_id so the next
                #page starts strictly before it and contains no duplicates
                oldest = all_tweets[-1].id - 1
                print(f"getting tweets before {oldest}")

                new_tweets = self.api.user_timeline(screen_name=screen_name,
                                                    count=self._PAGE_SIZE,
                                                    max_id=oldest,
                                                    tweet_mode="extended")
                all_tweets.extend(new_tweets)

                print(f"...{len(all_tweets)} tweets downloaded so far")

        except Exception:
            print(f"could not get tweets from {screen_name}")
            raise

        return all_tweets

    def build_df(self, screen_names):
        """
        Build a DataFrame with the tweets of one or several users.

        `screen_names` is a list of screen names; any other value is treated
        as a single screen name.

        Returns a DataFrame with the columns created_at, full_text,
        screen_name and tweet_id. The per-user frames are concatenated
        without resetting the index, so index values restart at 0 for each
        user. An empty frame with the same columns is returned when no
        tweets were collected at all.
        """
        if not isinstance(screen_names, list):
            screen_names = [screen_names]

        columns = ["created_at", "full_text", "screen_name", "tweet_id"]
        frames = []

        for screen_name in tqdm(screen_names):
            tweets = self.get_tweets(screen_name)

            #nothing to add for an account without tweets
            if not tweets:
                continue

            frames.append(pd.DataFrame({
                "created_at": [tweet.created_at for tweet in tweets],
                "full_text": [tweet.full_text for tweet in tweets],
                "screen_name": screen_name,
                "tweet_id": [tweet.id for tweet in tweets],
            }))

        #pd.concat raises ValueError on an empty list
        if not frames:
            return pd.DataFrame(columns=columns)

        return pd.concat(frames)
        

In [6]:
#Create the TweetCollector object; its constructor connects to the Twitter API
collector = TweetCollector()

In [9]:
#Fetch the tweets of two MPs. Note that handles[1:3] selects the second and
#third entries of the handle list (index 0 is skipped).
tweets = collector.build_df(handles[1:3])

  0%|          | 0/2 [00:00<?, ?it/s]

------------------------------------------------------------------
Fetching AlexHawkeMP Tweets
------------------------------------------------------------------
getting tweets before 1316590964434067455
...400 tweets downloaded so far
getting tweets before 1253780195128967167
...600 tweets downloaded so far
getting tweets before 1155708979068018687
...798 tweets downloaded so far
getting tweets before 977834708837920767
...997 tweets downloaded so far
getting tweets before 803801400798887936
...1195 tweets downloaded so far
getting tweets before 671168769222311935
...1393 tweets downloaded so far
getting tweets before 611440375157391359
...1592 tweets downloaded so far
getting tweets before 551484835282161664
...1792 tweets downloaded so far
getting tweets before 499776376929411072
...1992 tweets downloaded so far
getting tweets before 465731162652962815
...2192 tweets downloaded so far
getting tweets before 404037626165878783
...2391 tweets downloaded so far
getting tweets before 375

 50%|█████     | 1/2 [00:13<00:13, 13.95s/it]

...3192 tweets downloaded so far
------------------------------------------------------------------
Fetching AndrewLamingMP Tweets
------------------------------------------------------------------
getting tweets before 1214747749897211904
...400 tweets downloaded so far
getting tweets before 1185156634382876675
...600 tweets downloaded so far
getting tweets before 1127044942507991044
...800 tweets downloaded so far
getting tweets before 1111225845576884227
...1000 tweets downloaded so far
getting tweets before 1100861098247499775
...1200 tweets downloaded so far
getting tweets before 1091624631830761471
...1400 tweets downloaded so far
getting tweets before 1076202451005526016
...1600 tweets downloaded so far
getting tweets before 1062161305123999743
...1800 tweets downloaded so far
getting tweets before 1046224687380881407
...2000 tweets downloaded so far
getting tweets before 1024144169827164159
...2200 tweets downloaded so far
getting tweets before 997800853602844671
...2400 tweets

100%|██████████| 2/2 [00:26<00:00, 13.43s/it]

...3200 tweets downloaded so far





In [10]:
#Display the collected tweets
tweets

Unnamed: 0,created_at,full_text,screen_name,tweet_id
0,1384373672257855491,RT @austbahai: Thank you @AlexHawkeMP for your...,AlexHawkeMP,1384373672257855491
1,1383381955274711045,RT @NRL: AMBUSH! 💪\n\n#NRLRaidersEels https://...,AlexHawkeMP,1383381955274711045
2,1382223617744793603,Wishing a #HappyNewYear to the many South and ...,AlexHawkeMP,1382223617744793603
3,1381894428307034113,Wishing all Sikh Australians a wonderful Vaisa...,AlexHawkeMP,1381894428307034113
4,1381750186687406087,#ramadanmubarak to all those beginning their ...,AlexHawkeMP,1381750186687406087
...,...,...,...,...
3195,869370149572231168,The PM getting down and dirty with the real be...,AndrewLamingMP,869370149572231168
3196,869327552384454657,No shred of decency. Shorten flips on Medicare...,AndrewLamingMP,869327552384454657
3197,869327375414120448,Not a shred of decency. Shorten flips on Medic...,AndrewLamingMP,869327375414120448
3198,869324477888253954,Tennis great. Grand Slam winner. Opposes gay m...,AndrewLamingMP,869324477888253954
