In [2]:
#Necessary imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import tweepy
from datetime import date
import pickle 
import time
import matplotlib.pyplot as plt

## Subset data

In [133]:
mp_df = pd.read_csv("data/full_member_info.csv")
mp_df.shape

(15918, 37)

In [134]:
tweets = pd.read_csv("data/mp_tweets", index_col=0, low_memory=False)
#Original shape of the data
print(tweets.shape)
tweets.head()

(335975, 12)


Unnamed: 0,screen_name,user_id,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url
0,AlanTudgeMP,185932331.0,1.389486e+18,2021-05-04 07:42:40,Research commercialisation in action. @QUT wor...,25,5.0,,[],"['QUT', 'TrevorEvansBne']",,
1,AlanTudgeMP,185932331.0,1.389434e+18,2021-05-04 04:18:43,Hearing first hand how our extra $1.7b boost t...,7,0.0,,[],"['GoodStartel', 'BertVanManen']",,
2,AlanTudgeMP,185932331.0,1.38943e+18,2021-05-04 04:01:51,RT @CISOZ: [NEW EDUCATION EVENT]\nTuesday 18 M...,0,5.0,,[],"['CISOZ', 'AlanTudgeMP']",,
3,AlanTudgeMP,185932331.0,1.389372e+18,2021-05-04 00:11:26,"Great to meet the kids, parents and teachers a...",5,1.0,,[],"['Kingscc', 'BertVanManen']",https://ministers.dese.gov.au/tudge/new-facili...,
4,AlanTudgeMP,185932331.0,1.389002e+18,2021-05-02 23:39:42,RT @pmc_gov_au: Media Release @JoshFrydenberg ...,0,4.0,,[],"['pmc_gov_au', 'JoshFrydenberg', 'MarisePayne'...",,


In [142]:
class DataProcessing:

    def clean_tweet_data(self, tweet_df):

        #Rename user id column for merging with members_info data
        tweet_df = tweet_df.rename(columns = {"user_id":"m.uid"})

        #Drop 6 tweets that are corrupt. Because of it only being 6 tweets we drop them instead of re-running the collection from the API
        remove_idx = [175522, 190414, 211953, 212012, 212013, 212298 ]
        tweet_df = tweet_df.drop(tweet_df.index[remove_idx])

        #Make data into date-time object, remove h-m-s from dt
        tweet_df["created_at"] = pd.to_datetime(tweet_df["created_at"]).dt.date
        
        tweet_df["m.uid"] = tweet_df["m.uid"].astype(int)
        
        return tweet_df
    
    def clean_mp_data(self, mp_df):
        
        mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid', 'lp.official_legislative_period']]
        mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
        
        #TODO: Fix doctor
        remove = r"(^Hon\s{1}|^Mr\s{1}|^Ms\s{1}|^Mrs\s{1}|\s{1}MP|\s{1}AM|\s{1}OAM|\s{1}AO|\s{1}QC|^\D\r\s{1})"
        mp_df["m.name"] = mp_df["m.name"].str.replace(remove, "")
        
        mp_df = mp_df.loc[mp_df["m.uid"] != "\\N"]
        mp_df["m.uid"] = mp_df["m.uid"].astype(int)
        
        return mp_df
    
    def merge_final_df(self, tweet_df, mp_df):
        
        
        tweet_df = self.clean_tweet_data(tweet_df)
        mp_df = self.clean_mp_data(mp_df)
        
        #Merge to final df
        final_df = tweet_df.merge(mp_df, on = "m.uid", how = "left")
        
        #final_df = final_df.loc[((final_df["lp.official_legislative_period"] == "45") & (final_df["created_at"] < "2019-07-01"))|
        #                        ((final_df["lp.official_legislative_period"] == "46") & (final_df["created_at"] > "2019-07-01"))]
        
        return final_df
    

In [143]:
processor = DataProcessing()
final_df = processor.merge_final_df(tweets, mp_df)

  mp_df["m.name"] = mp_df["m.name"].str.replace(remove, "")


In [144]:
#remove = r"(^Hon\s{1}|^Mr\s{1}|^Ms\s{1}|^Dr\s{1}|^Mrs\{s}|\s{1}MP)"
#final_df["m.name"] = final_df["m.name"].str.replace(remove, "")
final_df["m.name"].unique()

array(['Alan Tudge', 'Alex Hawke', 'Andrew Laming', 'Andrew Wallace',
       'Angus Taylor', 'Bert van Manen', 'Russell Broadbent',
       'Chris Crewther', 'Christian Porter', 'Christopher Pyne',
       'Craig Kelly', 'Dan Tehan', 'David Coleman', 'Greg Hunt',
       'Ian Goodenough', 'Jane Prentice', 'Jason Falinski', 'Jason Wood',
       'Josh Frydenberg', 'Julian Leeser', 'Julie Bishop',
       'Karen Andrews', "Kelly O'Dwyer", 'Ken Wyatt,', 'Ken Wyatt',
       'Kevin Andrews', 'Craig Laundy', 'Lucy Wicks', nan,
       'Michael Sukkar', 'Nicolle Flint', 'Nola Marino', 'Paul Fletcher',
       'Peter Dutton', 'Rowan Ramsey', 'Scott Buchholz', 'Scott Morrison',
       'Sarah Henderson', 'Steven Ciobo', 'Stuart Robert', 'Sussan Ley',
       "Ted O'Brien", 'Tim Wilson', 'Tony Abbott', 'Tony Pasin',
       'Malcolm Turnbull', 'Tony Smith', 'James Stevens', 'Gladys Liu',
       'Fiona Martin', 'Dave Sharma', 'Anthony Albanese',
       'Dr Andrew Leigh', 'Andrew Leigh', 'Amanda Rishworth',

In [99]:
len(final_df["m.name"].unique())

145

In [288]:
tweets = tweets.dropna(subset=["screen_name"])

### MP info

In [290]:
#Filter by Australian Parlamentarians
#Download the file full_member_info at http://twitterpoliticians.org/download
mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid', 'lp.official_legislative_period']].copy()
mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
mp_df["m.name"] = mp_df["m.name"].str.replace(r"^Hon\s{1}|^Mr\s{1}|\s{1}MP|\s{1}Ms", "")
mp_info =  mp_df.loc[mp_df["m.uid"] != "\\N"]
mp_info["m.uid"] = mp_info["m.uid"].astype(int)

  mp_df["m.name"] = mp_df["m.name"].str.replace(r"^Hon\s{1}|^Mr\s{1}|\s{1}MP", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mp_info["m.uid"] = mp_info["m.uid"].astype(int)


In [291]:
tweets["m.uid"] = tweets["m.uid"].astype(int)

In [292]:
tweets = tweets.merge(mp_info, on = "m.uid", how = "left")

In [293]:
tweets

Unnamed: 0,screen_name,m.uid,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url,p.country,m.name,p.party,lp.official_legislative_period
0,AlanTudgeMP,185932331,1.389486e+18,2021-05-04 07:42:40,Research commercialisation in action. @QUT wor...,25,5.0,,[],"['QUT', 'TrevorEvansBne']",,,Australia,Alan Tudge,Liberal Party of Australia,45
1,AlanTudgeMP,185932331,1.389486e+18,2021-05-04 07:42:40,Research commercialisation in action. @QUT wor...,25,5.0,,[],"['QUT', 'TrevorEvansBne']",,,Australia,Alan Tudge,Liberal Party of Australia,46
2,AlanTudgeMP,185932331,1.389434e+18,2021-05-04 04:18:43,Hearing first hand how our extra $1.7b boost t...,7,0.0,,[],"['GoodStartel', 'BertVanManen']",,,Australia,Alan Tudge,Liberal Party of Australia,45
3,AlanTudgeMP,185932331,1.389434e+18,2021-05-04 04:18:43,Hearing first hand how our extra $1.7b boost t...,7,0.0,,[],"['GoodStartel', 'BertVanManen']",,,Australia,Alan Tudge,Liberal Party of Australia,46
4,AlanTudgeMP,185932331,1.389430e+18,2021-05-04 04:01:51,RT @CISOZ: [NEW EDUCATION EVENT]\nTuesday 18 M...,0,5.0,,[],"['CISOZ', 'AlanTudgeMP']",,,Australia,Alan Tudge,Liberal Party of Australia,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575232,P_Thompson88,929896339307053056,9.302562e+17,2017-11-14 02:08:55,A #LivedExperiencedVeteran voice at the macro ...,0,0.0,,"['LivedExperiencedVeteran', 'SuicidePrevention...",[],,,Australia,Phillip Thompson,Liberal National Party of Queensland,46
575233,P_Thompson88,929896339307053056,9.302450e+17,2017-11-14 01:24:36,#NewProfilePic https://t.co/jVWltJHHRD,1,0.0,,['NewProfilePic'],[],,,Australia,Phillip Thompson,Liberal National Party of Queensland,46
575234,P_Thompson88,929896339307053056,9.298999e+17,2017-11-13 02:33:07,Very honoured to be named the 2018 QLD Young A...,13,6.0,,"['SuicidePrevention', 'veteransuicidepreventio...",[],,,Australia,Phillip Thompson,Liberal National Party of Queensland,46
575235,P_Thompson88,929896339307053056,9.298993e+17,2017-11-13 02:30:38,RT @ausoftheyear: A young veteran’s champion d...,0,5.0,,['QLD'],['ausoftheyear'],,,Australia,Phillip Thompson,Liberal National Party of Queensland,46


In [131]:
#tweets["created_at"] = pd.to_datetime(tweets["created_at"], format="%Y-%m-%d")

In [299]:
len(subset["tweet_id"].unique())

299139

In [302]:
subset.to_csv("tweets_subset")

## Check how many mps

In [336]:
mp_df = pd.read_csv("data/full_member_info.csv")
mp_df = mp_df[['p.country', 'm.member_id', 'm.name', 'p.party', 'm.uid', 'lp.official_legislative_period']].copy()
mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]

In [337]:
tweet_users = pd.read_csv("data/australian_mps.csv", index_col= 0)

In [338]:
tweet_users.loc[tweet_users["twitter_handle"].isna()]

Unnamed: 0,p.country,m.name,p.party,m.uid,twitter_handle
13657,Australia,Mrs Ann Sudmalis MP,Liberal Party of Australia,835797426229256192,
13671,Australia,Dr John McVeigh MP,Liberal Party of Australia,929086908,
13673,Australia,Ms Julia Banks MP,Liberal Party of Australia,753170760252784641,
13682,Australia,Mr Luke Howarth MP,Liberal Party of Australia,4882704458,
13684,Australia,Hon Michael Keenan MP,Liberal Party of Australia,3476724493,
13695,Australia,Mr Steve Irons MP,Liberal Party of Australia,22147650,
13765,Australia,Ms Cathy O'Toole MP,Australian Labor Party,808712768,
13769,Australia,Ms Emma McBride MP,Australian Labor Party,702698265695760000,
13827,Australia,Stephen Jones,Australian Labor Party,117273312,
13887,Australia,Mr Andrew Gee MP,The Nationals,423966765,


In [359]:
len(mp_df["m.member_id"].unique())

297

In [331]:
#Unique twitter uid in the original mp_info data
len(tweet_users["m.uid"].unique())

158

In [332]:
#Unique twitter handles we were able to fetch based on uid
len(tweet_users["twitter_handle"].unique())

145

In [346]:
tweet_users["m.uid"] = tweet_users["m.uid"].astype(str)

In [348]:
merged = mp_df.merge(tweet_users, on = "m.uid", how = "left")

In [357]:
#merged = merged.drop_duplicates(subset="m.uid")

In [358]:
merged

Unnamed: 0,p.country_x,m.member_id,m.name_x,p.party_x,m.uid,lp.official_legislative_period,p.country_y,m.name_y,p.party_y,twitter_handle
0,Australia,9511,Mr Trevor Evans MP,Liberal Party of Australia,\N,45,,,,
5,Australia,9522,Hon Alan Tudge MP,Liberal Party of Australia,185932331,45,Australia,Hon Alan Tudge MP,Liberal Party of Australia,AlanTudgeMP
7,Australia,9525,Hon Alex Hawke MP,Liberal Party of Australia,18864066,45,Australia,Hon Alex Hawke MP,Liberal Party of Australia,AlexHawkeMP
9,Australia,9529,Mr Andrew Laming MP,Liberal Party of Australia,49499855,45,Australia,Mr Andrew Laming MP,Liberal Party of Australia,AndrewLamingMP
11,Australia,9530,Mr Andrew Wallace MP,Liberal Party of Australia,252016955,45,Australia,Mr Andrew Wallace MP,Liberal Party of Australia,AndrewWallaceMP
...,...,...,...,...,...,...,...,...,...,...
452,Australia,9565,"Ms Cathy McGowan AO, MP",Independent,364814582,45,Australia,"Ms Cathy McGowan AO, MP",Independent,Indigocathy
453,Australia,9659,Mr Andrew Wilkie MP,Independent,398422158,45,Australia,Mr Andrew Wilkie MP,Independent,WilkieMP
454,Australia,9627,Hon Bob Katter MP,Katter's Australian Party,310726037,45,Australia,Hon Bob Katter MP,Katter's Australian Party,RealBobKatter
461,Australia,14328,Trevor Evans,Liberal National Party of Queensland,1046961420493971456,46,Australia,Trevor Evans,Liberal National Party of Queensland,TrevorEvansBne
