<a href="https://colab.research.google.com/github/frederik-kilpinen/ASDS2/blob/main/Notebooks/data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Necessary imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import tweepy
from datetime import date
import pickle 
import time
import matplotlib.pyplot as plt

In [None]:
# MP info dataset: read the member information CSV into a DataFrame.
mp_df = pd.read_csv(filepath_or_buffer="data/full_member_info.csv")

# (rows, columns) of the table exactly as loaded, before any cleaning.
mp_df.shape

(15918, 37)

In [None]:
# Twitter dataset: the first CSV column is used as the row index, and
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
tweets = pd.read_csv(
    filepath_or_buffer="data/mp_tweets",
    low_memory=False,
    index_col=0,
)

# (rows, columns) of the table exactly as loaded, before any cleaning.
print(tweets.shape)

(335975, 12)


Unnamed: 0,screen_name,user_id,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url
0,AlanTudgeMP,185932331.0,1.389486e+18,2021-05-04 07:42:40,Research commercialisation in action. @QUT wor...,25,5.0,,[],"['QUT', 'TrevorEvansBne']",,
1,AlanTudgeMP,185932331.0,1.389434e+18,2021-05-04 04:18:43,Hearing first hand how our extra $1.7b boost t...,7,0.0,,[],"['GoodStartel', 'BertVanManen']",,
2,AlanTudgeMP,185932331.0,1.38943e+18,2021-05-04 04:01:51,RT @CISOZ: [NEW EDUCATION EVENT]\nTuesday 18 M...,0,5.0,,[],"['CISOZ', 'AlanTudgeMP']",,
3,AlanTudgeMP,185932331.0,1.389372e+18,2021-05-04 00:11:26,"Great to meet the kids, parents and teachers a...",5,1.0,,[],"['Kingscc', 'BertVanManen']",https://ministers.dese.gov.au/tudge/new-facili...,
4,AlanTudgeMP,185932331.0,1.389002e+18,2021-05-02 23:39:42,RT @pmc_gov_au: Media Release @JoshFrydenberg ...,0,4.0,,[],"['pmc_gov_au', 'JoshFrydenberg', 'MarisePayne'...",,


In [None]:
class DataProcessing:
    """Clean the raw tweet and MP tables and merge them into one frame.

    The public entry point is :meth:`merge_final_df`; the two ``clean_*``
    methods can also be called on their own. None of the methods modify the
    DataFrames passed in: each returns a new frame.
    """

    # 0-based row positions of the 6 corrupt tweets in the collected tweet
    # file. Because it is only 6 tweets they are dropped instead of re-running
    # the collection from the API.
    CORRUPT_ROW_POSITIONS = (175522, 190414, 211953, 212012, 212013, 212298)

    # Titles, post-nominals and commas that are stripped from MP names.
    NAME_NOISE_PATTERN = (
        r"(^Hon\s{1}|^Mr\s{1}|^Ms\s{1}|^Mrs\s{1}|\s{1}MP|\s{1}AM|\s{1}OAM"
        r"|\s{1}AO|\s{1}QC|^Dr\s{1}|,)"
    )

    # Columns of the MP table that are carried into the merged frame.
    MP_COLUMNS = ['p.country', 'm.name', 'p.party', 'm.uid', 'lp.official_legislative_period']

    # Date separating legislative period 45 (before) from 46 (on or after).
    PERIOD_CUTOFF = "2019-07-01"

    # Study window: 1 year before the bushfire (1 June 2018) to 1 year after
    # the bushfire (1 May 2021), both ends inclusive.
    WINDOW_START = "2018-06-01"
    WINDOW_END = "2021-05-01"

    def clean_tweet_data(self, tweet_df):
        """Return a cleaned copy of the raw tweet table.

        Steps: rename ``user_id`` to ``m.uid`` (the merge key shared with the
        MP table), drop the known corrupt rows, truncate ``created_at`` to the
        calendar day (as datetime64), and cast ``m.uid`` to 64-bit integers.

        Parameters
        ----------
        tweet_df : pandas.DataFrame
            Must contain ``user_id`` and ``created_at`` columns.

        Returns
        -------
        pandas.DataFrame
            A new frame; ``tweet_df`` itself is left untouched.

        Raises
        ------
        ValueError
            From the integer cast if ``user_id`` contains missing or
            non-numeric values after the corrupt rows are removed.
        """
        # Rename user id column for merging with members_info data
        tweet_df = tweet_df.rename(columns={"user_id": "m.uid"})

        # Drop the corrupt rows by *position*. A boolean mask is used so that
        # positions beyond the end of a shorter frame are simply ignored
        # (instead of raising IndexError) and so that duplicate index labels
        # cannot cause extra rows to be dropped.
        row_positions = pd.RangeIndex(len(tweet_df))
        keep = ~row_positions.isin(list(self.CORRUPT_ROW_POSITIONS))
        tweet_df = tweet_df.loc[keep].copy()

        # Make data into date-time object, remove h-m-s from dt
        tweet_df["created_at"] = pd.to_datetime(pd.to_datetime(tweet_df["created_at"]).dt.date)

        # Explicit 64-bit width: the platform default ``int`` can be 32-bit,
        # which is too small for Twitter user ids.
        tweet_df["m.uid"] = tweet_df["m.uid"].astype("int64")

        return tweet_df

    def clean_mp_data(self, mp_df):
        """Return a cleaned copy of the MP table restricted to Australia.

        Keeps only the columns in ``MP_COLUMNS``, keeps rows whose
        ``p.country`` is ``"Australia"``, strips titles/post-nominals from
        ``m.name``, removes rows whose ``m.uid`` is the placeholder ``\\N``
        and casts ``m.uid`` to 64-bit integers.

        Parameters
        ----------
        mp_df : pandas.DataFrame
            Must contain every column listed in ``MP_COLUMNS``.

        Returns
        -------
        pandas.DataFrame
            A new frame; ``mp_df`` itself is left untouched.
        """
        # .copy() so the assignments below write to an independent frame and
        # not to a slice of the caller's data (avoids SettingWithCopyWarning).
        is_australian = mp_df["p.country"] == "Australia"
        mp_df = mp_df.loc[is_australian, self.MP_COLUMNS].copy()

        # regex=True is required: without it newer pandas versions treat the
        # pattern as a literal string and nothing would be stripped.
        mp_df["m.name"] = mp_df["m.name"].str.replace(self.NAME_NOISE_PATTERN, "", regex=True)

        # "\N" marks a missing Twitter user id; such rows cannot be merged.
        mp_df = mp_df.loc[mp_df["m.uid"] != "\\N"].copy()
        mp_df["m.uid"] = mp_df["m.uid"].astype("int64")

        return mp_df

    def merge_final_df(self, tweet_df, mp_df):
        """Clean both tables, merge them on ``m.uid`` and subset the result.

        After the left merge, a row is kept only if the tweet date matches the
        legislative period of the MP record it was joined to (period "45"
        before ``PERIOD_CUTOFF``, period "46" on or after it) and the tweet
        date lies inside ``[WINDOW_START, WINDOW_END]``.

        Parameters
        ----------
        tweet_df : pandas.DataFrame
            Raw tweet table (see :meth:`clean_tweet_data`).
        mp_df : pandas.DataFrame
            Raw MP table (see :meth:`clean_mp_data`).

        Returns
        -------
        pandas.DataFrame
            The merged, filtered frame with a fresh 0..n-1 index.
        """
        tweet_df = self.clean_tweet_data(tweet_df)
        mp_df = self.clean_mp_data(mp_df)

        # Merge to final df.
        # NOTE(review): an MP with several rows per legislative period in
        # mp_df would duplicate that MP's tweets here - confirm the MP table
        # has one row per (m.uid, period).
        final_df = tweet_df.merge(mp_df, on="m.uid", how="left")

        # Subset on active MPs. The period is compared as text so the filter
        # also works if the column was parsed as integers. Tweets without a
        # matching MP have a missing period and are dropped here.
        period = final_df["lp.official_legislative_period"].astype(str)
        created = final_df["created_at"]
        cutoff = pd.Timestamp(self.PERIOD_CUTOFF)
        # ">=" on the second branch: dates are truncated to midnight, so a
        # strict "<" / ">" pair would drop every tweet posted on the cutoff day.
        is_active = ((period == "45") & (created < cutoff)) | ((period == "46") & (created >= cutoff))
        final_df = final_df.loc[is_active]

        # Subset tweets to the study window. The result must be assigned back,
        # otherwise the filter has no effect.
        in_window = final_df["created_at"].between(
            pd.Timestamp(self.WINDOW_START), pd.Timestamp(self.WINDOW_END)
        )
        final_df = final_df.loc[in_window]

        # Resetting index for final df
        final_df = final_df.reset_index(drop=True)

        return final_df
    

In [None]:
# Build the processor, then clean and merge the two raw tables in one call.
processor = DataProcessing()
final_df = processor.merge_final_df(tweet_df=tweets, mp_df=mp_df)

  mp_df["m.name"] = mp_df["m.name"].str.replace(remove, "")


In [None]:
# Display the merged frame built in the previous cell.
final_df

Unnamed: 0,screen_name,m.uid,tweet_id,created_at,full_text,favorite_count,retweet_count,in_reply_to_screen_name,hashtags,user_mentions,url,image_url,p.country,m.name,p.party,lp.official_legislative_period
0,AlanTudgeMP,185932331,1.389486e+18,2021-05-04,Research commercialisation in action. @QUT wor...,25,5.0,,[],"['QUT', 'TrevorEvansBne']",,,Australia,Alan Tudge,Liberal Party of Australia,46
1,AlanTudgeMP,185932331,1.389434e+18,2021-05-04,Hearing first hand how our extra $1.7b boost t...,7,0.0,,[],"['GoodStartel', 'BertVanManen']",,,Australia,Alan Tudge,Liberal Party of Australia,46
2,AlanTudgeMP,185932331,1.389430e+18,2021-05-04,RT @CISOZ: [NEW EDUCATION EVENT]\nTuesday 18 M...,0,5.0,,[],"['CISOZ', 'AlanTudgeMP']",,,Australia,Alan Tudge,Liberal Party of Australia,46
3,AlanTudgeMP,185932331,1.389372e+18,2021-05-04,"Great to meet the kids, parents and teachers a...",5,1.0,,[],"['Kingscc', 'BertVanManen']",https://ministers.dese.gov.au/tudge/new-facili...,,Australia,Alan Tudge,Liberal Party of Australia,46
4,AlanTudgeMP,185932331,1.389002e+18,2021-05-02,RT @pmc_gov_au: Media Release @JoshFrydenberg ...,0,4.0,,[],"['pmc_gov_au', 'JoshFrydenberg', 'MarisePayne'...",,,Australia,Alan Tudge,Liberal Party of Australia,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299035,P_Thompson88,929896339307053056,1.260309e+18,2020-05-12,REEF HQ FUNDING BOOST: We've just announced $2...,7,0.0,,[],['reefhqaquarium'],https://www.philthompson.com.au/media-release-...,,Australia,Phillip Thompson,Liberal National Party of Queensland,46
299036,P_Thompson88,929896339307053056,1.260183e+18,2020-05-12,We've been doing so well at keeping COVID-19 a...,3,0.0,,['TownsvilleStrong'],[],https://www.philthompson.com.au/media-release-...,,Australia,Phillip Thompson,Liberal National Party of Queensland,46
299037,P_Thompson88,929896339307053056,1.260103e+18,2020-05-12,JOBKEEPER Q&amp;A: I've asked Assistant Treasu...,2,0.0,,[],[],http://philthompson.com.au/teletownhall,,Australia,Phillip Thompson,Liberal National Party of Queensland,46
299038,P_Thompson88,929896339307053056,1.187588e+18,2019-10-25,The people of Townsville are tired of being he...,116,41.0,,[],[],,,Australia,Phillip Thompson,Liberal National Party of Queensland,46
