<a href="https://colab.research.google.com/github/frederik-kilpinen/ASDS2/blob/main/Notebooks/data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Processing

1. Data Wrangling and Mergin
2. Text processing

## Data Wranling and Merging

In [2]:
#Necessary imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import tweepy
from datetime import date
import pickle 
import time
import matplotlib.pyplot as plt

In [3]:
# MP info dataset
mp_df = pd.read_csv("data/full_member_info.csv")

#Original shape of the data
mp_df.shape

(15918, 37)

In [5]:
# Twitter dataset
tweets = pd.read_csv("data/mp_tweets", index_col=0, low_memory=False)

#Original shape of the data
print(tweets.shape)

(335975, 12)


In [15]:
class DataProcessing:

    def clean_tweet_data(self, tweet_df):


        #Drop 6 tweets that are corrupt. Because of it only being 6 tweets we drop them instead of re-running the collection from the API
        remove_idx = [175522, 190414, 211953, 212012, 212013, 212298 ]
        tweet_df = tweet_df.drop(tweet_df.index[remove_idx])

        #Make data into date-time object, remove h-m-s from dt
        tweet_df["created_at"] = pd.to_datetime(pd.to_datetime(tweet_df["created_at"]).dt.date)
        
        tweet_df["user_id"] = tweet_df["user_id"].astype(int)
        
        return tweet_df
    
    def clean_mp_data(self, mp_df):
        
        
        
        mp_df = mp_df[['p.country', 'm.name', 'p.party', 'm.uid', 'lp.official_legislative_period']]
        mp_df = mp_df.loc[mp_df["p.country"]=="Australia"]
        
        #Drop australia column
        mp_df = mp_df.drop(columns = ["p.country"])
        #Rename some columns
        mp_df = mp_df.rename(columns = {"m.name":"name", "p.party":"party",
                                       "lp.official_legislative_period":"legislative_period"})
        
        #Rename user id column for merging with members_info data
        mp_df = mp_df.rename(columns = {"m.uid":"user_id"})
        
        
        remove = r"(^[A-Za-z]{2}\s{1}|\s{1}[A-Z]{2,}|^Hon\s{1}|^Mrs\s{1}|(Dr\s)|,)"
        mp_df["name"] = mp_df["name"].str.replace(remove, "", regex = True)
        
        mp_df = mp_df.loc[mp_df["user_id"] != "\\N"]
        mp_df["user_id"] = mp_df["user_id"].astype(int)
        
        # Merge the Nick Xenophon Team and Centre Alliance 
        mp_df["party"] = mp_df["party"].apply(lambda x: "Centre Alliance" if x == "Nick Xenophon Team" else x)
        
        

        return mp_df
    
    def merge_final_df(self, tweet_df, mp_df):
        
        
        tweet_df = self.clean_tweet_data(tweet_df)
        mp_df = self.clean_mp_data(mp_df)
        
        #Merge to final df
        final_df = tweet_df.merge(mp_df, on = "user_id", how = "left")
        
        #Subset on active MPs
        final_df = final_df.loc[((final_df["legislative_period"] == "45") & (final_df["created_at"] < "2019-07-01"))|
                                ((final_df["legislative_period"] == "46") & (final_df["created_at"] > "2019-07-01"))]
        
        # FREDERIKS CHANGE: Subset tweets from 1 year before the bushfire (1. June 2018) and 1 year after the bushfire (1. May 2021)
        final_df = final_df.loc[(final_df["created_at"] >= "2018-06-01") & (final_df["created_at"] <= "2021-04-30")]
           
        # Restetting index for final df
        final_df = final_df.reset_index(drop = True)
        
        return final_df
    

In [16]:
processor = DataProcessing()
final_df = processor.merge_final_df(tweets, mp_df)

In [17]:
final_df.to_csv("final_tweet_df.csv")

In [18]:
len(final_df["name"].unique())

134

## Process tweet text

In [20]:
final_df.shape

(170338, 15)