# Topic Classification

In this notebook we use the output from the topic model and our immersion journal/manual to subset a training set of tweets about the Australian Bushfires. 

In [4]:
#Importing relevant packages
import numpy as np 
import pandas as pd
from tqdm import tqdm #to create a progress bar

#Load custom function for preprocessing the text
from tokenizer import preprocess, preprocess_lemma, preprocess_stem

#Machine learning packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

#Packages to create DFM
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#Packages for cross-validation and parameter tuning
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

#Packages for getting model performance metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

#Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns

#
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None  # default='warn'


#Gary King et. al key-words
from keyword_algorithm import *

In [5]:
class PrepareData:
    """
    This class prepares the data for the topic classification model
    """
    
    def naive_classification(self, df, keywords):
        
        #Subset on relevant period
        df = df.loc[(df["created_at"] >= "2019-06-01") & (df["created_at"] <= "2020-06-01")]
        
        #Join keywords to regex
        keywords = '|'.join(keywords)
        df["bushfire"] = df["stemmed_text"].str.contains(keywords, regex = True).astype(int)
        
        #Remove the same keywords for text to avoid overfitting 
        df["stemmed_text"] = df["stemmed_text"].str.replace(keywords, "", regex = True)
        
        return df
        
        
    def prepare_text(self, df):
        
        df["clean_text"] = df["full_text"].apply(lambda x: preprocess(x))
        df["stemmed_text"] = df["clean_text"].apply(lambda x: preprocess_stem(x))
        
        return df
    
    
    def compile_df(self, df, keywords):
        """
        Performs classification and subsetting of tweets. 
        Returns cleaned 
        
        """
        
        df = self.prepare_text(df)
        df = self.naive_classification(df, keywords)
        
        
        return df
    
    

## Implementation of Gary Kings semi-automated keyword retrival

In [None]:
data = pd.read_csv("data/final_tweet_df", index_col=0)
#Subset period June 2019 – May 2020
subset = data.loc[(data["created_at"] >= "2019-06-01") & (data["created_at"] <= "2020-06-01")]
subset["id"] = subset.index


#Create a reference set of tweets
reference_set = subset.loc[subset["full_text"].str.contains("bushfire")]
#Create a search set of tweets
search_set = subset[~subset.index.isin(reference_set.index)]

reference_set.to_csv("data/reference_set.csv")
search_set.to_csv("data/search_set.csv")

In [6]:
#Import all the methods from King et. al.
#Note that some methods have been updated because of depreceated Pandas version
bushfire = Keywords()
bushfire.ReferenceSet(data='data/reference_set.csv', text_colname='full_text', id_colname='id')
bushfire.SearchSet(data='data/search_set.csv', text_colname='full_text', id_colname='id')
bushfire.ProcessData(remove_wordlist=[], keep_twitter_symbols=False)
bushfire.ReferenceKeywords()
bushfire.ClassifyDocs(algorithms=['nbayes', 'logit'])# 'tree', 'gboost'])
bushfire.FindTargetSet()
bushfire.FindKeywords()

Keyword object initialized.
Loaded reference set of size 1208 in 0.03 seconds.
Loaded search set of size 45265 in 0.36 seconds.
Time to process corpus: 14.19 seconds

4265 reference set keywords found.

Document Term Matrix: 46473 by 61309 with 655324 nonzero elements

Time to get document-term matrix: 0.51 seconds

Ref training size: 399; Search training size: 14937; Training size: 15336; Test size: 45265

Time for Naive Bayes: 0.01 seconds
Time for Logit: 1.08 seconds
169 documents in target set
45096 documents in non-target set
505 target set keywords found
4964 non-target set keywords found


In [89]:
bushfire.PrintKeywords()

   Reference                  Target                        Non-target
   ----------                 ----------                    ----------
1. bushfir                    bushfir                       auspol
2. support                    recoveri                      australian
3. affect                     cfsalert                      one
4. amp                        area                          great
5. communiti                  assist                        job
6. australia                  vicemerg                      peopl
7. today                      affect                        year
8. govern                     reservist                     liber
9. help                       fire                          mani
10. need                      brigad                        labor
11. crisi                     vicfir                        teen
12. recoveri                  chip                          markbaileymp
13. australian                intens                        

## Lasso Model

Here we use the key-words from Kings model to subset tweets. Next we use this subset to predict the party of the tweet and analyze the coefficients

In [90]:
#Extract most predictive keywords from model
keywords = list(set(bushfire.target_keywords[:1] + bushfire.reference_keywords[:1]))

#Initialize the 
dataprep = PrepareData()

data = pd.read_csv("data/final_tweet_df", index_col=0)
df = dataprep.compile_df(data, keywords)

In [91]:
#Aggregate the data on Party affiliation and day
df_agg = df.groupby(['created_at','party']).agg({"stemmed_text":" ".join}).reset_index()

In [92]:
df["party"].unique()

array(['Liberal Party of Australia',
       'Liberal National Party of Queensland', 'Australian Labor Party',
       'The Nationals', 'Centre Alliance', 'Nick Xenophon Team',
       'Australian Greens', 'Independent', "Katter's Australian Party"],
      dtype=object)

In [93]:
df_agg = df_agg.loc[df_agg["party"].isin(["Australian Greens","Australian Labor Party", "Liberal Party of Australia"])]

In [94]:
from sklearn.linear_model import LogisticRegressionCV
X, y = df_agg["stemmed_text"], df_agg["party"]

vect = CountVectorizer(max_features=10000)
X = vect.fit_transform(X)

clf = LogisticRegressionCV(cv=5, random_state=42, penalty="l1", solver="saga", 
                           max_iter = 10000, n_jobs = -1, verbose=10, 
                           class_weight = "balanced").fit(X, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 1 epochs took 0 seconds
convergence after 3 epochs took 0 seconds
convergence after 3 epochs took 0 seconds
convergence after 3 epochs took 0 seconds
convergence after 3 epochs took 0 seconds
convergence after 20 epochs took 0 seconds
convergence after 18 epochs took 0 seconds
convergence after 21 epochs took 0 seconds
convergence after 23 epochs took 0 seconds
convergence after 21 epochs took 0 seconds
convergence after 254 epochs took 8 seconds
convergence after 261 epochs took 8 seconds
convergence after 323 epochs took 9 seconds
convergence after 337 epochs took 10 seconds
convergence after 387 epochs took 11 seconds
convergence after 1220 epochs took 36 seconds
convergence after 1254 epochs took 38 seconds
convergence after 1312 epochs took 40 seconds
convergence after 1604 epochs took 46 seconds
convergence after 2260 epochs took 63 seconds
convergence after 1516 epochs took 58 seconds
convergence after 1643 epochs took 61 seconds
convergence after 1452 epochs t

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.4min remaining:  5.1min


convergence after 22 epochs took 4 seconds
convergence after 3 epochs took 0 seconds
convergence after 2 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  3.4min remaining:  2.3min


convergence after 265 epochs took 32 seconds
convergence after 168 epochs took 17 seconds
convergence after 21 epochs took 5 seconds
convergence after 41 epochs took 7 seconds
convergence after 3 epochs took 1 seconds
convergence after 2 epochs took 0 seconds
convergence after 4 epochs took 1 seconds
convergence after 2 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.6min finished


convergence after 317 epochs took 46 seconds


In [95]:
def marginal_effect(model):
    """
    This function takes the multinomial logistic model and calculates the marginal effect
    of one added occurance of word k in post i. 
    """
    
    
    # Get predictions and coefficients
    preds = model.predict_proba(X) # Shape discplines * N posts i.e. (6* approx. 246000)
    coefs = model.coef_ # Shape discplines * N word tokens i.e. (6*10000))
    
    # Define the denominator to calculate predicted 
    # probability of post i, for each field
   
    denominator = sum([np.exp(preds[:,label]) for label in range(len(model.classes_))])
    
    # Calculate probability of each post i being about each field
    probas = [(np.exp(preds[:,label]) / denominator) for label in range(len(model.classes_))]
    
    # initiate 
    MEs = []
    
    for index, label_proba in tqdm(enumerate(probas)):
        
        temp = []
        for beta_k in coefs[index]:
            p_sum = (label_proba* (1-label_proba)).sum()
            me = round((1 / len(probas[0])) * beta_k * p_sum, 3)
            temp.append(me)
        
        MEs.append(temp)
    
    X_tokens = vect.get_feature_names()
    df_data = {label:MEs[i] for i, label in enumerate(clf.classes_)}
    df_data["word"] = X_tokens #Add word tokens
    
    me_df = pd.DataFrame(df_data)
    
    
    return me_df

In [96]:
me_df = marginal_effect(clf)

3it [00:00,  6.62it/s]


In [99]:
me_df.sort_values("Australian Greens", ascending=False).head(20)

Unnamed: 0,Australian Greens,Australian Labor Party,Liberal Party of Australia,word
3867,0.211,-0.032,-0.041,green
1713,0.143,-0.008,-0.042,coal
1669,0.126,-0.002,-0.162,climat
3811,0.111,-0.013,-0.015,gov
6743,0.072,-0.011,-0.028,pm
6999,0.058,-0.023,-0.01,protest
7026,0.053,-0.01,-0.025,public
6577,0.051,-0.001,-0.065,peopl
7529,0.046,-0.02,-0.009,right
2163,0.041,-0.0,-0.042,crisi
