# Topic Classification

In this notebook we use the output from the topic model and our immersion journal/manual to subset a training set of tweets about the Australian Bushfires. 

In [2]:
#Importing relevant packages
#Standard library
import time #to time the bootstrap loop in MlogitMargins
import numpy as np 
import pandas as pd
from tqdm import tqdm #to create a progress bar
#Machine learning packages
from sklearn.linear_model import LogisticRegression
#Packages to create DFM
from sklearn.feature_extraction.text import CountVectorizer
#Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns
#Word embeddings
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
#Gary King et. al key-words
from keyword_algorithm import *
#Remove unwarranted warnings
pd.options.mode.chained_assignment = None  # default='warn'

## Implementation of Gary King's semi-automated keyword retrieval

In [3]:
#Read the tweet data, keep only rows that have lemmas and renumber the rows from 0
data = (
    pd.read_csv("data/final_df.csv", index_col=0)
    .dropna(subset=["lemmas"])
    .reset_index(drop=True)
)
#Expose the new row number as a column (QueryBuilder passes "index_col" as id_colname)
data["index_col"] = data.index
#Uncomment to write data/query_df.csv, the path QueryBuilder loads its dataset from
#data.to_csv("data/query_df.csv")

In [4]:
#Pretrained fastText vectors in word2vec text format.
#Download model from https://fasttext.cc/docs/en/english-vectors.html
model_dir = "data/wiki-news-300d-1M.vec"
fasttext = KeyedVectors.load_word2vec_format(fname=model_dir, binary=False)

In [9]:
class QueryBuilder:
    """
    Interactive helper around the ``Keywords`` object from ``keyword_algorithm``.
    It repeatedly runs the keyword algorithm and asks the user, through ``input()``,
    which of the suggested keywords (and optionally their nearest word embeddings)
    should be kept for building a boolean query.
    """
    
    def __init__(self, emb_model):
        """
        arguments:
            - emb_model: word embedding model exposing ``most_similar(word)``
              (the notebook passes a gensim KeyedVectors instance)
        """
        
        self.query = Keywords()
        #Load the data. Change path if necessary
        path = 'data/query_df.csv'
        self.query.LoadDataset(path, text_colname='lemmas', 
                    date_colname="created_at", id_colname="index_col")
        #Word embeddings model used to suggest similar keywords
        self.we = emb_model
    
    @staticmethod
    def _ask(prompt):
        """Shows prompt and returns the answer without surrounding whitespace, lower-cased."""
        return input(prompt).strip().lower()
    
    def _similar_words(self, keyword):
        """
        Returns the lower-cased, de-duplicated nearest neighbours of keyword in the
        embedding model, or an empty list when keyword is not in its vocabulary.
        """
        #Only the lookup is guarded, so errors raised while prompting are not swallowed
        try:
            neighbours = self.we.most_similar(keyword)
        except KeyError:
            print(f"{keyword.upper()} embedding not present in Model!")
            return []
        
        words = []
        for neighbour in neighbours:
            #Lower-case so the stored keyword matches the form used in the duplicate checks
            word = neighbour[0].lower()
            if word not in words:
                words.append(word)
        return words
    
    def _review_keyword(self, keyword, accepted_keywords, rejected_keywords):
        """Asks the user about one keyword (and optionally its embeddings); updates both lists in place."""
        answer = self._ask(f"Keep {keyword.upper()} yes or no?")
        if answer == "no":
            rejected_keywords.append(keyword)
            return
        if answer != "yes":
            #Any other answer leaves the keyword undecided; it may be offered again later
            return
        
        accepted_keywords.append(keyword)
        #get similar keywords through most similar pretrained embeddings
        if self._ask(f"Look at {keyword.upper()}'s most similar word embeddings, yes or no?") != "yes":
            return
        
        for emb in self._similar_words(keyword):
            if emb in accepted_keywords or emb in rejected_keywords:
                continue
            verdict = self._ask(f"Keep embedding {emb.upper()} yes or no?")
            if verdict == "yes":
                accepted_keywords.append(emb)
            elif verdict == "no":
                rejected_keywords.append(emb)
            
    def get_keywords(self, its = 2, top_n = 10, refkeys = [], tarkeys = [], algorithms = ['nbayes', 'logit'], 
                     date_start = "2019-06-01", date_end = "2020-05-30"):
        """
        Loops over King. et. al algorithm to extract relevant keywords used for building a 
        boolean query to subset relevant Tweets in the dataset.
        ---------
        arguments:
            - its: Iterations to run the algorithm
            - top_n: integer of how many of the most predictive keywords to extract in each iteration
            - refkeys: list of initial keywords used to create reference set of tweets
            - tarkeys: list of initial keywords used to limit the search set
            - algorithms: list of classifiers to run for extracting keywords 
            - date_start: y/m/d of start date for relevant tweets
            - date_end: y/m/d of end date for relevant tweets
        -----------    
        returns:
            - dictionary of accepted, rejected and nontarget keywords
        -----------
        notes:
            - The method is interactive: it blocks on input() for every candidate keyword.
            - Several custom keywords can be added at the end of an iteration by
              separating them with commas.
        """
        
        accepted_keywords = []
        rejected_keywords = []
        nontarget_keywords = []
        
        #Begin loop for mining search set
        for it in range(its):
            print("-"*66)
            print(f"STARTING ITERATION: {it}!")
            if it == 0:
                print(f"INITIAL REFERENCE KEYS: {refkeys} \n INITIAL TARGET KEYS: {tarkeys}")
            print("-"*66)
            
            #Build reference set of tweets
            self.query.ReferenceSet(any_words=refkeys, date_start=date_start, date_end=date_end)
            
            #Use accepted keys as search keys if not the first iteration
            search_words = accepted_keywords if it > 0 else tarkeys
            self.query.SearchSet(any_words = search_words, 
                                 date_start=date_start, date_end=date_end)
            
            #Run King algorithm to find keywords.
            self.query.ProcessData(stem = False, keep_twitter_symbols=False,
                                   remove_wordlist=refkeys)
            self.query.ReferenceKeywords()
            self.query.ClassifyDocs(min_df=5, ref_trainprop=1, algorithms=algorithms)
            self.query.FindTargetSet()
            self.query.FindKeywords()
            
            #Candidates: top target keywords followed by top reference keywords
            #(slices are copies, so the algorithm's own lists are not modified)
            target_keywords = self.query.target_keywords[:top_n]
            target_keywords += self.query.reference_keywords[:top_n]
            
            #Append unique nontarget keywords to list of nontarget keys
            for nonkey in self.query.nontarget_keywords[:100]:
                if nonkey not in nontarget_keywords:
                    nontarget_keywords.append(nonkey)
            
            #Loop over each relevant keyword from reference and found target keywords
            for keyword in target_keywords:
                #Skip keywords that have already been accepted or rejected
                if keyword in accepted_keywords or keyword in rejected_keywords:
                    continue
                self._review_keyword(keyword, accepted_keywords, rejected_keywords)
                        
            #Add custom keyword(s) in the end of the loop. input() always returns a string,
            #so several keywords are given as a comma separated list
            inp4 = input(f"Do you wish to add any further keywords? If yes, Type keyword: ")
            for key in inp4.split(","):
                key = key.strip()
                if key and key not in accepted_keywords:
                    accepted_keywords.append(key)
            
            print("-"*66)
            print(" "*20, f"CURRENT KEYWORDS AFTER ITTERATION {it}")
            print("-"*66)
            print(f"ACCEPTED: \n {accepted_keywords}")
            print(f"REJECTED: \n {rejected_keywords}")
            
        
        keywords = {"accepted_keys":accepted_keywords, "rejected_keys":rejected_keywords,"nontarget_keys":nontarget_keywords}
        
        return keywords
        
        
        

In [10]:
#Build the interactive query helper on top of the fastText vectors loaded earlier
query = QueryBuilder(fasttext)

Keyword object initialized.
Loaded corpus of size 148540 in 2.21 seconds.


In [131]:
#Show the dictionary of accepted, rejected and nontarget keywords.
#NOTE(review): `keywords` is assigned by the query.get_keywords(...) cell (In [130]) that
#appears after this one, so on a fresh kernel that cell has to run first.
keywords

{'accepted_keys': ['support',
  'supporting',
  'suppport',
  'supported',
  'recovery',
  'affect',
  'morrison',
  'koala',
  'fire',
  'fires',
  'flames',
  'three-alarm',
  'blaze'],
 'rejected_keys': ['oppose',
  'backing',
  'suport',
  'supports',
  'supprt',
  'help',
  'government',
  'chief',
  'rebuild',
  'business',
  'community',
  'announce',
  'today',
  'need',
  'auspol',
  'people',
  'two-alarm',
  'four-alarm'],
 'nontarget_keys': ['canberratimes',
  'coal',
  'via',
  'station',
  'north',
  'ban',
  'contain',
  'situation',
  'front',
  'volunteer',
  'activity',
  'qld',
  'power',
  'near',
  'total',
  'train',
  'tomorrow',
  'woman',
  'hour',
  'fight',
  'morning',
  'gas',
  'right',
  'burn',
  'declare',
  'energy',
  'unprecedented',
  'think',
  'yesterday',
  'hot',
  'hill',
  'friend',
  'close',
  'place',
  'time',
  'news',
  'firefighter',
  'green',
  'rain',
  'congratulation',
  'gov',
  'early',
  'line',
  'dangerous',
  'wind',
  'fight

In [130]:
#Interactive keyword mining: bushfire terms build the reference set, "fire" the first search set
keywords = query.get_keywords(
    its=2,
    top_n=10,
    refkeys=["#bushfire", "#bushfires", "bushfire", "bushfires"],
    tarkeys=["fire"],
)

------------------------------------------------------------------
STARTING ITERATION: 0!
INITIAL REFERENCE KEYS: {refkeys} 
 INITIAL TARGET KEYS: {tarkeys}
------------------------------------------------------------------
Loaded reference set of size 1682 in 2.09 seconds.
Loaded search set of size 1368 in 0.52 seconds.
Time to process corpus: 0.49 seconds

4159 reference set keywords found.

Document Term Matrix: 3050 by 1495 with 37198 nonzero elements

Time to get document-term matrix: 0.05 seconds

Ref training size: 1682; Search training size: 451; Training size: 2133; Test size: 1368

Time for Naive Bayes: 0.01 seconds
Time for Logit: 0.04 seconds
473 documents in target set
895 documents in non-target set
253 target set keywords found
185 non-target set keywords found


Keep SUPPORT yes or no? yes
Look at SUPPORT's most similar word embeddings, yes or no? yes
Keep embedding SUPPORTING yes or no? yes
Keep embedding SUPPPORT yes or no? yes
Keep embedding SUPPORTED yes or no? yes
Keep embedding OPPOSE yes or no? no
Keep embedding BACKING yes or no? no
Keep embedding SUPORT yes or no? no
Keep embedding SUPPORTS yes or no? no
Keep embedding SUPPRT yes or no? no
Keep embedding HELP yes or no? no
Keep GOVERNMENT yes or no? no
Keep RECOVERY yes or no? yes
Look at RECOVERY's most similar word embeddings, yes or no? no
Keep AFFECT yes or no? yes
Look at AFFECT's most similar word embeddings, yes or no? no
Keep MORRISON yes or no? yes
Look at MORRISON's most similar word embeddings, yes or no? no
Keep CHIEF yes or no? no
Keep REBUILD yes or no? no
Keep BUSINESS yes or no? no
Keep COMMUNITY yes or no? no
Keep ANNOUNCE yes or no? no
Keep TODAY yes or no? no
Keep NEED yes or no? no
Keep AUSPOL yes or no? no
Keep PEOPLE yes or no? no
Do you wish to add any further k

------------------------------------------------------------------
                     CURRENT KEYWORDS AFTER ITTERATION 0
------------------------------------------------------------------
ACCEPTED: 
 ['support', 'supporting', 'suppport', 'supported', 'recovery', 'affect', 'morrison', 'koala']
REJECTED: 
 ['oppose', 'backing', 'suport', 'supports', 'supprt', 'help', 'government', 'chief', 'rebuild', 'business', 'community', 'announce', 'today', 'need', 'auspol', 'people']
------------------------------------------------------------------
STARTING ITERATION: 1!
------------------------------------------------------------------
Loaded reference set of size 1682 in 1.89 seconds.
Loaded search set of size 18713 in 4.11 seconds.
Time to process corpus: 2.94 seconds

4159 reference set keywords found.

Document Term Matrix: 20395 by 4988 with 295035 nonzero elements

Time to get document-term matrix: 0.33 seconds

Ref training size: 1682; Search training size: 6175; Training size: 7857; Te

Keep FIRE yes or no? yes
Look at FIRE's most similar word embeddings, yes or no? yes
Keep embedding FIRES yes or no? yes
Keep embedding FLAMES yes or no? yes
Keep embedding THREE-ALARM yes or no? yes
Keep embedding TWO-ALARM yes or no? no
Keep embedding FOUR-ALARM yes or no? no
Keep embedding BLAZE yes or no? yes
Keep embedding FIVE-ALARM yes or no? 
Keep embedding FIRE. yes or no? 
Keep embedding CONFLAGRATION yes or no? 
Keep TYFYS yes or no? 
Keep DISASTER yes or no? 
Keep AREA yes or no? 
Keep YOURADF yes or no? 
Keep LOVEGIPPSLAND yes or no? 
Keep FLOOD yes or no? 
Do you wish to add any further keywords? If yes, Type keyword:  


------------------------------------------------------------------
                     CURRENT KEYWORDS AFTER ITTERATION 1
------------------------------------------------------------------
ACCEPTED: 
 ['support', 'supporting', 'suppport', 'supported', 'recovery', 'affect', 'morrison', 'koala', 'fire', 'fires', 'flames', 'three-alarm', 'blaze']
REJECTED: 
 ['oppose', 'backing', 'suport', 'supports', 'supprt', 'help', 'government', 'chief', 'rebuild', 'business', 'community', 'announce', 'today', 'need', 'auspol', 'people', 'two-alarm', 'four-alarm']


## Mlogit Model - Average Marginal Effect Estimation 

Here we use the keywords from King's model to subset tweets. Next, we use this subset to predict the party of the tweet and analyze the coefficients.

In [118]:
class MlogitMargins:
    """
    Calculates marginal effect of multinomial logit coefficients with bootstraped confidence interval.
    See https://github.com/alicehwu/gendered_language/blob/master/gendered_language_2018.pdf for a 
    reference on a similar approach but with binary classification.

    Attributes set by __init__:
        - vect: fitted CountVectorizer (uni- and bigrams, min_df = 5)
        - X: document-term matrix produced by vect
        - y: the labels exactly as passed in
        - margins: DataFrame of average margins, one column per class plus "token"
        - fitted_model: the LogisticRegression fitted on the full data
    """
    def __init__(self, X, y):
        """
        Arguments:
            - X: iterable of raw text documents (vectorised here)
            - y: class label for each document
        """
        #Define data
        self.vect = CountVectorizer(ngram_range=(1,2), min_df = 5)
        self.X = self.vect.fit_transform(X)
        self.y = y
        print("Fitting model and calculating margins...")
        #Fit model and calculate average marginal effects
        self.margins, self.fitted_model = self.avg_margins(self.X, self.y)
        
    def _feature_names(self):
        """Returns the vectorizer's token names across scikit-learn versions."""
        #get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out()
        if hasattr(self.vect, "get_feature_names_out"):
            return self.vect.get_feature_names_out()
        return self.vect.get_feature_names()

    def avg_margins(self, X, y):
        """
        Calculates average marginal effect of coefficients in multinomial logit model, where each coefficient is 
        a word token and is calculated as:
        avg_margin_jk = beta_jk * 1/N * sum_i{P(y_i = j) * (1 - P(y_i = j))} for each token k and class j.
        See: https://math.stackexchange.com/questions/863258/deriving-marginal-effects-in-multinomial-logit-model
        NOTE(review): this uses only the own-class term p_j * (1 - p_j); confirm that leaving out the
        cross-class terms of the multinomial marginal effect is intended.
        -----------
        Returns:
            - DataFrame with one row per token: a column of average margins for each class
              plus a "token" column
            - The fitted mlogit model 
        """
        #Fit the model
        model = self.fit_model(X, y)
        #Get probabilities for each obs i belonging to class j. Shape N * j_classes
        probas = model.predict_proba(X)
        #Get coefficients. Shape j_classes * k_coefficients
        betas = model.coef_
        #1/N * sum_i p_ij * (1 - p_ij) for every class j. Shape j_classes
        class_scale = (probas * (1 - probas)).mean(axis=0)

        margins = {}
        #One column of k margins per class
        for class_j, label in enumerate(model.classes_):
            margins[label] = betas[class_j] * class_scale[class_j]
        
        #Extract the token name corresponding to the avg margins   
        margins["token"] = self._feature_names()
        margins_df = pd.DataFrame(margins)
        
        return margins_df, model
    
    def fit_model(self, X, y, max_iter = 10000, penalty = "l1", class_weight = "balanced",
                  verbose = False, fit_intercept = True, multi_class = "multinomial",
                  C = 1):
        
        """
        Fits sklearn multinomial logistic model on data.
        All keyword arguments are passed on to LogisticRegression, which is always
        created with solver="saga" and random_state=42.
        -------------
        Returns:
            - The fitted LogisticRegression instance
        """

        mlogit = LogisticRegression(random_state=42, penalty=penalty, solver="saga", 
                                    max_iter = max_iter, class_weight = class_weight,
                                    fit_intercept=fit_intercept, multi_class=multi_class,
                                    verbose=verbose, C = C).fit(X, y)  

        return mlogit
    
    def bootstrap_ci(self, alpha = 0.05, n_samples = 500, sample_prop = 0.4, random_state = None):
        """
        Uses bootstraping to calculate confidence interval around average marginal effects estimate.
        -------------
        Arguments:
            - alpha: deterimines confidence level of the interval; the limits are the
              alpha/2 and 1 - alpha/2 quantiles, i.e. a (1 - alpha) interval
            - n_samples: amount of bootstrap samples
            - sample_prop: bootstramp sample size as proportion of original sample
            - random_state: optional seed for reproducible resampling. When None the
              global numpy random state is used
        -------------
        Return:
            - Pandas dataframe with confidence interval for average marginal effect
              of each coefficient.
        """
        statistics = [] #List for bootstrap results
        
        #Define the bootstrap sample size based on proportion of total
        n_obs = self.X.shape[0]
        n_size = int(n_obs * sample_prop)
        print(f"Number of obs in bootstrap samples {n_size}...")
        #Index labels positionally: self.y may be a pandas Series whose index is not 0..N-1
        y_values = np.asarray(self.y)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        start = time.time()
        for i in tqdm(range(n_samples)):

            #Draw random sample from X and y with replacement
            idx = rng.choice(np.arange(n_obs), n_size, replace=True)
            X_sample = self.X[idx]
            y_sample = y_values[idx]
            #Calculate marginal effect for sample (refits the model on the sample)
            margins = self.avg_margins(X_sample, y_sample)[0]
            statistics.append(margins)
        
        #Join the resulting dataframes of margins
        statistics = pd.concat(statistics)
        print(f"Boostraping Done. Calculating {100 * (1 - alpha):g}% confidence interval...")
        #From bootstrap results get lower and upper CI limits based on alpha
        statistics = statistics.groupby("token").quantile([alpha / 2, 1 - alpha / 2]).reset_index()        
        print("Time to complete: ", time.time() - start)

        return statistics
    

In [126]:
#NOTE: This data is just for testing - not final subset.
mlogit_data = data.loc[(data["lemmas"].str.contains("bushfire|bushfires|#bushfire|#bushfires|disaster|kaoala|fire|flames|firemen") == True) & 
                       (data["lemmas"].str.contains("corona|covid") == False) & 
                       (data["created_at"] >= "2019-06-01") & 
                       (data["created_at"] <= "2020-06-01")]

#Reduce classes to predict. Otherwise bad model performance.
mlogit_data = mlogit_data.loc[mlogit_data["party"].isin(["Australian Greens",
                                                         "Australian Labor Party", 
                                                         "Liberal Party of Australia",
                                                         "The Nationals",
                                                         "Centre Alliance"
                                                          ])].dropna(subset = ["lemmas"]).reset_index(drop = True)

In [127]:
#Features are the lemma strings, the target is the party label
X = mlogit_data["lemmas"]
y = mlogit_data["party"]
#Vectorises X, fits the multinomial logit and computes the average margins
mlogit = MlogitMargins(X, y)

Fitting model and calculating margins...




In [138]:
#The 20 tokens with the largest average margin for The Nationals
(
    mlogit.margins
    .sort_values(by="The Nationals", ascending=False)
    .head(20)
)

Unnamed: 0,Australian Greens,Australian Labor Party,Centre Alliance,Liberal Party of Australia,The Nationals,token
1167,-0.012168,-0.111829,-0.0022,-0.074854,0.120708,lovegippsland
1972,-0.013101,-0.084058,-0.001108,-0.072949,0.103549,tyfys
1615,-0.016448,-0.010526,-0.001566,-0.045663,0.058891,rf
880,-0.015504,-0.014656,-0.001676,-0.030716,0.049876,gippsland
2024,-0.00428,-0.051423,-0.000895,-0.028659,0.049726,waggawagga
682,-0.021562,-0.02905,-0.002463,-0.001055,0.044723,everything
1963,-0.008681,-0.025396,-0.001055,-0.028007,0.043208,tumbarumba
103,-0.001146,-0.028951,-0.001928,-0.029873,0.042858,anyone
1299,-0.000282,-0.05961,-0.001104,-0.017175,0.04238,near
1126,-0.02718,-0.001116,-0.001494,-0.011814,0.041288,link


In [None]:
#Bootstrap the confidence interval of the average margins with 25 resamples
#(every resample refits the model, so the number of samples drives the run time)
mlogit_ci = mlogit.bootstrap_ci(alpha=0.05, n_samples=25)

In [None]:
#Preview the first rows of the bootstrap confidence interval table
mlogit_ci.head(10)