# Topic Classification

In this notebook we use the output from the topic model and our immersion journal/manual to subset a training set of tweets about the Australian Bushfires. 

In [1]:
#Importing relevant packages
import numpy as np 
import pandas as pd
from tqdm import tqdm #to create a progress bar
#Machine learning packages
from sklearn.linear_model import LogisticRegression
#Packages to create DFM
from sklearn.feature_extraction.text import CountVectorizer
#Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns
#Word embeddings
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
#Gary King et. al key-words
from keyword_algorithm import *
#Remove unwarranted warnings
pd.options.mode.chained_assignment = None  # default='warn'

## Implementation of Gary Kings semi-automated keyword retrival

In [2]:
data = pd.read_csv("data/final_df.csv", index_col=0)
#Subset period June 2019 – May 2020
#data = data.loc[(data["created_at"] >= "2019-06-01") & (data["created_at"] <= "2020-06-01")]
#data["id"] = data.index
data = data.dropna(subset = ["lemmas"]).reset_index(drop = True)
data["index_col"] = data.index
#data.to_csv("data/query_df.csv")

In [81]:
model_dir = "data/wiki-news-300d-1M.vec"
fasttext = KeyedVectors.load_word2vec_format(model_dir)

In [338]:
class QueryBuilder:
    
    def __init__(self, emb_model):
        
        self.query = Keywords()
        self.query.LoadDataset('data/query_df.csv', text_colname='lemmas', 
                               date_colname="created_at", id_colname="index_col")
        self.we = emb_model
        
    def get_query(self, keywords):
        """
        Loops over generated keywords
        """
            
            keepers = set()
            
            for keyword in keywords["accepted_keys"]:
                inp = input(f"Should {keyword} go in ORs, ANDs")
                if inp == "ORs":
                    keepers.add(keyword)
                elif inp == "ANDs":
                    for keyword2 in keywords["accepted_keys"]:
                        if keyword2 != keyword:
                            inp2 = input(f"keep ({keyword} AND {keyword2}), yes or no?")
                            if inp2 == "yes":
                                keepers.add((f"(?=.*{keyword})(?=.*{keyword2})"))
            
                    for nokey in keywords["nontarget_keys"]:
                        inp3 = input(f"keep ({keyword} AND NOT {nokey}), yes or no?")
                        if inp3 == "yes":
                            keepers.add((f"(^(?!.*{nokey})(?=.*{keyword})"))
            return keepers
    
    def get_keywords(self, its = 2, top_n = 10, refkeys = [], tarkeys = []):
        
        
        accepted_keywords = []
        rejected_keywords = []
        nontarget_keywords = []
        
        #Begin loop for mining search set
        for it in range(its):
            print("-"*66)
            print(f"STARTING ITERATION: {it}!")
            if it == 0:
                print("INITIAL REFERENCE KEYS: {refkeys} \n INITIAL TARGET KEYS: {tarkeys}")
            print("-"*66)
            
            #Build reference set of tweets
            self.query.ReferenceSet(any_words=refkeys, date_start="2019-06-01", date_end="2020-05-30")
            if it > 0:
                #fit model on searchset and find target
                self.query.SearchSet(any_words= accepted_keywords, date_start="2019-06-01", date_end="2020-05-30")
            else:
                self.query.SearchSet(any_words = tarkeys, date_start="2019-06-01", date_end="2020-05-30")
            
            self.query.ProcessData(stem = False, keep_twitter_symbols=False,
                                   remove_wordlist=refkeys)
            self.query.ReferenceKeywords()
            self.query.ClassifyDocs(min_df=10, ref_trainprop=1, algorithms=['nbayes', 'logit'])
            self.query.FindTargetSet()
            self.query.FindKeywords()
            #Extract target keywords from model
            target_keywords = self.query.target_keywords[:top_n]
            #Also get the reference set keywords
            target_keywords += self.query.reference_keywords[:top_n]
            for nonkey in self.query.nontarget_keywords[:100]:
                if nonkey not in nontarget_keywords:
                    nontarget_keywords.append(nonkey)
            
            for keyword in target_keywords:
                if keyword in accepted_keywords or keyword in rejected_keywords:
                    continue
                else:
                    inp = input(f"Keep {keyword.upper()} yes or no?")
                    if inp == "yes":
                        accepted_keywords.append(keyword)
                
                        #get similar keywords through most similar pretrained embeddings
                        inp2 = input(f"Look at {keyword.upper()}'s most similar word embeddings, yes or no?")
                        if inp2 == "yes":
                            try:
                                embeddings = [emb[0] for emb in self.we.most_similar(keyword)]
                                for emb in embeddings:
                                    if emb.lower() in accepted_keywords or emb.lower() in rejected_keywords:
                                        continue
                                    else:
                                        inp3 = input(f"Keep embedding {emb.upper()} yes or no?")
                                        if inp3 == "yes":
                                            accepted_keywords.append(emb)
                                        elif inp3 =="no":
                                            rejected_keywords.append(emb)
                            except:
                                print(f"{keyword.upper()} embedding not present in Model!")
                                pass
                        elif inp2 == "no":
                            pass
                    elif inp == "no":
                        rejected_keywords.append(keyword)
                        
            #Add custom keyword(s)
            inp4 = input(f"Do you wish to add any further keywords? If yes, Type keyword: ")
            if inp4:
                if isinstance(inp4, list):
                    [accepted_keywords.append(key) for key in inp4]
                else:
                     accepted_keywords.append(inp4)
            else:
                pass
            
            print("-"*66)
            print(" "*20, f"CURRENT KEYWORDS AFTER ITTERATION {it}")
            print("-"*66)
            print(f"ACCEPTED: \n {accepted_keywords}")
            print(f"REJECTED: \n {rejected_keywords}")
            
        
        keywords = {"accepted_keys":accepted_keywords, "rejected_keys":rejected_keywords,"nontarget_keys":nontarget_keywords}
        
        return keywords
        
        
        

In [339]:
query = QueryBuilder(fasttext)

Keyword object initialized.
Loaded corpus of size 148540 in 3.0 seconds.


In [None]:
keywords = query.get_keywords(2, top_n=10, refkeys=["#bushfire", "#bushfires", "bushfire", "bushfires"], tarkeys = ["fire"])

In [250]:
query = "|".join(keywords["accepted_keys"])

## Lasso Model

Here we use the key-words from Kings model to subset tweets. Next we use this subset to predict the party of the tweet and analyze the coefficients

In [149]:
class MlogitMargins:
    """
    Calculates marginal effect of multinomial logit coefficients with bootstraped confidence interval.
    See https://github.com/alicehwu/gendered_language/blob/master/gendered_language_2018.pdf for a 
    reference on a similar approach but with binary classification.
    """
    def __init__(self, X, y):
        #Define data
        self.vect = CountVectorizer(max_features=10000, ngram_range=(1,2))
        self.X = self.vect.fit_transform(X)
        self.y = y
        print("Fitting model and calculating margins...")
        #Fit model and calculate average marginal effects
        self.margins, self.fitted_model = self.avg_margins(self.X, self.y)

    def bootstrap_ci(self, alpha = 0.05, n_samples = 500, sample_prop = 0.4):
        """
        Uses bootstraping to calculate confidence interval around average marginal effects estimate.
        -------------
        Arguments:
            - alpha: deterimines confidence level of the interval
            - n_samples: amount of bootstrap samples
            - sample_prop: bootstramp sample size as proportion of original sample
        -------------
        Return:
            - Pandas dataframe with confidence interval for average marginal effect
              of each coefficient.
        """
        statistics = [] #List for bootstrap results
        
        #Define the bootstrap sample size based on proportion of total
        n_size = int(self.X.shape[0] * sample_prop)
        print(f"Number of obs in bootstrap samples {n_size}...")
        for i in tqdm(range(n_samples)):

            #Draw random sample from X and y with replacement
            idx = np.random.choice(np.arange(self.X.shape[0]), n_size, replace=True)
            X_sample = self.X[idx]
            y_sample = self.y[idx]
            #Calculate marginal effect for sample
            margins = self.avg_margins(X_sample, y_sample)[0]
            statistics.append(margins)
        
        #Join the resulting dataframes of margins
        statistics = pd.concat(statistics)
        print(f"Boostraping Done. Calculating {1-alpha}% confidence interval...")
        start = time.time()
        #From bootstrap results get lower and upper CI limits based on alpha
        statistics = statistics.groupby("token").quantile([alpha, 1-alpha]).reset_index()        
        print("Time to calculate: ", time.time() - start)

        return statistics
        

    def fit_model(self, X, y, max_iter = 10000, penalty = "none", class_weight = "balanced",
                  verbose = False, fit_intercept = True, multi_class = "multinomial"):
        
        """
        Fits sklearn multinomial logistic model on data. Default penalty set to "none" for unbaised estimators.
        """

        mlogit = LogisticRegression(random_state=42, penalty=penalty, solver="saga", 
                                    max_iter = max_iter, class_weight = class_weight,
                                    fit_intercept=fit_intercept, multi_class="ovr",
                                    verbose=verbose).fit(X, y) #   

        return mlogit

    def avg_margins(self, X, y):
        """
        Calculates average marginal effect of coefficients in multinomial logit model, where each coefficient is 
        a word token and is calculated as:
        avg_margin_jk = beta_jk * 1/N * sum{P(y_i = J) * 1-P(y_i = J)} for each token k and class j.
        See: https://math.stackexchange.com/questions/863258/deriving-marginal-effects-in-multinomial-logit-model
        
        """
        #Fit the model
        model = self.fit_model(X, y)
        #Get probabilities for each obs i belonging to class j. shape = j * N
        probas = model.predict_proba(X)
        #Get coefficients. Shape j_classes * k_coefficients
        betas = model.coef_

        margins = {}
        #Loop over each class 
        for class_j in range(len(model.classes_)):
            #Extract corresponging betas. shape 1*k
            for beta_jk in betas[class_j]:
                
                #Calculate avg margins for the jth class and kth beta (token)
                margins_jk = beta_jk * 1/probas.shape[0] * (probas[:,class_j] * (1 - probas[:,class_j])).sum()
                
                #Append avg margins to dictionary
                if model.classes_[class_j] not in margins.keys():
                    margins[model.classes_[class_j]] = [margins_jk]
                else:
                    margins[model.classes_[class_j]].append(margins_jk)
        
        #Extract the token name corresponding to the avg margins   
        margins["token"] = self.vect.get_feature_names()
        margins_df = pd.DataFrame(margins)
        
        return margins_df, model
    

In [140]:
#query = "|".join(keywords["accepted_keys"])
#Extract most predictive keywords from model
mlogit_data = data.loc[(data["lemmas"].str.contains("bushfire|bushfires|#bushfire|#bushfires|disaster|kaoala|fire|flames|firemen") == True) & 
                       (data["lemmas"].str.contains("corona|covid") == False) & 
                       (data["created_at"] >= "2019-06-01") & 
                       (data["created_at"] <= "2020-06-01")]

mlogit_data = mlogit_data.loc[mlogit_data["party"].isin(["Australian Greens",
                                                         "Australian Labor Party", 
                                                         "Liberal Party of Australia",
                                                          ])].dropna(subset = ["lemmas"]).reset_index(drop = True)

In [150]:
X, y = mlogit_data["lemmas"], mlogit_data["party"]
mlogit = MlogitMargins(X, y)

Fitting model and calculating margins...


In [151]:
mlogit.margins.sort_values("Liberal Party of Australia", ascending=False).head(15)

Unnamed: 0,Australian Greens,Australian Labor Party,Liberal Party of Australia,token
3158,-0.00253,-0.014292,0.005408,join
367,-0.006856,-0.010107,0.004438,auspol
1897,-0.002164,-0.011049,0.004235,ensure
6022,-0.000277,-0.01002,0.004034,pm scottmorri
924,-0.001525,-0.009901,0.003847,canada
1371,-0.001331,-0.00935,0.003582,courage
1542,-0.000395,-0.010631,0.003562,defence force
64,-0.001,-0.009579,0.003561,additional
4490,-0.001865,-0.008943,0.0035,network
6322,-0.00074,-0.0092,0.003465,present special


In [155]:
mlogit_ci = mlogit.bootstrap_ci(alpha=0.05, n_samples=25)

  0%|          | 0/25 [00:00<?, ?it/s]

Number of obs in bootstrap samples 880...


100%|██████████| 25/25 [01:10<00:00,  2.80s/it]


Boostraping Done. Calculating 0.95% confidence interval...
Time to calculate:  0.403839111328125


In [157]:
mlogit_ci.head(10)

Unnamed: 0,token,level_1,Australian Greens,Australian Labor Party,Liberal Party of Australia
0,abandon,0.05,-0.000637,3.1e-05,-0.001071
1,abandon,0.95,-7e-06,0.001977,-1.7e-05
2,abandon morrison,0.05,-2.2e-05,0.0,-0.000259
3,abandon morrison,0.95,0.0,0.000866,0.0
4,abate,0.05,-0.000114,0.0,-0.00015
5,abate,0.95,0.0,0.000387,0.0
6,abate community,0.05,-0.000114,0.0,-0.00015
7,abate community,0.95,0.0,0.000387,0.0
8,abc,0.05,-0.000932,-0.000534,-0.001135
9,abc,0.95,-6.1e-05,0.002493,0.000369
