# Topic Classification

In this notebook we use the output from the topic model and our immersion journal/manual to subset a training set of tweets about the Australian Bushfires. 

In [2]:
#Importing relevant packages
import numpy as np 
import pandas as pd
from tqdm import tqdm #to create a progress bar

#Load custom function for preprocessing the text

#Machine learning packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

#Packages to create DFM
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#Packages for cross-validation and parameter tuning
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

#Packages for getting model performance metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

#Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns

#
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None  # default='warn'

#Word embeddings
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

#Gary King et. al key-words
from keyword_algorithm import *

## Implementation of Gary Kings semi-automated keyword retrival

In [3]:
data = pd.read_csv("data/final_df.csv", index_col=0)
#Subset period June 2019 – May 2020
#data = data.loc[(data["created_at"] >= "2019-06-01") & (data["created_at"] <= "2020-06-01")]
#data["id"] = data.index
data = data.dropna(subset = ["lemmas"]).reset_index(drop = True)
data["index_col"] = data.index
#data.to_csv("data/query_df.csv")

In [81]:
model_dir = "data/wiki-news-300d-1M.vec"
fasttext = KeyedVectors.load_word2vec_format(model_dir)

In [338]:
class QueryBuilder:
    
    def __init__(self, emb_model):
        
        self.query = Keywords()
        self.query.LoadDataset('data/query_df.csv', text_colname='lemmas', 
                               date_colname="created_at", id_colname="index_col")
        self.we = emb_model
        
    def get_query(self, keywords):
            
            keepers = set()
            
            for keyword in keywords["accepted_keys"]:
                inp = input(f"Should {keyword} go in ORs, ANDs")
                if inp == "ORs":
                    keepers.add(keyword)
                elif inp == "ANDs":
                    for keyword2 in keywords["accepted_keys"]:
                        if keyword2 != keyword:
                            inp2 = input(f"keep ({keyword} AND {keyword2}), yes or no?")
                            if inp2 == "yes":
                                keepers.add((f"(?=.*{keyword})(?=.*{keyword2})"))
            
                    for nokey in keywords["nontarget_keys"]:
                        inp3 = input(f"keep ({keyword} AND NOT {nokey}), yes or no?")
                        if inp3 == "yes":
                            keepers.add((f"(^(?!.*{nokey})(?=.*{keyword})"))
            return keepers
    
    def get_keywords(self, its = 2, top_n = 10, refkeys = [], tarkeys = []):
        
        
        accepted_keywords = []
        rejected_keywords = []
        nontarget_keywords = []
        
        #Begin loop for mining search set
        for it in range(its):
            print("-"*66)
            print(f"STARTING ITERATION: {it}! INITIAL REFERENCE KEYS: {refkeys}")
            print("-"*66)
            
            #Build reference set of tweets
            self.query.ReferenceSet(any_words=refkeys, date_start="2019-06-01", date_end="2020-05-30")
            if it > 0:
                #fit model on searchset and find target
                self.query.SearchSet(any_words= accepted_keywords, date_start="2019-06-01", date_end="2020-05-30")
            else:
                self.query.SearchSet(any_words = tarkeys, date_start="2019-06-01", date_end="2020-05-30")
            
            self.query.ProcessData(stem = False, keep_twitter_symbols=False,
                                   remove_wordlist=refkeys)
            self.query.ReferenceKeywords()
            self.query.ClassifyDocs(min_df=10, ref_trainprop=1, algorithms=['nbayes', 'logit'])
            self.query.FindTargetSet()
            self.query.FindKeywords()
            #Extract target keywords from model
            target_keywords = self.query.target_keywords[:top_n]
            #Also get the reference set keywords
            target_keywords += self.query.reference_keywords[:top_n]
            for nonkey in self.query.nontarget_keywords[:100]:
                if nonkey not in nontarget_keywords:
                    nontarget_keywords.append(nonkey)
            
            for keyword in target_keywords:
                if keyword in accepted_keywords or keyword in rejected_keywords:
                    continue
                else:
                    inp = input(f"Keep {keyword.upper()} yes or no?")
                    if inp == "yes":
                        accepted_keywords.append(keyword)
                
                        #get similar keywords through most similar pretrained embeddings
                        inp2 = input(f"Look at {keyword.upper()}'s most similar word embeddings, yes or no?")
                        if inp2 == "yes":
                            try:
                                embeddings = [emb[0] for emb in self.we.most_similar(keyword)]
                                for emb in embeddings:
                                    if emb.lower() in accepted_keywords or emb.lower() in rejected_keywords:
                                        continue
                                    else:
                                        inp3 = input(f"Keep embedding {emb.upper()} yes or no?")
                                        if inp3 == "yes":
                                            accepted_keywords.append(emb)
                                        elif inp3 =="no":
                                            rejected_keywords.append(emb)
                            except:
                                print(f"{keyword.upper()} embedding not present in Model!")
                                pass
                        elif inp2 == "no":
                            pass
                    elif inp == "no":
                        rejected_keywords.append(keyword)
                        
            #Add custom keyword(s)
            inp4 = input(f"Do you wish to add any further keywords? If yes, Type keyword: ")
            if inp4:
                if isinstance(inp4, list):
                    [accepted_keywords.append(key) for key in inp4]
                else:
                     accepted_keywords.append(inp4)
            else:
                pass
            
            print("-"*66)
            print(" "*20, f"CURRENT KEYWORDS AFTER ITTERATION {it}")
            print("-"*66)
            print(f"ACCEPTED: \n {accepted_keywords}")
            print(f"REJECTED: \n {rejected_keywords}")
            
        
        keywords = {"accepted_keys":accepted_keywords, "rejected_keys":rejected_keywords,"nontarget_keys":nontarget_keywords}
        
        return keywords
        
        
        

In [339]:
query = QueryBuilder(fasttext)

Keyword object initialized.
Loaded corpus of size 148540 in 3.0 seconds.


In [340]:
keywords = query.get_keywords(2, top_n=10, refkeys=["#bushfire", "#bushfires", "bushfire", "bushfires"], tarkeys = ["fire"])

------------------------------------------------------------------
STARTING ITERATION: 0! INITIAL REFERENCE KEYS: ['#bushfire', '#bushfires', 'bushfire', 'bushfires']
------------------------------------------------------------------
Loaded reference set of size 1682 in 1.63 seconds.
Loaded search set of size 1368 in 0.44 seconds.
Time to process corpus: 0.38 seconds

4159 reference set keywords found.

Document Term Matrix: 3050 by 857 with 33018 nonzero elements

Time to get document-term matrix: 0.05 seconds

Ref training size: 1682; Search training size: 451; Training size: 2133; Test size: 1368

Time for Naive Bayes: 0.0 seconds
Time for Logit: 0.03 seconds
479 documents in target set
889 documents in non-target set
260 target set keywords found
178 non-target set keywords found


Keep GOVERNMENT yes or no? aa


KeyboardInterrupt: Interrupted by user

Keep SUPPORT yes or no? aa


In [250]:
query = "|".join(keywords["accepted_keys"])

## Lasso Model

Here we use the key-words from Kings model to subset tweets. Next we use this subset to predict the party of the tweet and analyze the coefficients

In [172]:
class MlogitMargins:
    """
    Calculates marginal effect of logit coefficients with bootstraped confidence interval.
    """
    def __init__(self, X, y):
        
        #Define data
        self.vect = CountVectorizer(max_features=10000, ngram_range=(1,2))
        self.X = self.vect.fit_transform(X)
        self.y = y
        print("Fitting model and calculating margins...")
        self.margins, self.fitted_model = self.marginal_effect(self.X, self.y)

    def bootstrap(self, alpha = 0.05, n_iter = 500):
        
        
        statistics = []
        n_size = int(self.X.shape[0] * 0.60)
        print(f"Observations in bootstrap sample {n_size}.")
        for i in tqdm(range(n_iter)):

            idx = np.random.choice(np.arange(self.X.shape[0]), n_size, replace=True)
            X_sample = self.X[idx]
            y_sample = self.y[idx]
            
            margins = self.marginal_effect(X_sample, y_sample)[0]
            statistics.append(margins)
        
        #Join the resulting dataframes
        statistics = pd.concat(statistics)
        print(f"Boostraping Done. Calculating {alpha}-{1-alpha} confidence interval...")
        start = time.time()
        statistics = statistics.groupby("word").quantile([alpha, 1-alpha]).reset_index()
        
        #statistics = statistics.agg
        print("Time to calculate: ", time.time() - start)

        return statistics
        

    def fit_model(self, X, y):
        
        logit = LogisticRegression(random_state=42, penalty="none", solver="saga", 
                                   max_iter = 10000,  class_weight = "balanced").fit(X, y)

        return logit
        
    def marginal_effect(self, X, y):
        
        
        model = self.fit_model(X, y)
        
        # Get predictions and coefficients
        preds = model.predict_proba(X) # Shape discplines * N posts i.e. (6* approx. 246000)
        coefs = model.coef_ # Shape discplines * N word tokens i.e. (6*10000))

        # Define the denominator to calculate predicted 
        # probability of post i, for each field

        denominator = sum([np.exp(preds[:,label]) for label in range(len(model.classes_))])

        # Calculate probability of tweet i beloning to party k
        probas = [(np.exp(preds[:,label]) / denominator) for label in range(len(model.classes_))]

        # initiate 
        MEs = []

        for index, label_proba in tqdm(enumerate(probas)):

            temp = []
            for beta_k in coefs[index]:
                p_sum = (label_proba* (1-label_proba)).sum()
                me = round((1 / len(probas[0])) * beta_k * p_sum, 3)
                temp.append(me)

            MEs.append(temp)

        X_tokens = self.vect.get_feature_names()
        df_data = {label:MEs[i] for i, label in enumerate(model.classes_)}
        df_data["word"] = X_tokens #Add word tokens

        me_df = pd.DataFrame(df_data)


        return me_df, model
    

In [85]:
#query = "|".join(keywords["accepted_keys"])
#Extract most predictive keywords from model
lasso_data = data.loc[(data["lemmas"].str.contains("bushfire|bushfires|#bushfire|#bushfires|disaster|kaoala|fire|flames|firemen") == True) & 
                      (data["lemmas"].str.contains("corona|covid") == False) & 
                      (data["created_at"] >= "2019-06-01") & 
                      (data["created_at"] <= "2020-06-01")]

#df_agg = lasso_data.groupby(["created_at", "name"]).agg({"lemmas":" ".join, "party":"first"}).reset_index()
df_agg = df_agg.loc[df_agg["party"].isin(["Australian Greens","Australian Labor Party", "Liberal Party of Australia"])].dropna().reset_index(drop = True)

In [1]:
X, y = df_agg["lemmas"], df_agg["party"]
Mlogit = MlogitMargins(X, y)

NameError: name 'df_agg' is not defined

In [174]:
bs_res = Mlogit.bootstrap(n_iter=5)

  0%|          | 0/5 [00:00<?, ?it/s]

Observations in bootstrap sample 835.


3it [00:00,  7.29it/s]
3it [00:00,  7.98it/s]
3it [00:00,  7.86it/s]
3it [00:00,  7.06it/s]
3it [00:00,  7.58it/s]
100%|██████████| 5/5 [00:12<00:00,  2.44s/it]


Boostraping Done. Calculating 0.05-0.95 confidence interval...
Time to calculate:  0.057392120361328125


In [176]:
bs_res.sort_values("Australian Greens", ascending=False)

Unnamed: 0,word,level_1,Australian Greens,Australian Labor Party,Liberal Party of Australia,bootstrap_iteration
2235,climate,0.95,0.2848,-0.0674,-0.1646,3.8
2299,coal,0.95,0.2744,-0.1222,-0.0906,3.8
1963,case,0.95,0.2362,0.0536,-0.0148,3.8
3881,especially,0.95,0.2332,0.0198,0.0638,3.8
2234,climate,0.05,0.2294,-0.1514,-0.1890,0.2
...,...,...,...,...,...,...
16274,support,0.05,-0.1814,-0.1076,0.1516,0.2
2422,community,0.05,-0.1896,-0.0596,0.1258,0.2
736,auspol,0.05,-0.1976,0.2162,-0.1290,0.2
3324,disaster,0.05,-0.2076,0.2968,-0.1494,0.2
