## Table of Contents

* [Chapter 1](#chapter1): Preprocessing
* [Chapter 2](#chapter2): Topic Modelling
    * [Section 2.1](#section_2_1): LDA
    * [Section 2.2](#section_2_2): NMF
    * [Section 2.3](#section_2_3): Word Embeddings
* [Chapter 3](#chapter3): Visualization Preparation
    * [Section 3.1](#section_3_1): Dimensionality Reduction
    * [Section 3.2](#section_3_2): Network Analysis

## Chapter 1: <a class="anchor" id="chapter1"></a> Preprocessing

In [1]:
#Performing required installations
#nltk.download('stopwords')
#nltk.download('punkt')

#pip install spacy

In [2]:
#Importing libraries
#Data processing
import pandas as pd
import numpy as np
import re
import ast

#Natural language processing
from nltk.corpus import stopwords
import string as st
import spacy

#Topic modelling
import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans

#Dimensionality reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

#Network analysis
from pyvis.network import Network
import itertools

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

#Other
import os
import warnings
import time
from tqdm import tqdm
tqdm.pandas()

In [3]:
#Suppressing warnings
#NOTE: no category is given, so every warning category is silenced for the rest of the session
warnings.simplefilter(action = "ignore")

In [4]:
#Reading in CSVs
#Paths are relative to the directory two levels above the current working directory
os.chdir("..")
os.chdir("..")
try:
    df = pd.read_csv("Outputs/Articles/df_final.csv", index_col = 0, parse_dates = ["pubtime", "pubday", "pubmonth"])
    df_preproc = pd.read_csv("Outputs/Articles/df_final_preprocessed.csv", index_col = 0, parse_dates = ["pubtime", "pubday", "pubmonth"])

    df_entities = pd.read_csv("Inputs/Articles/entities.csv")
    df_key_entities = pd.read_csv("Inputs/Articles/key_entities_large.csv", index_col = 0)
    df_key_entities_small = pd.read_csv("Inputs/Articles/key_entities.csv", index_col = 0)
finally:
    #Always returning to the notebook folder, so a failed read does not leave the working directory shifted
    os.chdir("Notebooks/Articles")

In [5]:
#Collecting the distinct entity names in order of first appearance
entities = list(pd.unique(df_entities["designed_entity"]))
entities

['Ueli_Maurer',
 'Guy_Parmelin',
 'Ignazio_Cassis',
 'Karin_Keller_Sutter',
 'Simonetta_Sommaruga',
 'Alain_Berset',
 'Viola_Amherd',
 'Bundesrat',
 'Tanja_Stadler',
 'Marcel_Tanner',
 'Martin_Ackermann',
 'Matthias_Egger',
 'Taskforce',
 'Christoph_Berger',
 'EKIF',
 'Stefan_Kuster',
 'Pascal_Strupler',
 'Virginie_Masserey',
 'Anne_Levy',
 'Patrick_Mathys',
 'Marcel_Salathe',
 'Daniel_Koch',
 'BAG',
 'Swissmedic',
 'Lukas_Engelberger',
 'GDK',
 'SVP',
 'SP',
 'FDP',
 'Die_Mitte',
 'Die_Gruene',
 'Gruenliberale',
 'Juso',
 'Befuerworter',
 'Ja_Lager',
 'Gegner',
 'Leugner',
 'Skeptiker',
 'Kritiker',
 'Opposition',
 'Nein_Lager',
 'Demonstranten',
 'Freunde_Der_Verfassung',
 'Mass_Voll']

In [7]:
#Collecting the key entities as a plain list
key_entities_small = [*df_key_entities_small["key_entities"]]
key_entities_small

['Ueli_Maurer',
 'Alain_Berset',
 'Bundesrat',
 'Taskforce',
 'BAG',
 'Daniel_Koch',
 'Tanja_Stadler',
 'Christoph_Berger',
 'Lukas_Engelberger',
 'Patrick_Mathys',
 'SVP',
 'SP',
 'FDP',
 'Die_Mitte']

In [8]:
#Instantiating nlp
#Loading the German spaCy model with the tagger, parser and ner components disabled
nlp = spacy.load("de_core_news_md", disable = ["tagger", "parser", "ner"])

In [9]:
#Defining function to lemmatize tokens
def lemmatize(tokens):
    """Return one lemma per input token, using the global spaCy pipeline ``nlp``.

    Each token is passed through ``nlp`` on its own and the lemma of the first
    spaCy token of the resulting doc is kept.
    """
    return [nlp(token)[0].lemma_ for token in tokens]

In [10]:
#Defining function to remove entity from tokens
def remove_entity(tokens, entity):
    """Drop every token that contains a part of the entity name.

    Args:
        tokens: Iterable of token strings.
        entity: Entity name whose parts are separated by underscores,
            e.g. "alain_berset".

    Returns:
        A new list with the tokens that contain none of the name parts.
        Matching is a case-sensitive, literal substring test, so characters
        such as "." or "+" in a name are not interpreted as regex syntax.
    """
    #Skipping empty parts (e.g. from a doubled underscore), which would otherwise match every token
    names = [name for name in entity.split("_") if name]
    return [token for token in tokens if not any(name in token for name in names)]

In [11]:
#Defining preprocessing function
def preprocess(df):
    """Tokenize, clean and lemmatize the passages of ``df``.

    Two columns are added to ``df`` in place and the same object is returned:
    ``passage_tokens`` holds the whitespace tokens of ``original_passage``
    without German stopwords, punctuation tokens, "[NEG_ENT]" markers and
    tokens matching the row's lower-cased ``entity_name``;
    ``passage_tokens_lemmatized`` holds the lemmas of those tokens.
    All comparisons are case-sensitive.
    """
    def keep_tokens(is_kept):
        #Filtering every token list of the column with the given predicate
        return df["passage_tokens"].progress_apply(lambda tokens: [token for token in tokens if is_kept(token)])

    #Tokenizing on whitespace
    df["passage_tokens"] = df["original_passage"]
    df["passage_tokens"] = df["passage_tokens"].progress_apply(lambda passage: passage.split())

    #Removing stopwords
    german_stopwords = set(stopwords.words("german"))
    df["passage_tokens"] = keep_tokens(lambda token: token not in german_stopwords)

    #Removing tokens that consist of a single punctuation character
    punctuation_marks = list(st.punctuation)
    df["passage_tokens"] = keep_tokens(lambda token: token not in punctuation_marks)

    #Removing [NEG_ENT] tokens
    df["passage_tokens"] = keep_tokens(lambda token: token != "[NEG_ENT]")

    #Removing entity names
    df["passage_tokens"] = df.progress_apply(
        lambda row: remove_entity(row["passage_tokens"], row["entity_name"].lower()),
        axis = 1)

    #Lemmatizing
    df["passage_tokens_lemmatized"] = df["passage_tokens"].progress_apply(lambda tokens: lemmatize(tokens))

    return df

In [12]:
#Dropping duplicates
#df_unique = df.drop_duplicates(subset = ["sentence_ABSA", "entity_name"])

In [13]:
#Preprocessing
#df_preproc = preprocess(df_unique)

In [14]:
#Saving to CSV
#os.chdir("..")
#os.chdir("..")
#df_preproc.to_csv("Outputs/Articles/df_final_preprocessed.csv")
#os.chdir("Notebooks/Articles")

In [15]:
#Filtering to negative statements only
#Taking an explicit copy because a column of this frame is reassigned afterwards
df_filtered = df_preproc[df_preproc["sentiment"] == -1].copy()

In [16]:
#Setting tokens to list
#The CSV stores the token lists as strings; values that are already parsed are left alone so the cell can be re-run
df_filtered["passage_tokens_lemmatized"] = df_filtered["passage_tokens_lemmatized"].apply(
    lambda tokens: ast.literal_eval(tokens) if isinstance(tokens, str) else tokens)

## Chapter 2: <a class="anchor" id="chapter2"></a> Topic Modelling

### Section 2.1: <a class="anchor" id="section_2_1"></a> LDA

In [17]:
#Defining function to identify topics of criticism for given entity
def get_topics_of_criticism_lda(entity, num_topics, no_above_dict, tfidf, alpha, eta, decay):
    """Fit an LDA model on the passages of ``df_filtered`` that belong to one entity.

    Args:
        entity: Value of ``entity_name`` used to filter the global ``df_filtered``.
        num_topics: Number of topics to fit.
        no_above_dict: ``no_above`` value passed to ``Dictionary.filter_extremes``.
        tfidf: Boolean; if true the model is fit on the TF-IDF weighted corpus,
            otherwise on the bag-of-words counts.
        alpha: Passed through to ``LdaMulticore``.
        eta: Passed through to ``LdaMulticore``.
        decay: Passed through to ``LdaMulticore``.

    Returns:
        Tuple ``(lda_model, corpus_final, perplexity, coherence)``, where
        ``perplexity`` is the value returned by ``log_perplexity`` on the
        training corpus and ``coherence`` is the "c_v" coherence score.
    """
    #Filtering by entity
    data = df_filtered[df_filtered["entity_name"] == entity]
    texts = data["passage_tokens_lemmatized"]

    #Creating dictionary
    dictionary = gensim.corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below = 10, no_above = no_above_dict)

    #Creating Doc2Bow corpus
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    #Generating final corpus; the TF-IDF model is only fit when it is actually used
    if tfidf:
        corpus_final = models.TfidfModel(corpus)[corpus]
    else:
        corpus_final = corpus

    #Fitting LDA model on the final corpus
    lda_model = gensim.models.LdaMulticore(corpus_final,
                                           num_topics = num_topics,
                                           id2word = dictionary,
                                           alpha = alpha,
                                           eta = eta,
                                           decay = decay,
                                           passes = 5,
                                           workers = 5)

    #Calculating perplexity
    perplexity = lda_model.log_perplexity(corpus_final)

    #Calculating coherence
    coherence_model = CoherenceModel(model = lda_model,
                                     texts = texts,
                                     dictionary = dictionary,
                                     coherence = "c_v")
    coherence = coherence_model.get_coherence()

    return lda_model, corpus_final, perplexity, coherence

In [18]:
#Defining function to save topics
def generate_lda_topics(num_topics, no_above_dict, tfidf, alpha, eta, decay, entities):
    """Fit one LDA model per entity and collect the top words of every topic.

    Args:
        num_topics: Number of topics per model.
        no_above_dict, tfidf, alpha, eta, decay: Passed through to
            ``get_topics_of_criticism_lda``.
        entities: Iterable of entity names.

    Returns:
        Tuple ``(topics_df, models, corpora, avg_perplexity, avg_coherence)``.
        ``topics_df`` has one row per topic and one column per entity holding
        the list of top words; the column is 0 for entities whose model could
        not be fit. ``models`` and ``corpora`` only contain entries for the
        entities that succeeded, so they are not aligned with ``entities``
        when a fit fails. The averages are NaN if no model could be fit.
    """
    topics_df = pd.DataFrame(index = list(range(num_topics)))
    fitted_models = []
    fitted_corpora = []
    perplexities = []
    coherences = []

    for entity in entities:
        try:
            model, corpus, perplexity, coherence = get_topics_of_criticism_lda(entity,
                                                                               num_topics,
                                                                               no_above_dict,
                                                                               tfidf,
                                                                               alpha,
                                                                               eta,
                                                                               decay)
            #show_topic returns (word, probability) pairs ordered by probability
            all_words = [[word for word, _ in model.show_topic(index)] for index in range(num_topics)]
            topics_df[entity] = all_words
        except Exception as error:
            #Best effort: reporting the failure and marking the entity instead of aborting the whole run
            print(f"LDA topics could not be generated for {entity}: {error!r}")
            topics_df[entity] = 0
            continue

        fitted_models.append(model)
        fitted_corpora.append(corpus)
        perplexities.append(perplexity)
        coherences.append(coherence)

    avg_perplexity = np.mean(perplexities) if perplexities else np.nan
    avg_coherence = np.mean(coherences) if coherences else np.nan

    return topics_df, fitted_models, fitted_corpora, avg_perplexity, avg_coherence

In [19]:
#Getting LDA topics for the key entities
#NOTE(review): gensim documents decay as a value in (0.5, 1] - confirm that 0 is intended
lda_topics_df, lda_models, lda_corpora, perplexity_lda, coherence_lda = generate_lda_topics(
    num_topics = 3,
    no_above_dict = 0.6,
    tfidf = True,
    alpha = "symmetric",
    eta = "auto",
    decay = 0,
    entities = key_entities_small)

In [22]:
#Printing the averaged evaluation metrics, rounded to one decimal
print(f"Perplexity:  {round(perplexity_lda, 1)}")
print(f"\nCoherence:  {round(coherence_lda, 1)}")

Perplexity:  -5.0

Coherence:  0.3


### Section 2.2: <a class="anchor" id="section_2_2"></a> NMF

In [23]:
#Defining function to identify topics of criticism for given entity
def get_topics_of_criticism_nmf(entity, num_topics, tfidf, alpha, l1_ratio):
    """Fit an NMF model on the passages of ``df_filtered`` that belong to one entity.

    Args:
        entity: Value of ``entity_name`` used to filter the global ``df_filtered``.
        num_topics: Number of NMF components.
        tfidf: Boolean; if true the passages are TF-IDF vectorized, otherwise
            count vectorized.
        alpha: Passed to ``NMF`` as ``alpha``.
        l1_ratio: Passed to ``NMF`` as ``l1_ratio``.

    Returns:
        DataFrame with one row per component and one column per vocabulary
        word, holding the fitted ``components_`` weights.
    """
    #Filtering by entity
    data = df_filtered[df_filtered["entity_name"] == entity]

    #Joining the token lists back into strings once, as the vectorizers work on text
    documents = data["passage_tokens_lemmatized"].apply(" ".join)

    #Vectorizing corpus with the requested vectorizer only
    vectorizer = TfidfVectorizer() if tfidf else CountVectorizer()
    passage_matrix = vectorizer.fit_transform(documents)

    #Fitting NMF model
    #NOTE(review): newer scikit-learn releases replace alpha with alpha_W / alpha_H - confirm against the installed version
    nmf_model = NMF(n_components = num_topics,
                    alpha = alpha,
                    l1_ratio = l1_ratio,
                    random_state = 1)
    nmf_model.fit(passage_matrix)

    #Preferring get_feature_names_out where it exists and falling back to get_feature_names otherwise
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return pd.DataFrame(nmf_model.components_, columns = feature_names)

In [24]:
#Defining function to save topics
def generate_nmf_topics(num_topics_dict, tfidf, alpha_dict, l1_dict, entities):
    """Fit one NMF model per entity and collect the top 15 words of every topic.

    Args:
        num_topics_dict: Number of topics per entity name.
        tfidf: Passed through to ``get_topics_of_criticism_nmf``.
        alpha_dict: NMF ``alpha`` per entity name.
        l1_dict: NMF ``l1_ratio`` per entity name.
        entities: Iterable of entity names. An entity missing from any of the
            three dictionaries uses alpha 0.1, l1_ratio 0.1 and 3 topics.

    Returns:
        Tuple ``(topics_df, components_dfs)``. ``topics_df`` has one row per
        topic index and one column per entity with the comma-separated top
        words (0 for entities whose model could not be fit).
        ``components_dfs`` has one entry per entity, in order: the component
        DataFrame, or 0 as placeholder when the fit failed.
    """
    #default = 0 keeps an empty parameter dictionary from raising; rows are added on assignment when needed
    topics_df = pd.DataFrame(index = list(range(max(num_topics_dict.values(), default = 0))))
    components_dfs = []

    for entity in entities:
        try:
            alpha = alpha_dict[entity]
            l1_ratio = l1_dict[entity]
            num_topics = num_topics_dict[entity]
        except (KeyError, TypeError):
            #All-or-nothing fallback: one missing setting resets all three to the defaults
            alpha = 0.1
            l1_ratio = 0.1
            num_topics = 3

        try:
            components_df = get_topics_of_criticism_nmf(entity,
                                                        num_topics,
                                                        tfidf,
                                                        alpha,
                                                        l1_ratio)
            for index in range(num_topics):
                words = components_df.iloc[index].nlargest(15).index
                topics_df.at[index, entity] = ", ".join(words)
        except Exception as error:
            #Best effort: reporting the failure and keeping the placeholder so the result stays aligned with entities
            print(f"NMF topics could not be generated for {entity}: {error!r}")
            topics_df[entity] = 0
            components_dfs.append(0)
        else:
            components_dfs.append(components_df)

    return topics_df, components_dfs

In [25]:
#Setting parameters
#One row per entity: (alpha, l1_ratio, number of topics)
nmf_settings = {
    "Ueli_Maurer": (0.1, 0.6, 5),
    "Alain_Berset": (0.1, 0.6, 7),
    "Bundesrat": (0.075, 0.7, 4),
    "Taskforce": (0.1, 0.1, 5),
    "BAG": (0.2, 0.5, 3),
    "Daniel_Koch": (0.2, 0.4, 4),
}
alpha_dict = {name: settings[0] for name, settings in nmf_settings.items()}
l1_dict = {name: settings[1] for name, settings in nmf_settings.items()}
num_topics_dict = {name: settings[2] for name, settings in nmf_settings.items()}

In [26]:
#Getting NMF topics and component weights for the key entities
nmf_topics_df, nmf_components_dfs = generate_nmf_topics(
    num_topics_dict = num_topics_dict,
    tfidf = True,
    alpha_dict = alpha_dict,
    l1_dict = l1_dict,
    entities = key_entities_small)

In [29]:
#Reading in CSV
#os.chdir("..")
#os.chdir("..")
#nmf_labelled_topics_df = pd.read_csv("Outputs/Articles/Topic Models/nmf.csv", index_col = 0)
#os.chdir("Notebooks/Articles")

In [30]:
#Creating a renamed copy in which every entity column gets the suffix "_words"
nmf_labelled_topics_df = nmf_topics_df.rename(columns = lambda name: name + "_words")

In [31]:
#Adding columns with topic
#The columns start empty and use object dtype, so that text labels can be written into them later without a dtype conflict
for column in [x.replace("_words", "_topic") for x in nmf_labelled_topics_df.columns]:
    nmf_labelled_topics_df[column] = pd.Series(np.nan, index = nmf_labelled_topics_df.index, dtype = object)

In [None]:
#Manually checking topics
#Every topic that has words is shown together with its current label; the typed label replaces it ("n/a" stores NaN)
num_topics = max(num_topics_dict.values())
for entity in key_entities_small:
    words_column = entity + "_words"
    topic_column = entity + "_topic"
    print(f"Entity {entity}:")
    print("\n")
    for topic in range(num_topics):
        #Skipping topic slots this entity does not have
        if pd.isnull(nmf_labelled_topics_df.loc[topic, words_column]):
            continue
        print(f"Topic number {topic}:")
        print(f"Words: {nmf_labelled_topics_df.loc[topic, words_column]}")
        print(f"Topic: {nmf_labelled_topics_df.loc[topic, topic_column]}")
        topic_label = input("Topic: ")
        if topic_label == "n/a":
            topic_label = np.nan
        nmf_labelled_topics_df.loc[topic, topic_column] = topic_label
    print("\n---\n")

In [50]:
#Saving to CSV
os.chdir("..")
os.chdir("..")
try:
    nmf_labelled_topics_df.to_csv("Outputs/Articles/Topic Models/nmf.csv")
finally:
    #Always returning to the notebook folder, even if writing fails
    os.chdir("Notebooks/Articles")

### Section 2.3: <a class="anchor" id="section_2_3"></a> Word Embeddings

In [34]:
#Instantiating nlp
#Same model and disabled components as the nlp object created in Chapter 1
nlp = spacy.load("de_core_news_md", disable = ["tagger", "parser", "ner"])

In [35]:
#Defining function to return Spacy vectors
def get_spacy_vectors(tokens):
    """Return the spaCy vectors of the given tokens as a 2-D array.

    Tokens are split on underscores, joined with spaces and passed through
    the global ``nlp`` pipeline.

    Args:
        tokens: Iterable of token strings.

    Returns:
        Array of shape ``(len(doc), 300)`` with one row per spaCy token. A row
        is zero if its vector could not be stored. A single zero row of shape
        ``(1, 300)`` is returned when the pipeline fails or produces no
        tokens, so that a subsequent ``mean(axis = 0)`` stays finite.
    """
    dim_x = 300
    words = [part for token in tokens for part in token.split("_")]

    try:
        doc = nlp(" ".join(words))
    except Exception:
        #Best effort: an unprocessable passage is represented by the zero vector
        return np.zeros((1, dim_x))

    #An empty token list gives an empty doc, whose mean would be NaN
    if len(doc) == 0:
        return np.zeros((1, dim_x))

    vectors = np.zeros((len(doc), dim_x))
    for index, token in enumerate(doc):
        try:
            vectors[index] = token.vector
        except Exception:
            #Keeping the zero row, e.g. when the vector does not have dim_x entries
            vectors[index] = 0
    return vectors

In [36]:
#Creating embedding dataframe
#Working on a copy so that the columns added below do not end up in df_filtered
df_embed = df_filtered.copy()

In [37]:
#Getting one vector per passage: the mean over the Spacy vectors of its tokens
passage_tokens = df_embed["passage_tokens_lemmatized"]
df_embed["passage_spacy_vectors"] = passage_tokens.progress_apply(
    lambda tokens: get_spacy_vectors(tokens).mean(axis = 0))

100%|████████████████████████████████████| 15886/15886 [00:40<00:00, 388.95it/s]


In [38]:
#Defining function to identify topics of criticism for given entity
def get_topics_of_criticism_kmeans(entity, num_clusters, tfidf):
    """Cluster the passage vectors of one entity and describe each cluster by its top words.

    The cluster assignment is written to the ``cluster`` column of the global
    ``df_embed``; that column is reset to NaN for all rows on every call, so
    it only ever holds the clusters of the most recent entity.

    Args:
        entity: Value of ``entity_name`` used to filter the global ``df_embed``.
        num_clusters: Number of KMeans clusters.
        tfidf: Boolean; if true words are ranked by summed TF-IDF weight,
            otherwise by summed count.

    Returns:
        List with one entry per cluster, each a list of up to 15 words in
        descending order of weight.
    """
    #Filtering by entity
    data = df_embed[df_embed["entity_name"] == entity]

    #Fitting KMeans
    km = KMeans(
        n_clusters = num_clusters,
        init = "random",
        n_init = 10,
        max_iter = 300,
        random_state = 1)
    clusters = km.fit_predict(data["passage_spacy_vectors"].to_list())

    #Assigning clusters
    df_embed["cluster"] = np.nan
    df_embed.loc[data.index, "cluster"] = clusters

    #Both variants share the same ranking logic and only differ in the vectorizer
    vectorizer = TfidfVectorizer() if tfidf else CountVectorizer()

    all_topics = []
    for cluster_id in range(num_clusters):
        in_cluster = (df_embed["entity_name"] == entity) & (df_embed["cluster"] == cluster_id)
        documents = df_embed.loc[in_cluster, "passage_tokens_lemmatized"].apply(" ".join)

        #Summing the weight of every vocabulary word over all passages of the cluster
        totals = np.asarray(vectorizer.fit_transform(documents).sum(axis = 0)).ravel()

        #Stable sort, so words with equal weight keep their vocabulary order
        ranked = sorted(vectorizer.vocabulary_.items(),
                        key = lambda item: totals[item[1]],
                        reverse = True)
        all_topics.append([word for word, _ in ranked[:15]])

    return all_topics

In [39]:
#Defining function to save topics
def generate_kmeans_topics(num_clusters, tfidf, entities):
    """Collect the KMeans topic words of every entity in one DataFrame.

    Args:
        num_clusters: Number of clusters per entity.
        tfidf: Passed through to ``get_topics_of_criticism_kmeans``.
        entities: Iterable of entity names.

    Returns:
        DataFrame with one row per cluster and one column per entity holding
        the list of top words; the column is 0 for entities that failed.
    """
    topics_df = pd.DataFrame(index = list(range(num_clusters)))
    for entity in entities:
        try:
            topics_df[entity] = get_topics_of_criticism_kmeans(entity,
                                                               num_clusters,
                                                               tfidf)
        except Exception as error:
            #Best effort: reporting the failure and marking the entity instead of aborting the whole run
            print(f"KMeans topics could not be generated for {entity}: {error!r}")
            topics_df[entity] = 0
    return topics_df

In [40]:
#Getting KMeans topics for the key entities
kmeans_topics_df = generate_kmeans_topics(
    num_clusters = 3,
    tfidf = True,
    entities = key_entities_small)

## Chapter 3: <a class="anchor" id="chapter3"></a> Visualization Preparation

### Section 3.1: <a class="anchor" id="section_3_1"></a> Dimensionality Reduction

In [43]:
#Defining function to restructure NMF dataframe, which contains all words and how related they are to a certain topic
def restructure_df(component_df, nmf_labelled_topics_df, entity):
    """Turn a topic-by-word component DataFrame into one row per word.

    Works for any number of topics; every row of ``component_df`` becomes a
    ``topic_<row label>`` column.

    Args:
        component_df: DataFrame with one row per topic and one column per word.
        nmf_labelled_topics_df: DataFrame whose ``<entity>_topic`` column holds
            the topic labels in topic order.
        entity: Entity name used to pick the label column.

    Returns:
        DataFrame with the columns ``word``, one ``topic_<i>`` weight column
        per topic, ``topic_index`` (position of the highest weight),
        ``word_weight`` (that highest weight) and ``topic`` (its label).
    """
    topic_labels = list(nmf_labelled_topics_df[entity + "_topic"])
    topic_columns = {topic: f"topic_{topic}" for topic in component_df.index}

    df_restructured = component_df.T.reset_index().rename(columns = {"index": "word", **topic_columns})

    #Using all topic columns, not just the first three
    weights = df_restructured[list(topic_columns.values())].to_numpy()
    df_restructured["topic_index"] = weights.argmax(axis = 1)
    df_restructured["word_weight"] = weights.max(axis = 1)
    df_restructured["topic"] = df_restructured["topic_index"].apply(lambda index: topic_labels[index])
    return df_restructured

In [44]:
#Defining function to fit PCA or TSNE on NMF dataframe to project topics onto two dimensions
def fit_dim_red(df_restructured, pca, entity):
    """Project the topic weights of every word onto two dimensions.

    Args:
        df_restructured: DataFrame with a ``word`` column, a ``topic`` column
            and weight columns named ``topic_<number>``.
        pca: If true PCA is used, otherwise TSNE.
        entity: Entity name stored in the ``entity`` column of the result.

    Returns:
        DataFrame with the columns ``component_1``, ``component_2``,
        ``entity``, ``word``, ``word_weight`` (when the input has it) and
        ``topic``.
    """
    #Using every topic weight column instead of a fixed range of three
    topic_columns = [column for column in df_restructured.columns
                     if isinstance(column, str) and re.fullmatch(r"topic_\d+", column)]

    #Fitting PCA or TSNE for dimensionality reduction into 2 dimensions
    if pca:
        reducer = PCA(n_components = 2, random_state = 1)
    else:
        reducer = TSNE(n_components = 2, random_state = 1)
    components = reducer.fit_transform(df_restructured[topic_columns])

    #Creating dataframe
    df_reduced = pd.DataFrame(data = components,
                              columns = ["component_1",
                                         "component_2"])
    df_reduced["entity"] = entity
    df_reduced["word"] = df_restructured["word"]
    if "word_weight" in df_restructured.columns:
        df_reduced["word_weight"] = df_restructured["word_weight"]
    df_reduced["topic"] = df_restructured["topic"]

    return df_reduced

In [45]:
#Restructuring and reducing the NMF components of every entity and saving the results
os.chdir("..")
os.chdir("..")
try:
    for entity, component_df in zip(key_entities_small, nmf_components_dfs):
        #Skipping entities without fitted components (generate_nmf_topics stores 0 as placeholder)
        if not isinstance(component_df, pd.DataFrame):
            continue

        #Calculate restructured and reduced dataframes
        df_restructured = restructure_df(component_df, nmf_labelled_topics_df, entity)
        df_reduced = fit_dim_red(df_restructured, False, entity)

        #Save to CSV
        df_restructured.to_csv(f"Outputs/Articles/Topic Models/Dimensionality Reduction/restructured_{entity}.csv")
        df_reduced.to_csv(f"Outputs/Articles/Topic Models/Dimensionality Reduction/reduced_{entity}.csv")
finally:
    #Always returning to the notebook folder, even if an entity fails
    os.chdir("Notebooks/Articles")

### Section 3.2: <a class="anchor" id="section_3_2"></a> Network Analysis

In [46]:
#Defining function to return the weight of a word in a topic if it is large enough
def check_topic_affiliation(df, word, topic):
    """Return ``df.loc[topic, word]`` if it is greater than 0.05, otherwise 0."""
    weight = df.loc[topic, word]
    return weight if weight > 0.05 else 0

In [47]:
#Defining function to check whether word is closely related to another word
def check_word_affiliation(df, word, topic):
    if df.loc[topic, word] > 0.15:
        return 1
    else:
        return 0

In [48]:
#Defining function to calculate network dataframes, which contain how related all key words are to each other and to each topic
def calculate_network(entity, component_df, labelled_topics_df):
    """Build the word-word and word-topic network tables of one entity.

    Only labelled topics (non-null ``<entity>_topic``) are used, and they are
    addressed by their own index, so an unlabelled topic between two labelled
    ones is skipped rather than shifting the later ones.

    Args:
        entity: Entity name used to pick the ``<entity>_topic`` label column.
        component_df: DataFrame with one row per topic and one column per word.
        labelled_topics_df: DataFrame with the topic labels, indexed by topic.

    Returns:
        Tuple ``(df_words_network, df_word_topic_network)``.
        ``df_words_network`` has one row per pair of key words with the
        columns ``word_1``, ``word_2``, ``word_1_topic_<t>`` and
        ``word_2_topic_<t>`` (1 if the word's weight in topic t is above 0.15),
        ``intersection_topic_<t>`` (1 if both are 1) and
        ``intersection_total`` (number of shared topics).
        ``df_word_topic_network`` has one row per key word with
        ``word_topic_<t>`` holding the word's weight in topic t, or 0 if that
        weight is not above 0.05.
    """
    #Getting key corpus words, i.e. words with a weight above 0.05 in at least one topic
    key_words = [word for word in component_df.columns if (component_df[word] > 0.05).any()]

    #Getting indices of the labelled topics
    topic_labels = labelled_topics_df[entity + "_topic"]
    topics = list(topic_labels[topic_labels.notnull()].index)

    #Creating dataframe with key corpus word pairs
    combos = list(itertools.combinations(key_words, 2))
    df_words_network = pd.DataFrame(combos, columns = ["word_1", "word_2"])

    #Getting affiliation of words with topic, looked up from one precomputed 0/1 table
    strong = (component_df.loc[topics, key_words] > 0.15).astype(int)
    for side in ["word_1", "word_2"]:
        for topic in topics:
            df_words_network[f"{side}_topic_{topic}"] = df_words_network[side].map(strong.loc[topic])

    #Getting topics that both words are related to
    intersection_columns = []
    for topic in topics:
        column = f"intersection_topic_{topic}"
        df_words_network[column] = (df_words_network[f"word_1_topic_{topic}"]
                                    * df_words_network[f"word_2_topic_{topic}"])
        intersection_columns.append(column)

    #Calculating total intersection strength between words (0 when no topic is labelled)
    df_words_network["intersection_total"] = df_words_network[intersection_columns].sum(axis = 1).astype(int)

    #Creating dataframe with key corpus word and topic pairs
    df_word_topic_network = pd.DataFrame(data = {"word": key_words})
    for topic in topics:
        weights = component_df.loc[topic, key_words]
        df_word_topic_network[f"word_topic_{topic}"] = weights.where(weights > 0.05, 0).to_numpy()

    return df_words_network, df_word_topic_network

In [49]:
#Looping through entities
os.chdir("..")
os.chdir("..")
try:
    for entity, component_df in zip(key_entities_small, nmf_components_dfs):
        #Skipping entities without fitted components (generate_nmf_topics stores 0 as placeholder)
        if not isinstance(component_df, pd.DataFrame):
            continue

        #Calculate network dataframes
        df_words_network, df_word_topic_network = calculate_network(entity, component_df, nmf_labelled_topics_df)

        #Save to CSV
        df_words_network.to_csv(f"Outputs/Articles/Topic Models/Network/words_network_{entity}.csv")
        df_word_topic_network.to_csv(f"Outputs/Articles/Topic Models/Network/word_topic_network_{entity}.csv")
        component_df.to_csv(f"Outputs/Articles/Topic Models/Network/component_df_{entity}.csv")
finally:
    #Always returning to the notebook folder, even if an entity fails
    os.chdir("Notebooks/Articles")