In [None]:
%load_ext autoreload
%autoreload 2
# networkx has to be networkx-3.0
from utils import *
import pandas as pd
from datetime import datetime
import scipy
import statistics
import matplotlib.pyplot as plt # !pip install matplotlib -U # 3.7.0
import numpy as np
%config InlineBackend.figure_format='retina'
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter
import seaborn as sns
from kneed import KneeLocator
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Load data

In [None]:
# Load the previously processed documents from the pickled DataFrame.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
docs_df = pd.read_pickle('processed_docs/loaded_txt_files.pkl')

In [None]:
# Show only the first three rows instead of dumping the whole frame.
docs_df.head(3) # Excerpt of the loaded documents

# 1. Select country

In [None]:
# Keep only the US documents of category "strategy".
# .copy() ensures that the original df "docs_df" is not affected by any
# further cleaning steps applied to `selection`.
is_us = docs_df['country'] == 'us'
is_strategy = docs_df['category'] == 'strategy'

selection = docs_df.loc[is_us & is_strategy].copy()

In [None]:
# Preview the rows kept by the country/category filter.
selection.head(5)

# 2. Clean and prepare the text

The custom_stop_words list below is iteratively filled based on preliminary results of each country (i.e., each country may have different stopwords).

In [None]:
# Corpus-specific stop words handed to `prepare_text` in the next step.
# NOTE(review): 'head' is listed twice, and entries such as 'futherance',
# 'orginally', 'forgoing' and 'ass' look like misspellings or token artifacts -
# presumably they mirror tokens actually seen in the corpus; confirm before
# "correcting" them.
custom_stop_words = ['use', 'due', 'well', 'however', 
                     'within', 'must',
                     'also', 'since', 'table', 'text',
                     'finally', 'day', 'sometimes', 'issue',
                     'section', 'set', 'used', 'belief', 'thus',
                     'may', 'stated', 'system', 'forth', 'outlined',
                     'including', 'three', 'example', 'some', 'ass',
                     'step', 'take', 'call', 'whether', 'number', 'make',
                     'much', 'shall', 'using', 'data', 'therefore', 'agency', 'yet',
                     'date', 'title', 'subject', 'february', 'nearly', 'chief', 'officer',
                     'secretary', 'head', 'director', 'year', 'annual', 'etc', 'new', 'many', 'little', 
                     'purely', 'would', 'will', 'last', 'today', 'often', 'past', 'already', 'put', 
                     'another', 'simply', 'without', 'widely', 'otherwise', 'one', 'moreover', 'better',
                     'fully', 'could', 'can', 'should', 'upon', 'every','bring', 'written', 'recent', 'mean', 'fit', 
                     'although', 'seeing', 'fill', 'select', 'part', 'turn', 'might', 'likely', 'taken', 
                     'eighth', 'indeed', '1960s', 'five', 'six', 'second', 'annex', 'lastly', 'firstly', 'along', 
                     'million', 'going', 'head', '20year', 'futherance', 'third', 'subsection', 
                     'always', 'forgoing', 'orginally', 'see', 'team', 'forbearing', 'even', 'given', 
                     'making', 'among', 'two', 'unnecessarily', 'necessarily'
                    ] 

In [None]:
%%time
# Depending on the size of the text and number of documents this might take a while
def _prepare_document(raw_text):
    """Run `prepare_text` on a single document.

    `prepare_text` needs a list of texts as input, so the document is wrapped
    in a one-element list before the call.
    """
    return prepare_text([raw_text], custom_stop_words)

selection['prepared_text'] = selection['text'].apply(_prepare_document)

In [None]:
# Set the document's name as index.
# Guarded so the cell can be re-run: after the first run 'file' is no longer a
# column, and an unconditional set_index('file') would raise a KeyError.
if selection.index.name != 'file':
    selection = selection.set_index('file')

In [None]:
# Excerpt of the final data set (now indexed by document name and including
# the `prepared_text` column)
selection.head(5)

# 3. Create co-occurrence matrix

* How often words occur together in a sentence. 
* The concept of a term-context matrix is used, in which each sentence is treated as a context. If two terms (words) occur in the same context (i.e., the same sentence), they are said to co-occur.

In [None]:
# Each entry of `prepared_text` is itself a list (one item per sentence of that
# document), so the column is flattened by one level into a single list
# covering all documents.
# A plain comprehension is used so the cell does not rely on `itertools`
# happening to be in scope through `from utils import *`.
prepared_input = [
    sentence
    for document in selection['prepared_text']
    for sentence in document
]

In [None]:
%%time
# Get nodes and co-occurrence matrix from prepared text
# (`matrix` is later filtered by word label on both rows and columns)
nodes, matrix = create_context_matrix(prepared_input)

In [None]:
# Sanity check: report how many nodes were found and the matrix dimensions.
print(f"Number of nodes: {len(nodes)} - Shape of co-occurrence matrix: {matrix.shape}")

# 4. Pre-cleaning to shrink the data before creating the graph

1. remove some words from the matrix using tf-idf
    
2. remove some co-occurrences (edges) from the matrix

### 4.1 Using tf-idf as a filter to reduce the no. of words by removing words that have lower importance

In [None]:
%%time
# TfidfVectorizer needs the (cleaned) text in a sentence format
# Therefore, another column is created containing the prepared text as sentences
def format_prepared_text(text):
    """
    Turn tokenized sentences into one plain-text string.

    The tokens of each sentence are joined with a single space and the
    sentences are joined with '. ' (no trailing period is appended):

    [[w1, w2], [wa, wb]] -> "w1 w2. wa wb"
    """
    return '. '.join(map(' '.join, text))

# Sentence-formatted version of the prepared text, used as TfidfVectorizer input.
selection['prepared_text_tfidf'] = selection['prepared_text'].apply(format_prepared_text)

In [None]:
# Preview including the new `prepared_text_tfidf` column.
selection.head(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on one sentence-formatted string per document.
vectorizer = TfidfVectorizer()
tfidf_input = selection['prepared_text_tfidf'].values.tolist()
X = vectorizer.fit_transform(tfidf_input)

# Words as rows, documents as columns.
tfidf_df = pd.DataFrame(
    X.toarray().transpose(),
    index=vectorizer.get_feature_names_out(),
    columns=selection.index,
)

# Total tf-idf score of each word across all documents, highest first.
tfidf_df['sum'] = tfidf_df.sum(axis=1)
tfidf_df = tfidf_df.sort_values(by="sum", ascending=False)

#### Define threshold to drop words: 
* `0.25` = the 25th percentile (this is chosen to be most appropriate)
* `0.5` = median
* `0.75` = the 75th percentile

In [None]:
# Words whose summed tf-idf score lies below the 25th percentile are marked
# for removal.
tfidf_cutoff = tfidf_df['sum'].quantile(0.25)
low_score_rows = tfidf_df.loc[tfidf_df['sum'] < tfidf_cutoff]
words_to_remove_tfidf = low_score_rows.index.to_list()

print(f"By applying this filter {len(words_to_remove_tfidf)} nodes would be dropped from {len(nodes)}.")

In [None]:
# Drop the low tf-idf words from both axes of the co-occurrence matrix.
# The tf-idf vocabulary is produced by TfidfVectorizer's own tokenizer, so it
# is not guaranteed to match the matrix labels one-to-one; errors='ignore'
# skips words that are not present in the matrix instead of raising KeyError.
tfidf_filtered_matrix = matrix.drop(index=words_to_remove_tfidf,
                                    columns=words_to_remove_tfidf,
                                    errors='ignore')

In [None]:
# Matrix dimensions after the low tf-idf words were dropped.
tfidf_filtered_matrix.shape

### 4.2 Remove co-occurrences (edges) between words that appear together too few times to be considered important

In [None]:
# Matrix used by the edge-filtering steps below. This is an alias of the
# tf-idf-filtered matrix, not a copy.
selected_matrix = tfidf_filtered_matrix

In [None]:
# Flatten the (nodes x nodes) matrix into one long Series, so that the weight
# of every word combination is represented as a single entry.
flat_weights = selected_matrix.to_numpy(copy=True).flatten()
edge_weights = pd.Series(flat_weights)

In [None]:
# Build the table behind the elbow plot: for every positive edge weight,
# count how many matrix entries carry that weight (0 = no connection).
alt = (
    pd.DataFrame(edge_weights, columns=["edge_weight"])
    .reset_index()
    .loc[lambda frame: frame["edge_weight"] > 0]
    .groupby(["edge_weight"], as_index=False)
    .count()
)
alt.columns = ["edge_weight", "edge_freq"]

In [None]:
# Elbow plot: number of matrix entries (y) observed at each edge weight (x).
fig, ax = plt.subplots(figsize=(5,3))
ax.plot(alt["edge_weight"],alt["edge_freq"])

# Locate the knee of the convex, decreasing weight/frequency curve.
kl = KneeLocator(alt["edge_weight"], alt["edge_freq"], S=1, curve='convex', direction='decreasing')
knee_point = alt[alt["edge_weight"]==kl.knee]

# `knee_point` is empty when `kl.knee` matches no row of `alt` (presumably the
# case when no knee is detected). Only mark and label the knee when it exists,
# instead of failing with an IndexError on `.values[0]`.
if not knee_point.empty:
    ax.plot(knee_point["edge_weight"], knee_point["edge_freq"],marker="o")
    ax.annotate(knee_point["edge_weight"].values[0],
                (knee_point['edge_weight'].values[0],
                 knee_point['edge_freq'].values[0]))
ax.set_ylabel("Edge Frequency")
ax.set_xlabel("Edge Weight")
ax.set_title("Edge Weight vs. Edge Frequency for the US")

# Thousands separators on the y axis. A formatter keeps the labels in sync with
# whatever ticks matplotlib finally chooses; calling set_yticklabels() on
# automatically located ticks freezes the label text and triggers a warning.
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))

plt.tight_layout()

plt.savefig('elbow_US.png', dpi=300)
plt.show()

In [None]:
# Row of `alt` whose edge weight equals the detected knee (empty if none matches).
knee_point

#### Define cut-off threshold and apply shrinking

In [None]:
# Minimum co-occurrence weight an edge needs in order to be kept.
edge_threshold = 5

# Blank out every entry below the threshold, then replace the blanks with 0
# (0 = no connection).
below_threshold = selected_matrix < edge_threshold
shrinked_matrix = selected_matrix.mask(below_threshold).fillna(0)

# 5. Build graph

In [None]:
%%time
# Build the graph from the thresholded co-occurrence matrix.
graph = create_nxgraph(shrinked_matrix)

In [None]:
# Nodes without any connection to other nodes may remain in the graph;
# collect them first, then drop them.
isolated_nodes = list(nx.isolates(graph))
graph.remove_nodes_from(isolated_nodes)

In [None]:
# Final result: attributes of the graph after isolated nodes were removed
get_graph_attributes(graph)

# 6. Create/identify communities via Leiden

In [None]:
# Nodes of `graph_com` carry a 'community' attribute, which the later cells
# use to select and split communities.
graph_com = create_community_graph(graph) 

In [None]:
# Get information about the identified clusters or communities
get_community_summary(graph_com)

In [None]:
# Get the top n words per community (only the first 10 rows are shown)
top_n_words_by_community(graph_com).head(10)

### 6.1 Select the communities large enough for visualization (e.g., at least 20 nodes)

In [None]:
# Communities kept for visualization: the nine communities labelled 0 to 8.
top_community_ids = range(9)
selected_nodes = [
    node
    for node, attrs in graph_com.nodes(data=True)
    if attrs['community'] in top_community_ids
]

# Note: a bare graph_com.subgraph(selected_nodes) stays connected to
# `graph_com`; .copy() turns it into an independent graph.
graph_com_top9 = graph_com.subgraph(selected_nodes).copy()

### 6.2 Get some descriptive analysis about the top n words per community

In [None]:
# Top words per community, restricted to the selected communities.
top_n_words_by_community(graph_com_top9)

# 7. Visualizations
Visualize one selected community at a time via matplotlib, or export it to Gephi.

In [None]:
# Create one graph per selected community (ids 0-8, in that order).
(
    com_one,
    com_two,
    com_three,
    com_four,
    com_five,
    com_six,
    com_seven,
    com_eight,
    com_nine,
) = (graph_per_community(graph_com_top9, community_id) for community_id in range(9))

In [None]:
# Plot the first community (id 0); pass another `com_*` graph to plot a different one.
plot_community_graph(com_one)

In [None]:
# Attributes of the first community's graph.
get_graph_attributes(com_one)

In [None]:
# Export the first community's graph as GML (e.g. for use in Gephi).
graph_to_gml(com_one, 'graph_com_one_US')

End of Notebook 