# Co-word analysis
- ...

**TODO**
- When `sampling = True`, some runs lead to an error.

In [None]:
import sys
import matplotlib.pyplot as plt
import igraph as ig

from pathlib import Path

# Add the src directory to the Python path.
# sys.path holds strings, so the membership test must use the string form of
# the resolved path; comparing the Path object itself never matches and would
# insert a duplicate entry every time this cell is re-run.
src_path = Path("../") / "src"
if str(src_path.resolve()) not in sys.path:
    sys.path.insert(0, str(src_path.resolve()))

from config import *
from utilities import *
from co_terms import *

In [None]:
# Input parameters
# -----------------------
biblio_project_dir = 'example_project'          # directory for the data and models of your bibliographic project
biblio_input_dir = 'processed'                  # directory containing the input file
biblio_input_file = 'biblio_example_all.csv'    # input file (bibliographic dataset)

output_dir = 'results'                          # directory where you want to save the graph in GraphML format
output_file = 'co_word_example_100.graphml'     # filename of the saved GraphML graph; leave empty if you don't want to save the graph

n_rows = 100                # the maximum number of rows read for each dataset; set to '0' if you want to read all the data
sampling = False            # will randomly sample n_rows from the input bibliographic dataset

keyword_col = 'kws'         # the column with keywords for which to construct the co-word graph
min_count = 4               # words with occurrence count below min_count are not included in the graph
exclude_terms = ['human', 'male', 'female']     # exclude these terms from the graph
# -----------------------

In [None]:
# 1. Read the bibliographic datasets
biblio_df = read_biblio_csv_files_to_df(biblio_project_dir = biblio_project_dir, 
                                        input_dir = biblio_input_dir,
                                        input_files = biblio_input_file,
                                        biblio_source = BiblioSource.BIBLIO,
                                        n_rows = n_rows,
                                        sample = sampling)

# 2. Construct the co-word graph from the keyword column
graph, vertices, pair_counter = create_co_term_graph(biblio_df[keyword_col], 
                                           min_count = min_count,
                                           singularise = True,
                                           synonymise = False,
                                           stem = False,
                                           exclude_terms = exclude_terms)

# 3. Print the word and word pair frequencies

# Convert pair_counter to a DataFrame with one row per pair and its count
pairs_df = pd.DataFrame.from_dict(pair_counter, orient='index', columns=['Count'])

# Reset the index and rename the column
pairs_df = pairs_df.reset_index().rename(columns={'index': 'Pair'})

# Print the word frequencies (most frequent first), then the pair frequencies.
# A plain loop is used because printing is a side effect, not list construction.
for string, count in Counter(vertices).most_common():
    print(f"{string:20} {count}")
print(pairs_df.sort_values(by = 'Count', ascending = False).reset_index(drop = True))

# 4. Plot the co-word graph

# Add the edge labels to the graph based on the count of distinct pairs
# NOTE(review): get_eid raises when the pair has no edge in the graph; this
# presumably assumes every key of pair_counter is an edge - confirm, it may be
# related to the errors seen with `sampling = True`.
for (string1, string2), count in pair_counter.items():
    graph.es[graph.get_eid(string1, string2)]['label'] = count

fig, ax = plt.subplots()

ig.plot(graph, 
        target = ax,
        vertex_size = 0.1,
        vertex_label = graph.vs["name"],
        edge_label=graph.es["label"])

plt.show()

# 5. Save the graph in GraphML format
# An empty output_file means "do not save"; without this guard the write would
# be attempted on the output directory path itself.
if output_file:
    graph.write(get_root_dir() / 'data' / biblio_project_dir / output_dir / output_file, format = 'graphml')

