In [None]:
# Harper Adams University
# Harper Food Innovation (HFI)

# Production title: 'Bibliografix' (V0.1)

# see https://github.com/glados-mcspud/bibliography for 'readme'

# Striving for Open Access! 

# Note: comments may be excessive for established users of python; please note that this is intended to be used 
# in part for teaching and training purposes in addition to being a useful script for researchers/students/librarians and so on. 

In [None]:
# *** Ensure all necessary packages/modules from packages are installed ***
# *** More may be added to future versions as new functions are provided/trialled ***

# Essential/standard/common packages (see PyPi.org)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import glob
import os

# *** For wordclouds and quantitative analyses later ***
from matplotlib.colors import LinearSegmentedColormap  
from wordcloud import WordCloud
from nltk.corpus import stopwords
# Note: the purpose of a corpus is to provide a rich dataset for analysis, research, [...] 
# and training computational models, such as machine learning algorithms for language understanding

import re
from sklearn.feature_extraction.text import TfidfVectorizer
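# Term Frequency (TF): Measures how often a given term appears in a document relative to the total number of terms in that document.
# It reflects the importance of a term within a document.
# Inverse Document Frequency (IDF): Down-weights terms that appear in many documents of the corpus,
# so a high TF-IDF score marks a term that is frequent in one document but uncommon across the others.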

from sklearn.decomposition import PCA, LatentDirichletAllocation
# LDA: A probabilistic method for topic modeling. 
# It identifies hidden topics in a collection of documents by clustering words that frequently appear together.

from sklearn.cluster import KMeans
# KMeans is a popular clustering algorithm for partitioning data into a predefined number of clusters (e.g., grouping similar documents).

import nltk        # For advanced natural language processing tasks.
import spacy       # For sophisticated text processing and entity recognition.
import sklearn     # For statistical analyses, clustering, or topic modeling.

from sklearn.feature_extraction.text import CountVectorizer
# (1) Tokenization: Splits the text into individual words or tokens.
# (2) Vocabulary Building: Creates a dictionary of unique words across the aforementioned corpus (or corpora).
# (3) Feature Matrix Generation: Constructs a sparse matrix (or dense array) where rows correspond to documents and columns correspond to unique words. 
# Each cell contains the count of a word's appearance in the document. This does NOT feature in the current version.

# Standard packages/modules (see PyPi.org)
import plotly.express as px
from collections import Counter
import plotly.express as px
import plotly.io as pio

# Report the Conda environment the kernel runs in (prints None when CONDA_DEFAULT_ENV is not set).
print("Active Conda environment:", os.getenv("CONDA_DEFAULT_ENV"))

In [None]:
# *** Step 1: Collect every Excel export in the working directory named 'search_*.xlsx' ***
# *** (the full user-friendly version is planned to make this selectable) ***
files = glob.glob("search_*.xlsx")
# Stop straight away when nothing matched, instead of failing later on files[0].
if len(files) == 0:
    raise ValueError("No files found with the pattern 'search_*.xlsx'.")

def extract_number(filename):
    """Return the first run of digits in the file's base name as an int.

    Only the base name is inspected, so digits in parent directories are ignored.
    Returns 0 when the base name contains no digit at all.
    """
    digit_runs = re.findall(r"(\d+)", os.path.basename(filename))
    if not digit_runs:
        return 0
    return int(digit_runs[0])

# Process the files in ascending order of the number embedded in their names.
files = sorted(files, key=extract_number)
print("Files to process:", files)

# *** Step 2: Load the first file (with header) ***
# Each workbook is read from a sheet called 'savedrecs_<N>', where <N> is the number in the file name.
first_file = files[0]
first_number = extract_number(first_file)
sheet_name_first = f"savedrecs_{first_number}"
df_first = pd.read_excel(first_file, sheet_name=sheet_name_first)

# *** Step 3: Loop through the remaining files ***
# The frames are collected in a list and concatenated once afterwards; calling pd.concat
# inside the loop would re-copy the growing master DataFrame on every iteration.
frames = [df_first]  # The first file supplies the header for all the others
for file in files[1:]:
    file_number = extract_number(file)
    sheet_name = f"savedrecs_{file_number}"
    # Read the file without its header (skip the first row)
    df_temp = pd.read_excel(file, sheet_name=sheet_name, header=None, skiprows=1)
    # The header of the first file is re-used by position, so the column counts must agree.
    if df_temp.shape[1] != df_first.shape[1]:
        raise ValueError(
            f"'{file}' has {df_temp.shape[1]} columns but '{first_file}' has "
            f"{df_first.shape[1]}; the files cannot be stacked under one header."
        )
    df_temp.columns = df_first.columns
    frames.append(df_temp)

# Build the master DataFrame in a single concatenation (this is a new object, df_first is untouched).
lib_start = pd.concat(frames, ignore_index=True)

# *** Step 4: Drop Empty Columns ***
lib_start = lib_start.dropna(axis=1, how='all')

# *** Step 5: Remove Duplicate Rows ***
lib_start = lib_start.drop_duplicates(ignore_index=True)

# *** Step 6: Prepare the environment for deeper literature analysis ***
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

# *** Step 7: Check for empty rows (rows where all values are NaN) ***
# This only reports such rows; it does not remove them.
empty_rows = lib_start[lib_start.isnull().all(axis=1)]

if empty_rows.empty:
    print("No empty rows found in lib_start. Yay, let's dive in :)")
else:
    print(f"Found {empty_rows.shape[0]} empty row(s):")
    print(empty_rows)

# *** Step 8: Print the full dimensions pre- and post deletion of non-journal entries for inspection ***
print("Dimensions of lib_start dataframe (all entries):", lib_start.shape)
lib_start = lib_start[lib_start["Publication Type"] == "J"].reset_index(drop=True) # remove non-article entries
print("Dimensions of lib_start dataframe (only journal entries):", lib_start.shape) #reprint

# *** [8-A]: It looks like there are some non-English journals, let's check and, if so, remove them ***
# *** Rows where Language equals ('==') "English" will be kept; all others will be discarded ***
lib_start = lib_start[lib_start["Language"] == "English"].reset_index(drop=True)

# *** Verify any removals by printing potentially new dimensions ***
print("Dimensions after filtering non-English journals:", lib_start.shape)

# *** Step 9: Print all headers to make analysis easier later (e.g., we know exactly what datapoints we have to work with)
print("Available headers in 'lib_start':")
list(lib_start.columns)

In [None]:
from matplotlib.colors import LinearSegmentedColormap, Normalize
######VISUALS########
# *** Step 1: Decide the order in which journals appear on the y-axis ***
journal_counts = lib_start["Source Title"].value_counts()

# Journals with several articles come first (largest count on top);
# journals with exactly one article follow in alphabetical order.
more_than_one = journal_counts[journal_counts > 1]
one_pub = journal_counts[journal_counts == 1]
ordered_more = list(more_than_one.sort_values(ascending=False).index)
ordered_one = sorted(one_pub.index)
final_order = ordered_more + ordered_one

# *** Step 2: Colour scale running from light green (fewest articles) to dark green (most) ***
# *** Normalize maps the article counts onto the range [0, 1] expected by the colormap. ***
min_count, max_count = journal_counts.min(), journal_counts.max()
norm = Normalize(vmin=min_count, vmax=max_count)
cmap = LinearSegmentedColormap.from_list("GreenGradient", ["lightgreen", "darkgreen"])

# *** One colour per journal, determined only by its article count, ***
# *** so journals with the same count share the same colour. ***
journal_colors = {}
for journal in final_order:
    journal_colors[journal] = cmap(norm(journal_counts[journal]))

# *** Step 3: Draw the horizontal count plot ***
# hue repeats the y variable purely so that the per-journal palette is applied.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=lib_start,
              y="Source Title",
              order=final_order,
              hue="Source Title",
              palette=journal_colors,
              dodge=False,
              ax=ax)

ax.legend([], [], frameon=False)  # Disable the legend.
ax.set_title("Distribution of Articles by Journal")
ax.set_xlabel("Number of Articles")
ax.set_ylabel("")
# plt.tight_layout()
# plt.savefig("distribution_of_articles.png", format="png", dpi=300, bbox_inches="tight")  # Save as PNG
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Update the overall font size to 14.
# Note: rcParams is global, so this also applies to every figure drawn after this cell.
plt.rcParams.update({'font.size': 14})

# Count how many articles carry each distinct 'WoS Categories' value.
wos_counts = lib_start['WoS Categories'].value_counts()

# Convert the Series into a DataFrame:
df_counts = wos_counts.reset_index()
df_counts.columns = ['Category', 'Count']

# Create a new column "CountLabel" from the count values.
# This ensures that categories with the same count get the same label and hence the same color.
df_counts['CountLabel'] = df_counts['Count'].astype(str)

# Create the (single) figure for this plot, large enough for the long category names.
plt.figure(figsize=(12, 8))

# Plot the count bar plot, using "CountLabel" for hue; this makes sure identical counts have the same color.
sns.barplot(
    x='Count',
    y='Category',
    hue='CountLabel',
    data=df_counts,
    palette="coolwarm",
    dodge=False,
    legend=False
)

plt.xlabel("Number of Articles")
plt.ylabel("")
plt.title("Distribution of Articles by WoS Categories", loc="left")
#plt.tight_layout()
# No further plt.figure(...) call here: opening another figure before plt.show()
# would add an empty figure to the cell output.
plt.show()

In [None]:
# *** Filter rows where "WoS Categories" contains "Dance" (case insensitive) ***
# - that's an interesting result warranting exploration!
dance_rows = lib_start[lib_start["WoS Categories"].str.contains("Dance", case=False, na=False)]

# Print the resulting rows
print(dance_rows)

# Print the abstract of every matching row.
# The row labels come from the filter result instead of a hard-coded number, so the cell
# keeps pointing at the "Dance" records when the input files (and the row numbering) change.
for idx, abstract in dance_rows["Abstract"].items():
    print(f"\nAbstract of row {idx}:")
    print(abstract)

In [None]:
# *** Step 1: Preprocess Abstracts (Remove duplicate words per abstract and filter out short words) ***
# Each word should count at most once per abstract, so the cloud reflects in how many
# abstracts a word occurs rather than how often it is repeated inside one abstract.
abstracts = lib_start["Abstract"].dropna().tolist()
processed_abstracts = []
for abstract in abstracts:
    # Lower-case the text and keep word characters only, so that "Food", "food" and "food,"
    # become the same token before duplicates are removed.
    words = re.findall(r"\w[\w']*", str(abstract).lower())
    # Only keep words with more than four characters
    filtered_words = [word for word in words if len(word) > 4]
    # Remove duplicate words within this abstract; dict.fromkeys keeps the first-seen order,
    # so the processed text is identical from run to run.
    unique_words = list(dict.fromkeys(filtered_words))
    processed_abstracts.append(" ".join(unique_words))

# Combine all processed abstracts into one large text string
combined_text = " ".join(processed_abstracts)

# *** Step 2: Define Stopwords using NLTK's English Stopwords ***
stop_words = set(stopwords.words("english"))

# *** Step 3: Generate the word cloud for abstracts ***
# collocations=False restricts the cloud to single words (no two-word phrases).
wordcloud = WordCloud(width=800,
                      height=400,
                      background_color='white',
                      stopwords=stop_words,
                      max_words=40,
                      contour_width=10,
                      contour_color='darkgrey',
                      collocations=False).generate(combined_text)

# *** Step 4: Display the word cloud ***
plt.figure(figsize=(15, 7.5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt

# *** Step 1: Define search terms/patterns and custom labels ***
# Each pattern is a regular expression that is matched case-insensitively below.
# NOTE(review): the patterns have no word boundaries, so "food" also matches inside "seafood"
# and "feed\w*" also matches "feedback" - confirm that this is intended.
search_patterns = [r"food", r"agri\w*", r"feed\w*", r"nutrit\w*", r"diet\w*"]
custom_labels = ["Food", "Agri*", "Feed*", "Nutrit*", "Diet*"]  # Custom labels corresponding to search patterns

# *** Step 2: Specify the columns to be searched ***
columns_to_search = ["Article Title", "Abstract", "Author Keywords", "Keywords Plus"]

# *** Step 3: Count frequency of each search term across the specified columns ***
# Initialise a dictionary to hold the overall counts for each pattern.
frequency_counts = {pattern: 0 for pattern in search_patterns}

# Initialise a nested dictionary to hold counts per column for each search pattern.
frequency_counts_by_column = {pattern: {col: 0 for col in columns_to_search} for pattern in search_patterns}

# Iterate through each specified column and each pattern using nested 'for' loops to find matches to search words.
# Columns that are absent from lib_start are skipped and keep a count of 0.
for col in columns_to_search:
    if col in lib_start.columns:
        # Replace missing values with empty strings to avoid errors
        col_series = lib_start[col].fillna("")
        for pattern in search_patterns:
            # Every occurrence is counted, not just one per record.
            count = col_series.str.count(pattern, flags=re.IGNORECASE).sum()
            frequency_counts[pattern] += count
            frequency_counts_by_column[pattern][col] = count

# *** Step 4: Print the overall frequency counts for each search term ***
print("Overall Frequency Counts for each Search Term:")
for label, pattern in zip(custom_labels, search_patterns):
    print(f"Label '{label}': {frequency_counts[pattern]}")

# *** Step 5: Visualisation Option 1 - Bar Chart of Overall Frequency Counts ***
# The dictionary keeps the order of search_patterns, so counts line up with custom_labels.
patterns = list(frequency_counts.keys())
counts = [frequency_counts[p] for p in patterns]

plt.figure(figsize=(10, 6))
plt.bar(custom_labels, counts, color='skyblue')
plt.title("Overall Frequency of Each Search Term")
# plt.xlabel("Search Term") 
plt.ylabel("Frequency Count")
plt.show()

# *** Step 6: Visualisation Option 2 - Stacked Bar Chart by Column ***
# Convert the nested dictionary into a DataFrame for easier plotting.
freq_df = pd.DataFrame(frequency_counts_by_column).T  # Rows: search patterns, Columns: search columns

# Replace row index names with custom labels
freq_df.index = custom_labels

# Draw on an explicitly created Axes: DataFrame.plot opens its own figure when no `ax` is given,
# so a separate plt.figure(...) call beforehand would be left behind as an empty figure.
fig, ax = plt.subplots(figsize=(10, 6))
freq_df.plot(kind='bar', stacked=True, ax=ax, colormap=plt.cm.Dark2)

# Rotate x-axis labels diagonally for improved legibility
plt.xticks(rotation=45, ha="right")

plt.title("Frequency of Each Search Term by Column")
# plt.xlabel("Search Term")
plt.ylabel("Frequency Count")
plt.legend(title="Column", bbox_to_anchor=(1.05, 1), loc='upper left')
# plt.tight_layout()
plt.show()

In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import squarify  # For the treemap visualisation
# from matplotlib_venn import venn2  # (Not used in final visualisation)

# *** Step 1: Define search terms/patterns and custom label mapping ***
# Each pattern is a regular expression that is matched case-insensitively below.
search_patterns = [r"food", r"agri\w*", r"feed\w*", r"nutri\w*", r"diet\w*"]

# Define custom labels for each search pattern
custom_label_map = {
    r"food": "Food",
    r"agri\w*": "Agri*",
    r"feed\w*": "Feed*",
    r"nutri\w*": "Nutri*",
    r"diet\w*": "Diet*"
}

# Create sorted lists based on the custom labels (alphabetical order)
sorted_custom = sorted(custom_label_map.items(), key=lambda x: x[1])
sorted_patterns = [item[0] for item in sorted_custom]  # i.e. [r"agri\w*", r"diet\w*", r"feed\w*", r"food", r"nutri\w*"]
sorted_labels   = [item[1] for item in sorted_custom]  # i.e. ["Agri*", "Diet*", "Feed*", "Food", "Nutri*"]

# *** Step 2: Specify the columns to be searched ***
columns_to_search = ["Article Title", "Abstract", "Author Keywords", "Keywords Plus"]

# *** Step 3: Count frequency of each search term across the specified columns ***
# Initialise a dictionary to hold the overall counts for each pattern.
frequency_counts = {pattern: 0 for pattern in search_patterns}
# Initialise a nested dictionary to hold counts per column for each search pattern.
frequency_counts_by_column = {pattern: {col: 0 for col in columns_to_search} for pattern in search_patterns}

# Loop through each specified column and each pattern, counting all occurrences.
# Columns that are absent from lib_start are skipped and keep a count of 0.
for col in columns_to_search:
    if col in lib_start.columns:
        # Replace missing values with empty strings to avoid errors.
        col_series = lib_start[col].fillna("")
        for pattern in search_patterns:
            # Series.str.count treats the pattern as a regex by default.
            count = col_series.str.count(pattern, flags=re.IGNORECASE).sum()
            frequency_counts[pattern] += count
            frequency_counts_by_column[pattern][col] = count

# *** Step 4: Print the overall frequency counts for each search term ***
print("Overall Frequency Counts for each Search Term:")
for pattern in sorted_patterns:
    print(f"Pattern '{custom_label_map[pattern]}': {frequency_counts[pattern]}")

# *** Step 5: Visualisation 1: Stacked bar chart by column ***
# Convert the nested dictionary into a DataFrame and re-order rows based on our sorted_patterns.
freq_df = pd.DataFrame(frequency_counts_by_column).T
freq_df = freq_df.reindex(sorted_patterns)
# Replace the row index names with our custom labels.
freq_df.index = sorted_labels

# Draw on an explicitly created Axes: DataFrame.plot opens its own figure when no `ax` is given,
# so a separate plt.figure(...) call beforehand would be left behind as an empty figure.
fig, ax = plt.subplots(figsize=(10, 6))
freq_df.plot(kind='bar', stacked=True, ax=ax, colormap='Paired')
plt.title("Frequency of Each Search Term per Source")
plt.xlabel("Search Term")
plt.ylabel("Frequency Count")
plt.legend(title="Column", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# *** Step 6: Visualisation Option - Treemap of Overall Search Term Frequencies ***
# Filter out search terms with zero frequency to avoid errors.
treemap_data = [(custom_label_map[pattern], frequency_counts[pattern])
                for pattern in sorted_patterns if frequency_counts[pattern] > 0]

if treemap_data:
    treemap_labels, sizes = zip(*treemap_data)
    # Create labels that show both the custom label and frequency.
    treemap_labels = [f"{label}\n({int(size)})" for label, size in zip(treemap_labels, sizes)]

    plt.figure(figsize=(10, 6))
    # One colour per remaining search term (the list is cut to the number of tiles).
    squarify.plot(sizes=sizes, label=treemap_labels,
                  color=['lightblue', 'lightgreen', 'khaki', 'salmon', 'plum'][:len(sizes)], alpha=0.8)
    plt.title("Treemap of Overall Search Term Frequencies")
    plt.axis('off')
    plt.show()
else:
    print("No nonzero frequencies available for treemap visualisation.")

# *** Step 7: Print indices, authors, article titles, and abstracts for all matching records, including primary matching term ***
# First, create an overall mask that flags any record that matches any of the search patterns in the specified columns.
mask = pd.Series(False, index=lib_start.index)
for col in columns_to_search:
    if col in lib_start.columns:
        for pattern in search_patterns:
            mask |= lib_start[col].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

results = lib_start[mask]

print("\nMatching Records:")
for idx, row in results.iterrows():
    # For each row, count occurrences for each search pattern in the given columns.
    pattern_counts = {}
    for pattern in search_patterns:
        total_count = 0
        for col in columns_to_search:
            # Missing cells are skipped, so they are never searched as the literal text "nan".
            if col in lib_start.columns and pd.notnull(row[col]):
                text = str(row[col])
                total_count += len(re.findall(pattern, text, flags=re.IGNORECASE))
        pattern_counts[pattern] = total_count

    # Determine the primary matching term by choosing the pattern(s) with the highest count.
    # Ties are all reported, separated by commas.
    max_count = max(pattern_counts.values())
    primary_terms = [custom_label_map[pattern] for pattern, count in pattern_counts.items() if count == max_count and count > 0]
    primary_term_str = ", ".join(primary_terms) if primary_terms else "None"

    authors = row["Authors"] if "Authors" in row and pd.notnull(row["Authors"]) else "No Authors Available"
    title = row["Article Title"] if "Article Title" in row and pd.notnull(row["Article Title"]) else "No Title Available"
    abstract = row["Abstract"] if "Abstract" in row and pd.notnull(row["Abstract"]) else "No Abstract Available"

    print(f"Index: {idx}")
    print(f"Authors: {authors}")
    print(f"Article Title: {title}")
    print(f"Abstract: {abstract}")
    print(f"Primary Matching Term: {primary_term_str}")
    print("-" * 80)

In [None]:

# *** Step 1: Pick the documents of interest by their row labels in lib_start ***
selected_indices = [17, 22, 24, 40, 41, 47, 53, 54, 58, 65, 66, 69, 78]
df_subset = lib_start.loc[selected_indices].copy()
# The analysis below works on the 'Abstract' text; a missing abstract is treated as an empty string.
documents = df_subset["Abstract"].fillna("")

# *** Step 2: Turn the abstracts into TF-IDF feature vectors ***
# English stop words are removed; terms occurring in more than 85% of the documents are ignored.
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.85)
X = vectorizer.fit_transform(documents)

# *** Step 3a: Group the documents with KMeans ***
# Following trials, 5 (or 3) clusters seem sensible when working with abstract-level information; 5 are used here.
# There may be a benefit to increasing clusters if scanning entire and multiple documents (!!TBC!!)
num_clusters = 5
kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans_model.fit_predict(X)
# Keep the cluster label next to each document for reference.
df_subset["Cluster"] = cluster_labels

# *** Step 3b: Project the TF-IDF vectors onto two principal components for plotting ***
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Scatter plot of the documents, coloured by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=cluster_labels, palette="deep", s=100, ax=ax)

# Write each document's original row label next to its point (slightly offset).
for (pc1, pc2), idx in zip(X_pca, df_subset.index):
    ax.text(pc1 + 0.01, pc2 + 0.01, str(idx), fontsize=9)

ax.set_title("PCA of Abstracts with KMeans Clusters")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend(title="Cluster")
fig.tight_layout()
plt.show()

# *** Step 4: Topic Modeling with Latent Dirichlet Allocation (LDA) ***
# NOTE(review): LDA is fitted on the TF-IDF matrix X here; LDA is usually applied to raw
# term counts (e.g. CountVectorizer output) - confirm that this is intended.
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42).fit(X)

def display_topics(model, feature_names, num_top_words):
    """Print one line per topic listing its highest-weighted terms.

    model: fitted topic model exposing `components_` (one row of term weights per topic).
    feature_names: the term belonging to each column of `components_`.
    num_top_words: number of highest-weighted terms printed per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading positions.
        heaviest_first = weights.argsort()[::-1]
        top_features = [feature_names[pos] for pos in heaviest_first[:num_top_words]]
        print(f"Topic {topic_idx}: {' '.join(top_features)}")

# Print the ten highest-weighted terms of every LDA topic, labelled with the vectorizer's vocabulary.
feature_names = vectorizer.get_feature_names_out()
num_top_words = 10

print("Extracted Topics:")
display_topics(model=lda, feature_names=feature_names, num_top_words=num_top_words)


In [None]:
from matplotlib.patches import Patch

# *** Step 1: Subset the Data for Selected Indices ***
# Select a subset of document indices from our input data (i.e., the ones identified as focussing on agri-food system LCA/enviro pedagogy).
selected_indices = [17, 22, 24, 40, 41, 47, 53, 54, 58, 65, 66, 69, 78]
df_subset = lib_start.loc[selected_indices].copy()
# For further analyses let's work primarily with the 'Abstract' field as it provides the most relevant prose.
# Missing abstracts are filled with an empty string.
documents = df_subset["Abstract"].fillna("")

# *** Step 2: Text Vectorisation with TF-IDF ***
# Convert our abstracts into numerical feature vectors.
# Remove common English stopwords and ignore terms appearing in more than 85% of documents (this seems to be commonly adopted on StackExchange etc.).
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.85)
X = vectorizer.fit_transform(documents)

# *** Step 3a: Clustering with KMeans ***
# Define the desired number of clusters (here 5) and perform clustering.
num_clusters = 5
kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans_model.fit_predict(X)
df_subset["Cluster"] = cluster_labels

# *** Step 3b: Print Each Extracted Cluster ***
# Now print out each cluster's document indices so these can be used for the image caption.
print("Extracted Clusters (document indices):")
for i in range(num_clusters):
    # Get the document indices for the current cluster.
    cluster_docs = df_subset[df_subset["Cluster"] == i].index.tolist()
    # Add 1 to the cluster label for presentation (i.e. cluster numbering begins at 1).
    print(f"Cluster {i+1}: {', '.join(str(doc) for doc in cluster_docs)}")

# *** Step 3c: Dimensionality Reduction for Visualisation ***
# Reduce our high-dimensional TF-IDF vectors to 2 dimensions with PCA, for visualisation purposes.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Begin plot creation with a fixed figure size.
plt.figure(figsize=(10, 6))
ax = plt.gca()

# Create a scatter plot using Seaborn,
# where each point is coloured according to its cluster.
# hue_order pins cluster i to the i-th colour of the "deep" palette, i.e. to exactly the colour
# the hand-built legend below shows for it, even if some cluster label does not occur.
sns.scatterplot(x=X_pca[:, 0],
                y=X_pca[:, 1],
                hue=cluster_labels,
                hue_order=list(range(num_clusters)),
                palette="deep",
                s=100,
                legend=False,
                ax=ax)

# Annotate each point with its original index.
for i, idx in enumerate(df_subset.index):
    plt.text(X_pca[i, 0] + 0.01, X_pca[i, 1] + 0.01, str(idx), fontsize=9)

plt.title("PCA of Abstracts with KMeans Clusters")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")

# *** Step 3d: Create a Simple Legend ***
# Now create a basic legend showing the cluster number (starting at 1).
# The automatic legend is switched off above because it would show the 0-based labels.
palette = sns.color_palette("deep", num_clusters)
handles = [Patch(facecolor=palette[i], label=f"Cluster {i+1}") for i in range(num_clusters)]
plt.legend(handles=handles,
           loc='upper center',
           bbox_to_anchor=(0.5, -0.1),
           ncol=num_clusters)

plt.tight_layout()
plt.show()

# *** Step 4: Topic Modelling with Latent Dirichlet Allocation (LDA) ***
# Apply LDA to extract latent topics among the selected articles.
# NOTE(review): LDA is fitted on the TF-IDF matrix X here; LDA is usually applied to raw
# term counts (e.g. CountVectorizer output) - confirm that this is intended.
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

def display_topics(model, feature_names, num_top_words):
    """Print one line per topic with its highest-weighted terms of four or more characters.

    The `num_top_words` heaviest terms are selected first and the short ones are dropped
    afterwards, so a topic line can contain fewer than `num_top_words` terms.

    model: fitted topic model exposing `components_` (one row of term weights per topic).
    feature_names: the term belonging to each column of `components_`.
    num_top_words: number of highest-weighted terms considered per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading positions.
        candidates = weights.argsort()[::-1][:num_top_words]
        top_features = []
        for pos in candidates:
            term = feature_names[pos]
            if len(term) >= 4:
                top_features.append(term)
        print(f"Topic {topic_idx}: {' '.join(top_features)}")

# Print the leading terms of every LDA topic, labelled with the vectorizer's vocabulary.
feature_names = vectorizer.get_feature_names_out()
num_top_words = 10

print("Extracted Topics:")
display_topics(model=lda, feature_names=feature_names, num_top_words=num_top_words)

In [None]:
#Cell 15

# Let's try make this more sensible!!

# *** Step 1: Pick the documents of interest (row labels found during the earlier exploration) ***
selected_indices = [17, 22, 24, 40, 41, 47, 53, 54, 58, 65, 66, 69, 78]
df_subset = lib_start.loc[selected_indices].copy()

# One text per document: the title, then ". ", then the abstract.
# A missing title or abstract is treated as an empty string.
title_text = df_subset['Article Title'].fillna('')
abstract_text = df_subset['Abstract'].fillna('')
df_subset['Combined_Text'] = title_text + '. ' + abstract_text

# *** Step 2: Build the document-term matrix with CountVectorizer ***
# (raw counts are the preferred input for LDA)
# English stop words are removed; max_df drops terms found in more than 85% of the documents
# and min_df drops terms found in fewer than two documents. Adjust both if needed.
count_vectorizer = CountVectorizer(stop_words="english", max_df=0.85, min_df=2)
count_data = count_vectorizer.fit_transform(df_subset['Combined_Text'])


# *** Step 3: Topic Modeling with LDA ***

# A modest number of topics suits this targeted narrative.
# 3 is a starting point; adjust it based on domain interpretation.
n_topics = 3
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=10)
# fit_transform returns one row per document with a weight for each topic.
lda_topic_matrix = lda_model.fit_transform(count_data)

# -------------------------------
# Helper: Collect the Top Keywords of Each Topic (returned as lists; nothing is printed)
# -------------------------------
def get_topic_keywords(model, feature_names, no_top_words=10):
    """Return, for every topic, the list of its highest-weighted terms.

    model: fitted topic model exposing `components_` (one row of term weights per topic).
    feature_names: the term belonging to each column of `components_`.
    no_top_words: number of terms kept per topic (default 10).

    The result is a list with one inner list per topic, heaviest term first.
    """
    # argsort is ascending, so each row is reversed before the leading positions are taken.
    return [
        [feature_names[pos] for pos in weights.argsort()[::-1][:no_top_words]]
        for weights in model.components_
    ]

# Top keywords of every topic, labelled with the CountVectorizer vocabulary.
no_top_words = 10
feature_names = count_vectorizer.get_feature_names_out()
topic_keywords = get_topic_keywords(lda_model, feature_names, no_top_words)

# -------------------------------
# Step 4: Annotate Each Document with a Dominant Topic
# -------------------------------
# For each document, determine which topic has the highest probability.
dominant_topics = np.argmax(lda_topic_matrix, axis=1)
max_topic_weights = np.max(lda_topic_matrix, axis=1)

df_subset['Dominant_Topic'] = dominant_topics
df_subset['Topic_Weight'] = max_topic_weights

# -------------------------------
# Step 5: Produce a Legible Summary Table
# -------------------------------
# Build a summary table that reports:
# - Original index (as in the full DataFrame)
# - Article Title (for context)
# - Dominant Topic & Topic Weight
# - Top keywords for the assigned topic
# - A snippet of the abstract for quick human interpretation
summary_columns = ['Article Title', 'Dominant_Topic', 'Topic_Weight', 'Abstract']

summary_table = df_subset[summary_columns].copy()
summary_table['Topic_Keywords'] = df_subset['Dominant_Topic'].apply(
    lambda t: ", ".join(topic_keywords[t])
)
# Abstracts of 250 characters or more are cut to their first 250 characters plus "...".
summary_table['Abstract_Snippet'] = df_subset['Abstract'].fillna("").apply(
    lambda text: text if len(text) < 250 else text[:250] + "..."
)

# For clarity, print a neat summary for each document:
print("Summary of Selected Documents and LDA Topics:\n")
for idx, row in summary_table.iterrows():
    # Use the original DataFrame's index value for clarity.
    print(f"Document (DataFrame Index): {idx}")
    print(f"Article Title    : {row['Article Title']}")
    print(f"Dominant Topic   : {row['Dominant_Topic']} (Weight: {row['Topic_Weight']:.2f})")
    print(f"Topic Keywords   : {row['Topic_Keywords']}")
    print("Abstract Snippet :")
    print(row['Abstract_Snippet'])
    print("-" * 100)

# Optional: Visualize Topic Distribution across these selected documents
plt.figure(figsize=(8, 5))
# hue repeats the x variable so that the palette is applied per topic; passing `palette`
# without `hue` is deprecated in recent seaborn releases. legend=False hides the redundant legend.
sns.countplot(x='Dominant_Topic', hue='Dominant_Topic', data=df_subset, palette="Set2", legend=False)
plt.xlabel("Dominant Topic")
plt.ylabel("Number of Documents")
plt.title("Topic Distribution for Selected Documents")
plt.tight_layout()
plt.show()

# -------------------------------
# Interpretation Guidance (disclaimer: assistance from Co-Pilot here):
# -------------------------------
# The printed summary shows for each document:
#  - Which topic (0, 1, or 2) is most dominant along with the weight (a proxy for confidence).
#  - The top words of that topic provide a cue to what theme it captures.
#
# Given our narrative focus, you might interpret:
#  • A topic whose keywords include terms like "lifecycle", "assessment", "sustain" may represent teaching approaches to LCA.
#  • Another topic with terms like "carbon", "footprint", "environment" may group around sustainability or carbon footprint assessments.
#  • Further interpretation may be refined by reading the abstract snippets.
#
# You can refine this analysis by:
#  - Adjusting the number of topics.
#  - Re-processing the text with additional domain-specific stopwords or lemmatization.
#  - Comparing these results with clustering (e.g., KMeans) for robustness.


In [None]:
#Cell 16

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# -------------------------------
# Step 1: Subset the Data
# -------------------------------
# Index labels of the documents picked out earlier for closer inspection.
selected_indices = [17, 22, 24, 40, 41, 47, 53, 54, 58, 65, 66, 69, 78]

# Take an independent copy of those rows and add a single text field that joins
# the title and the abstract (missing values become empty strings).
df_subset = (
    lib_start
    .loc[selected_indices]
    .copy()
    .assign(
        Combined_Text=lambda frame: (
            frame['Article Title'].fillna('') + '. ' + frame['Abstract'].fillna('')
        )
    )
)

# -------------------------------
# Step 2: Vectorize the Text using CountVectorizer
# -------------------------------
# Build a document-term matrix of raw counts. English stop words are removed;
# max_df=0.85 drops terms found in more than 85% of the documents and
# min_df=2 drops terms found in fewer than two documents.
count_vectorizer = CountVectorizer(
    stop_words="english",
    max_df=0.85,
    min_df=2,
)
count_data = count_vectorizer.fit_transform(df_subset['Combined_Text'])

# -------------------------------
# Step 3: Clustering with KMeans
# -------------------------------
# We choose 5 clusters to reflect 5 topic groupings.
num_clusters = 5
# n_init is set explicitly: scikit-learn changed its default from 10 to "auto"
# in version 1.4, so leaving it implicit gives different clusters on different
# installations (and a FutureWarning on 1.2/1.3). Ten restarts also make the
# result less sensitive to the initial centroids, which matters for a handful
# of documents.
kmeans_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = kmeans_model.fit_predict(count_data)
df_subset["Cluster"] = cluster_labels

# Extract cluster "keywords":
# For each cluster, sort the cluster centre's coefficients in descending order
# and keep the ten terms with the largest values.
order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
feature_names = count_vectorizer.get_feature_names_out()
cluster_keywords = {}
for i in range(num_clusters):
    top_words = [feature_names[ind] for ind in order_centroids[i, :10]]
    cluster_keywords[i] = top_words
    print(f"Cluster {i}: {' '.join(top_words)}")

# Distance from each document to its nearest cluster centre.
# `transform` returns one distance per (document, cluster) pair; the row-wise
# minimum is the distance to the closest centre. Lower values indicate closer membership.
distances = kmeans_model.transform(count_data)  # shape: (# docs, # clusters)
min_distances = np.min(distances, axis=1)
df_subset['Cluster_Distance'] = min_distances

# -------------------------------
# Step 4: Produce a Legible Summary Table
# -------------------------------
# Build a summary table presenting:
# - The original document index
# - Article title for context
# - Assigned cluster and the distance (a proxy for confidence)
# - The cluster's top keywords
# - A snippet of the abstract
summary_columns = ['Article Title', 'Cluster', 'Cluster_Distance', 'Abstract']
summary_table = df_subset[summary_columns].copy()

# Add a column with the cluster keywords (joined by commas).
summary_table['Cluster_Keywords'] = df_subset['Cluster'].apply(
    lambda c: ", ".join(cluster_keywords[c])
)
# Create an abstract snippet: the first 250 characters followed by "...".
# Abstracts of 250 characters or fewer are shown in full WITHOUT the ellipsis,
# so "..." only ever appears when text was actually cut off.
summary_table['Abstract_Snippet'] = df_subset['Abstract'].fillna("").apply(
    lambda text: text if len(text) <= 250 else text[:250] + "..."
)

print("Summary of Selected Documents and KMeans Clusters:\n")
for idx, row in summary_table.iterrows():
    print(f"Document (DataFrame Index): {idx}")
    print(f"Article Title    : {row['Article Title']}")
    print(f"Cluster          : {row['Cluster']} (Distance: {row['Cluster_Distance']:.2f})")
    print(f"Cluster Keywords : {row['Cluster_Keywords']}")
    print("Abstract Snippet :")
    print(row['Abstract_Snippet'])
    print("-" * 100)

# -------------------------------
# Step 5: Visualize Cluster Distribution
# -------------------------------
# Bar chart of the number of selected documents assigned to each cluster,
# drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Cluster', data=df_subset, palette="Set2", ax=ax)
ax.set_xlabel("Cluster")
ax.set_ylabel("Number of Documents")
ax.set_title("Cluster Distribution for Selected Documents")
fig.tight_layout()
plt.show()

# -------------------------------
# Interpretation Guidance:
# -------------------------------
# - Each document is assigned one of 5 clusters by the KMeans algorithm.
# - The printed "Cluster Keywords" for each cluster are derived from the top words
#   in that cluster's centroid; these can help you judge the semantics of each group.
# - The Cluster_Distance is a measure of how close a document is to its cluster center.
# - You can refine this pipeline by adjusting vectorizer parameters, the number of clusters,
#   or even integrating domain-specific stopwords/seed words for guidance.

In [None]:
#Cell 17

# -------------------------------
# Step 1: Prepare the Text from ALL Entries
# -------------------------------
# Work on a new frame derived from the whole of lib_start (not just hand-picked
# rows) and add one text field joining title and abstract; missing values are
# treated as empty strings.
df_all = lib_start.assign(
    Combined_Text=lambda frame: (
        frame['Article Title'].fillna('') + ". " + frame['Abstract'].fillna('')
    )
)

# -------------------------------
# Step 2: Vectorize the Text using CountVectorizer
# -------------------------------
# LDA works on raw term counts, hence CountVectorizer.
# max_df / min_df filter out overly common and overly rare terms.
count_vectorizer = CountVectorizer(
    stop_words="english",
    max_df=0.85,
    min_df=2,
)
count_data = count_vectorizer.fit_transform(df_all['Combined_Text'])

# -------------------------------
# Step 3: Run Topic Modeling with LDA
# -------------------------------
# A modest number of topics (here 3); change to suit your narrative needs.
n_topics = 3
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    max_iter=10,
)
# One row per document, one column per topic.
lda_topic_matrix = lda_model.fit_transform(count_data)

# -------------------------------
# Helper: Extract Top Keywords for Each Topic
# -------------------------------
def get_topic_keywords(model, feature_names, no_top_words=10):
    """Return the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one array of term weights per topic.
    feature_names : sequence
        Term for each column position of the weight arrays.
    no_top_words : int, default 10
        How many terms to keep per topic.

    Returns
    -------
    list of list
        One inner list per topic, terms ordered from largest to smallest weight.
    """
    return [
        [feature_names[term_pos] for term_pos in weights.argsort()[::-1][:no_top_words]]
        for weights in model.components_
    ]

# Top terms per topic, looked up through the vectorizer's vocabulary.
no_top_words = 10
feature_names = count_vectorizer.get_feature_names_out()
topic_keywords = get_topic_keywords(lda_model, feature_names, no_top_words)

# -------------------------------
# Step 4: Annotate Each Document with the Dominant Topic
# -------------------------------
# For each document (row of the topic matrix), take the topic with the highest
# value and record that value as the topic weight.
dominant_topics = np.argmax(lda_topic_matrix, axis=1)
max_topic_weights = np.max(lda_topic_matrix, axis=1)

df_all['Dominant_Topic'] = dominant_topics
df_all['Topic_Weight'] = max_topic_weights

# -------------------------------
# Step 5: Produce a Legible Summary Table
# -------------------------------
# Build a summary table that includes:
# - Document index (from the main DataFrame)
# - Article Title for context
# - Dominant Topic along with its weight (confidence)
# - Top keywords for that topic
# - A snippet of the abstract for a quick read
summary_columns = ['Article Title', 'Dominant_Topic', 'Topic_Weight', 'Abstract']
summary_table = df_all[summary_columns].copy()

# Map each dominant topic to its top keywords, then join as a comma-separated string.
summary_table['Topic_Keywords'] = df_all['Dominant_Topic'].apply(
    lambda t: ", ".join(topic_keywords[t])
)

# Create a snippet for the abstract: the first 250 characters followed by "...".
# Abstracts of 250 characters or fewer are shown in full WITHOUT the ellipsis,
# so "..." only ever appears when text was actually cut off.
summary_table['Abstract_Snippet'] = df_all['Abstract'].fillna("").apply(
    lambda text: text if len(text) <= 250 else text[:250] + "..."
)

# Print out a neat summary for all documents:
print("Summary of All Documents and LDA Topics:\n")
for idx, row in summary_table.iterrows():
    print(f"Document (DataFrame Index): {idx}")
    print(f"Article Title    : {row['Article Title']}")
    print(f"Dominant Topic   : {row['Dominant_Topic']} (Weight: {row['Topic_Weight']:.2f})")
    print(f"Topic Keywords   : {row['Topic_Keywords']}")
    print("Abstract Snippet :")
    print(row['Abstract_Snippet'])
    print("-" * 100)

# -------------------------------
# Step 6: Visualize Topic Distribution Across All Documents
# -------------------------------
# Bar chart of how many documents have each topic as their dominant topic,
# drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Dominant_Topic', data=df_all, palette="Set2", ax=ax)
ax.set_xlabel("Dominant Topic")
ax.set_ylabel("Number of Documents")
ax.set_title("Topic Distribution for Entire Dataset")
fig.tight_layout()
plt.show()

# -------------------------------
# Interpretation Guidance:
# -------------------------------
# The printed summary shows, for each document in the dataset:
#   • The document's article title.
#   • The dominant topic assigned to it along with the topic weight (a measure of confidence).
#   • The top keywords for that topic.
#   • A snippet of the abstract.
#
# This can help interpret themes (e.g., teaching approaches to lifecycle assessment, 
# sustainability, carbon footprints) across your entire data collection.
#
# You can refine this analysis by adjusting the number of topics, 
# further pre-processing the text (e.g., lemmatization), or trying different stopword lists.

In [None]:
#Cell 18

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# -------------------------------
# Step 1: Prepare the Text from ALL Entries
# -------------------------------
# Derive a new frame from the whole of lib_start and add one text field joining
# title and abstract; missing values are treated as empty strings.
df_all = lib_start.assign(
    Combined_Text=lambda frame: (
        frame['Article Title'].fillna('') + ". " + frame['Abstract'].fillna('')
    )
)

# -------------------------------
# Step 2: Vectorize the Text using CountVectorizer
# -------------------------------
# LDA works on raw term counts, hence CountVectorizer. Overly common and overly
# rare terms are filtered out (max_df / min_df), and the token pattern keeps only
# words made of 5 or more alphabetical characters.
count_vectorizer = CountVectorizer(
    token_pattern=r'\b[a-zA-Z]{5,}\b',
    stop_words="english",
    min_df=2,
    max_df=0.85,
)
count_data = count_vectorizer.fit_transform(df_all['Combined_Text'])

# -------------------------------
# Step 3: Run Topic Modeling with LDA
# -------------------------------
# Five topics this time.
n_topics = 5
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    max_iter=10,
)
# One row per document, one column per topic.
lda_topic_matrix = lda_model.fit_transform(count_data)

# -------------------------------
# Helper: Extract Top Keywords for Each Topic
# -------------------------------
def get_topic_keywords(model, feature_names, no_top_words=10):
    """List the strongest terms for each topic of a fitted model.

    ``model.components_`` is iterated topic by topic; for each array of term
    weights the positions are ranked from largest to smallest weight and the
    first ``no_top_words`` positions are translated to terms via
    ``feature_names``.

    Returns a list with one list of terms per topic.
    """
    def _top_terms(weights):
        # Positions sorted by descending weight.
        ranked_positions = weights.argsort()[::-1]
        return [feature_names[pos] for pos in ranked_positions[:no_top_words]]

    return list(map(_top_terms, model.components_))

# Top terms per topic, looked up through the vectorizer's vocabulary.
no_top_words = 10
feature_names = count_vectorizer.get_feature_names_out()
topic_keywords = get_topic_keywords(lda_model, feature_names, no_top_words)

# -------------------------------
# Step 4: Annotate Each Document with the Dominant Topic
# -------------------------------
# For each document (row of the topic matrix), take the topic with the highest
# value and record that value as the topic weight.
dominant_topics = np.argmax(lda_topic_matrix, axis=1)
max_topic_weights = np.max(lda_topic_matrix, axis=1)

df_all['Dominant_Topic'] = dominant_topics
df_all['Topic_Weight'] = max_topic_weights

# -------------------------------
# Step 5: Produce a Legible Summary Table
# -------------------------------
# Build a summary table that includes:
# - Document index (from the main DataFrame)
# - Article Title for context
# - Dominant Topic along with its weight
# - Top keywords for that topic
# - A snippet of the abstract for quick inspection
summary_columns = ['Article Title', 'Dominant_Topic', 'Topic_Weight', 'Abstract']
summary_table = df_all[summary_columns].copy()

# Map each dominant topic to its top keywords, then join as a comma-separated string.
summary_table['Topic_Keywords'] = df_all['Dominant_Topic'].apply(
    lambda t: ", ".join(topic_keywords[t])
)

# Create an abstract snippet: the first 250 characters followed by "...".
# Abstracts of 250 characters or fewer are shown in full WITHOUT the ellipsis,
# so "..." only ever appears when text was actually cut off.
summary_table['Abstract_Snippet'] = df_all['Abstract'].fillna("").apply(
    lambda text: text if len(text) <= 250 else text[:250] + "..."
)

print("Summary of All Documents and LDA Topics:\n")
for idx, row in summary_table.iterrows():
    print(f"Document (DataFrame Index): {idx}")
    print(f"Article Title    : {row['Article Title']}")
    print(f"Dominant Topic   : {row['Dominant_Topic']} (Weight: {row['Topic_Weight']:.2f})")
    print(f"Topic Keywords   : {row['Topic_Keywords']}")
    print("Abstract Snippet :")
    print(row['Abstract_Snippet'])
    print("-" * 100)

# -------------------------------
# Step 6: Visualize Topic Distribution Across All Documents
# -------------------------------
# Bar chart of how many documents have each topic as their dominant topic,
# drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Dominant_Topic', data=df_all, palette="Set2", ax=ax)
ax.set_xlabel("Dominant Topic")
ax.set_ylabel("Number of Documents")
ax.set_title("Topic Distribution for Entire Dataset")
fig.tight_layout()
plt.show()

# -------------------------------
# Interpretation Guidance:
# -------------------------------
# The printed summary lists each document with:
#   • Its article title,
#   • The dominant topic (with a confidence weight),
#   • The top keywords associated with that topic,
#   • And a snippet of the abstract.
#
# This can help in a qualitative evaluation of how well the topics align with themes in your dataset.

In [None]:
#Cell 19

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.cluster import KMeans

# -------------------------------
# Step 1: Prepare the Text for All Entries
# -------------------------------
# Use the abstracts of every row in lib_start; missing abstracts become empty
# strings so the vectorizer always receives text.
documents = lib_start["Abstract"].fillna("")

# -------------------------------
# Step 2: Text Vectorization with TF-IDF
# -------------------------------
# TfidfVectorizer turns each abstract into TF-IDF weights. English stop words
# are removed and max_df=0.85 drops terms that occur in more than 85% of the
# abstracts; adjust max_df as needed to exclude overly common words.
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.85)
X = vectorizer.fit_transform(documents)

# -------------------------------
# Step 3a: Clustering with KMeans
# -------------------------------
# Choose a number of clusters. For demonstration, we use 3.
num_clusters = 3
# n_init is set explicitly: scikit-learn changed its default from 10 to "auto"
# in version 1.4, so leaving it implicit gives different clusters on different
# installations (and a FutureWarning on 1.2/1.3).
kmeans_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = kmeans_model.fit_predict(X)
# Add cluster labels to the main DataFrame for reference.
# NOTE: this writes a "Cluster" column into lib_start itself (not a copy).
lib_start["Cluster"] = cluster_labels

# -------------------------------
# Step 3b: Dimensionality Reduction (PCA)
# -------------------------------
# Project the TF-IDF features (converted to a dense array) onto two principal
# components so the documents can be drawn on a plane.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Scatter plot of the projected documents, coloured by KMeans cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=cluster_labels,
    palette="deep",
    s=100,
    ax=ax,
)

# Label every point with the index label of its row in lib_start, nudged
# slightly up and to the right so the text does not sit on the marker.
for (pc1, pc2), doc_index in zip(X_pca, lib_start.index):
    ax.text(pc1 + 0.01, pc2 + 0.01, str(doc_index), fontsize=9)

ax.set_title("PCA of Abstracts with KMeans Clusters")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend(title="Cluster")
fig.tight_layout()
plt.show()

# -------------------------------
# Step 4: Topic Modeling with Latent Dirichlet Allocation (LDA)
# -------------------------------
# LDA models word COUNTS, so it should not be fitted on the TF-IDF weights in X
# (earlier cells of this notebook make the same point when they pick
# CountVectorizer for LDA). We therefore count terms over exactly the same
# vocabulary that `vectorizer` learned: passing its vocabulary_ fixes the column
# order, so `vectorizer.get_feature_names_out()` still lines up with the
# columns of the fitted topics.
from sklearn.feature_extraction.text import CountVectorizer

lda_count_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_)
X_counts = lda_count_vectorizer.fit_transform(documents)

n_topics = 3
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
# One row per document, one column per topic.
lda_topic_matrix = lda.fit_transform(X_counts)

def display_topics(model, feature_names, num_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one array of term weights per topic.
    feature_names : sequence
        Term for each column position of the weight arrays.
    num_top_words : int
        Number of terms printed per topic.

    Output lines have the form ``Topic <n>: term1 term2 ...``, terms ordered
    from largest to smallest weight. Nothing is returned.
    """
    for topic_number, weights in enumerate(model.components_):
        # Positions of the largest weights, biggest first.
        strongest = weights.argsort()[::-1][:num_top_words]
        joined_terms = " ".join(feature_names[pos] for pos in strongest)
        print(f"Topic {topic_number}: {joined_terms}")

# Print the ten strongest terms of every LDA topic, using the vocabulary of
# the TF-IDF vectorizer to translate column positions into words.
num_top_words = 10
feature_names = vectorizer.get_feature_names_out()

print("Extracted Topics:")
display_topics(
    model=lda,
    feature_names=feature_names,
    num_top_words=num_top_words,
)

In [None]:
#Cell 20

import pandas as pd
from collections import Counter

# Two-letter abbreviations of the 50 US states.
US_STATES = set(
    "AL AK AZ AR CA CO CT DE FL GA HI ID "
    "IL IN IA KS KY LA ME MD MA MI MN MS "
    "MO MT NE NV NH NJ NM NY NC ND OH OK "
    "OR PA RI SC SD TN TX UT VT VA WA WV "
    "WI WY".split()
)

def standardize_country(country):
    """Map the different spellings of the United States onto ``'USA'``.

    The token is stripped of surrounding whitespace first. It is reported as
    ``'USA'`` when, ignoring case, it is one of 'usa', 'united states',
    'u.s.a.' or 'us', or when it is exactly two characters long and matches a
    US state abbreviation. Any other token is returned stripped but otherwise
    unchanged.
    """
    cleaned = country.strip()
    usa_spellings = ("usa", "united states", "u.s.a.", "us")
    is_usa_spelling = cleaned.lower() in usa_spellings
    is_state_code = len(cleaned) == 2 and cleaned.upper() in US_STATES
    return "USA" if (is_usa_spelling or is_state_code) else cleaned

def extract_countries(address):
    """Return the standardized country of every address in ``address``.

    ``address`` may hold several addresses separated by semicolons. For each
    one, the text after its last comma is taken as the country and passed
    through ``standardize_country``; addresses without a comma, and countries
    that standardize to an empty string, are skipped. A missing value
    (``pd.isna``) yields an empty list.
    """
    if pd.isna(address):
        return []

    countries = []
    for chunk in address.split(";"):
        # rpartition gives an empty separator when the chunk has no comma.
        _, comma, tail = chunk.rpartition(",")
        if not comma:
            continue
        name = standardize_country(tail)
        if name:
            countries.append(name)
    return countries

# Plotly Express is needed for the map below. It is imported here because this
# cell does not otherwise bring `px` into scope, so on a fresh kernel the
# px.scatter_geo call would fail with a NameError.
import plotly.express as px

# Apply extraction and standardization on the 'Addresses' column:
# every row gets a list of country names (empty when the address is missing).
lib_start['Countries'] = lib_start['Addresses'].apply(extract_countries)

# Aggregate counts: go through each row's list of countries and count occurrences.
country_counter = Counter()
for country_list in lib_start['Countries'].dropna():
    country_counter.update(country_list)

# (Optional) Print out the counter to verify that all US addresses are now combined.
# print(country_counter)

# Convert the counter to a two-column DataFrame: Country, Author_Count.
country_counts_df = pd.DataFrame.from_dict(country_counter, orient='index', columns=['Author_Count'])
country_counts_df = country_counts_df.reset_index().rename(columns={'index': 'Country'})

# Remove any empty strings (if any) from the country counts.
country_counts_df = country_counts_df[country_counts_df['Country'] != ""]

# Create a bubble map using Plotly Express.
fig = px.scatter_geo(
    country_counts_df,
    locations="Country",            # Use the country names for locations.
    locationmode="country names",   # Match based on full country names.
    size="Author_Count",            # Bubble size based on count of authors.
    text="Author_Count",            # Display the count inside the bubble.
    size_max=50,                    # Maximum bubble size.
    projection="robinson"           # Robinson world projection.
)

# Adjust text to be centered within the bubbles.
fig.update_traces(textposition='middle center', marker=dict(sizemode='area'))

# Remove the graph title, colour the land/lakes, drop the margins and use a
# black background both around the map and behind the plotting area.
fig.update_layout(
    title_text="",
    geo=dict(
        showland=True,
        landcolor="LightGreen",
        lakecolor='white'
    ),
    margin=dict(l=0, r=0, t=0, b=0),
    paper_bgcolor="black",       # Sets the background outside the map.
    plot_bgcolor="black"         # Sets the background for the plotting area.
)

fig.show()

In [None]:
#Cell 21

import pandas as pd
from collections import Counter
import re
import plotly.express as px

# Define a set of US state abbreviations.
US_STATES = {
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID",
    "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS",
    "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK",
    "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV",
    "WI", "WY"
}

def standardize_country(country):
    """
    Clean and standardize a country token taken from the end of an address.

    The token is stripped of surrounding whitespace and of full stops, then:
      - If it exactly matches a common USA expression ("USA", "United States",
        "United States of America", "US", "U S A"; case-insensitive),
        return "USA".
      - If its first word is a US state abbreviation (e.g., "CA" or
        "CA 94043"), return "USA".
      - If its last word is "USA" (e.g., "DC 20250 USA", where the leading
        code is not one of the 50 state abbreviations), return "USA".
      - Otherwise, return the cleaned token in title-case.

    An empty token yields an empty string.
    """
    token = country.strip().replace('.', '')
    token_upper = token.upper()

    # Check for explicit variants of USA.
    if token_upper in {"USA", "UNITED STATES", "UNITED STATES OF AMERICA", "US", "U S A"}:
        return "USA"

    # A leading state abbreviation or a trailing "USA" both identify a US address.
    # (A bare two-letter state code is covered by the leading-word check.)
    tokens = token_upper.split()
    if tokens and (tokens[0] in US_STATES or tokens[-1] == "USA"):
        return "USA"

    return token.title()

def extract_countries(address):
    """Collect the standardized country of each address in ``address``.

    Several addresses may be packed into one string, separated by semicolons.
    The country of an address is assumed to be whatever follows its last
    comma; it is normalised with ``standardize_country``. Addresses with no
    comma and countries that normalise to an empty string are left out.
    Missing input (``pd.isna``) gives an empty list.
    """
    if pd.isna(address):
        return []

    countries = []
    for single_address in address.split(";"):
        pieces = single_address.split(',')
        if len(pieces) < 2:
            # No comma: nothing that can be read as a country.
            continue
        standardized = standardize_country(pieces[-1])
        if standardized:
            countries.append(standardized)
    return countries

# One list of standardized country names per row of lib_start.
lib_start['Countries'] = lib_start['Addresses'].apply(extract_countries)

# Tally how often each country occurs across all rows.
country_counter = Counter(
    country_name
    for row_countries in lib_start['Countries'].dropna()
    for country_name in row_countries
)

# Two-column table (Country, Author_Count) without empty country names.
country_counts_df = (
    pd.DataFrame.from_dict(country_counter, orient='index', columns=['Author_Count'])
    .reset_index()
    .rename(columns={'index': 'Country'})
    .loc[lambda counts: counts['Country'] != ""]
)

# Bubble map: one bubble per country, sized and labelled by its count.
fig = px.scatter_geo(
    country_counts_df,
    locations="Country",
    locationmode="country names",   # Countries are matched by name.
    size="Author_Count",
    text="Author_Count",
    size_max=50,
    projection="robinson",
)

# Centre the count inside each bubble; bubble area (not diameter) encodes the value.
fig.update_traces(
    textposition='middle center',
    marker=dict(sizemode='area'),
)

# No title, no margins, green land with white lakes, black background.
fig.update_layout(
    title_text="",
    geo=dict(showland=True, landcolor="LightGreen", lakecolor='white'),
    margin=dict(l=0, r=0, t=0, b=0),
    paper_bgcolor="black",
    plot_bgcolor="black",
)

# Display the figure.
fig.show()

In [None]:
#Cell 22

import pandas as pd
from collections import Counter
import re
import plotly.express as px

# Define a set of US state abbreviations.
US_STATES = {
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID",
    "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS",
    "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK",
    "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV",
    "WI", "WY"
}

def standardize_country(country):
    """
    Clean and standardize a country token taken from the end of an address.

    The token is stripped of surrounding whitespace and of full stops, then:
      - If it exactly matches a common USA expression ("USA", "United States",
        "United States of America", "US", "U S A"; case-insensitive),
        return "USA".
      - If its first word is a US state abbreviation (e.g., "CA" or
        "CA 94043"), return "USA".
      - If its last word is "USA" (e.g., "DC 20250 USA", where the leading
        code is not one of the 50 state abbreviations), return "USA".
      - Otherwise, return the cleaned token in title-case.

    An empty token yields an empty string.
    """
    token = country.strip().replace('.', '')
    token_upper = token.upper()

    # Check for explicit variants of USA.
    if token_upper in {"USA", "UNITED STATES", "UNITED STATES OF AMERICA", "US", "U S A"}:
        return "USA"

    # A leading state abbreviation or a trailing "USA" both identify a US address.
    # (A bare two-letter state code is covered by the leading-word check.)
    tokens = token_upper.split()
    if tokens and (tokens[0] in US_STATES or tokens[-1] == "USA"):
        return "USA"

    return token.title()

def extract_countries(address):
    """Return a list with the standardized country of each address.

    The input may contain several semicolon-separated addresses. Only
    addresses containing a comma are considered; the part after the last
    comma is treated as the country and run through ``standardize_country``.
    Results that are empty strings are dropped. A missing input
    (``pd.isna``) produces an empty list.
    """
    if pd.isna(address):
        return []

    # Lazily standardize the trailing, comma-separated part of each address.
    standardized_names = (
        standardize_country(entry.rsplit(',', 1)[1])
        for entry in address.split(";")
        if ',' in entry
    )
    return [name for name in standardized_names if name]

# One list of standardized country names per row of lib_start.
lib_start['Countries'] = lib_start['Addresses'].apply(extract_countries)

# Tally how often each country occurs across all rows.
country_counter = Counter(
    country_name
    for row_countries in lib_start['Countries'].dropna()
    for country_name in row_countries
)

# Two-column table (Country, Author_Count) without empty country names.
country_counts_df = (
    pd.DataFrame.from_dict(country_counter, orient='index', columns=['Author_Count'])
    .reset_index()
    .rename(columns={'index': 'Country'})
    .loc[lambda counts: counts['Country'] != ""]
)

# Bubble map: one bubble per country, sized and labelled by its count.
fig = px.scatter_geo(
    country_counts_df,
    locations="Country",
    locationmode="country names",   # Countries are matched by name.
    size="Author_Count",
    text="Author_Count",
    size_max=50,
    projection="mercator",          # Mercator projection.
)

# Centre the count inside each bubble; bubble area (not diameter) encodes the value.
fig.update_traces(
    textposition='middle center',
    marker=dict(sizemode='area'),
)

# Single layout update: no title or margins, green land with white lakes,
# white 1-px country borders, and a black background inside and around the map.
fig.update_layout(
    title_text="",
    geo=dict(
        showland=True,
        landcolor="LightGreen",
        lakecolor='white',
        showcountries=True,
        countrycolor="white",
        countrywidth=1,
    ),
    margin=dict(l=0, r=0, t=0, b=0),
    paper_bgcolor="black",
    plot_bgcolor="black",
)

# Display the figure.
fig.show()

In [None]:
#Cell 23

# --------------------
# Step 1: Function to extract and standardize country names
# --------------------
def extract_countries(address):
    """Return the raw country string of every address in ``address``.

    ``address`` may contain several addresses separated by semicolons. For
    each address that contains at least one comma, the text after the last
    comma is stripped of surrounding whitespace and kept if it is non-empty.
    A missing value (``pd.isna``) yields an empty list.
    """
    if pd.isna(address):
        return []

    found = []
    for chunk in address.split(";"):
        # An empty separator means the chunk has no comma at all.
        _, comma, tail = chunk.rpartition(",")
        name = tail.strip()
        if comma and name:
            found.append(name)
    return found

def standardize_country(country):
    """
    Standardizes country names:
      - Converts various representations of the United States to 'USA'.

    A name counts as the United States when, ignoring case and surrounding
    whitespace, it is one of 'usa', 'united states', 'u.s.a.' or 'us', or when
    its last word is 'USA' / 'U.S.A.' (e.g. 'CA 95616 USA', where a state code
    and postcode precede the country). Every other name is returned unchanged.
    """
    cleaned = country.strip()
    if cleaned.lower() in {"usa", "united states", "u.s.a.", "us"}:
        return "USA"

    # Address tails such as "<state> <zip> USA" still denote the United States.
    words = cleaned.split()
    if words and words[-1].upper() in {"USA", "U.S.A."}:
        return "USA"

    return country

# For every row, extract the countries from 'Addresses' and standardize each one.
lib_start['Countries'] = lib_start['Addresses'].apply(
    lambda addr: list(map(standardize_country, extract_countries(addr)))
)

# --------------------
# Step 2: Aggregate author counts by country
# --------------------
# Flatten the per-row lists into one long list of country names ...
all_countries = [
    country_name
    for row_countries in lib_start['Countries'].dropna()
    for country_name in row_countries
]

# ... and count how often each name occurs.
country_counter = Counter(all_countries)

# Two-column table (Country, Author_Count) without empty country names.
df_country_counts = (
    pd.DataFrame.from_dict(country_counter, orient='index', columns=['Author_Count'])
    .reset_index()
    .rename(columns={'index': 'Country'})
    .loc[lambda counts: counts['Country'] != ""]
)

# --------------------
# Step 3: Create a Bubble Map (Static Image)
# --------------------
fig = px.scatter_geo(
    df_country_counts,
    locations="Country",
    locationmode="country names",   # Countries are matched by name.
    size="Author_Count",            # Bubble size corresponds to author count.
    text="Author_Count",            # The count is drawn as a text label.
    projection="natural earth",
    size_max=100,                   # Adjust maximum bubble size if needed.
)

# Enlarge the map for clarity.
fig.update_layout(width=1200, height=800)

# Labels sit above the bubbles, bubble area encodes the value, and hover
# information is switched off for a static look.
fig.update_traces(
    textposition="top center",
    marker=dict(sizemode='area'),
    hoverinfo="skip",
)

# Save the map as a PNG file, explicitly requesting the kaleido engine.
fig.write_image("static_bubble_map.png", engine="kaleido")

# Optionally display the static image in the notebook.
fig.show(renderer="png")