# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import seaborn as sns
import spacy
from itertools import combinations
from collections import Counter
import math
import networkx as nx
import ipywidgets as widgets
from ipywidgets import interact
from wordcloud import WordCloud
from PIL import Image
import requests
import warnings
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Load spaCy's small English pipeline once; get_tokens uses it as its default `nlp` argument.
nlp = spacy.load("en_core_web_sm")

# Functions to get the data

In [None]:
def get_df(file_year):
  """Download the CSV for one year and add a combined, lower-cased "Paper" column.

  Args:
    file_year: Value substituted into the ``CP_{}.csv`` file name. The value
      2015 is handled without reading a "KeyWords" column.

  Returns:
    The DataFrame read from the CSV (indexed by its "Unnamed: 0" column) with
    an extra "Paper" column: each text column lower-cased and followed by ", ".
  """
  base_url = "https://raw.githubusercontent.com/edavgaun/ASEM-Analysis-App/refs/heads/main/Data/CP_{}.csv"
  df = pd.read_csv(base_url.format(file_year), index_col="Unnamed: 0")

  if file_year != 2015:
    text_columns = ["Title", "KeyWords", "Abstract"]
  else:
    text_columns = ["Title", "Abstract"]

  paper = pd.Series("", index=df.index, dtype=object)
  for column in text_columns:
    # A missing field contributes an empty string instead of turning the whole
    # concatenation into NaN (which would drop the paper downstream).
    paper = paper + df[column].fillna("").astype(str).str.lower() + ", "
  df["Paper"] = paper
  return df

In [None]:
def get_corpus(df, year):
  """Flatten the text columns of ``df`` into one ", "-separated corpus string.

  Titles come first, then keywords (skipped when ``year`` is 2015), then
  abstracts. Non-string cells (e.g. NaN) contribute an empty entry.

  Args:
    df: DataFrame with "Title" and "Abstract" columns, plus "KeyWords" unless
      ``year`` is 2015.
    year: Year of the data set; only compared against 2015.

  Returns:
    The corpus string. Inside every ", "-delimited chunk spaces become "-",
    ";" and "." become ",", and "-," is removed; afterwards "4,0" and "5,0"
    are restored to "4.0" and "5.0".
  """
  columns = ["Title", "Abstract"] if year == 2015 else ["Title", "KeyWords", "Abstract"]

  entries = []
  for column in columns:
    entries.extend(
        text.lower() if isinstance(text, str) else ""
        for text in df[column].values
    )

  # Join everything once so the last entry of one column and the first entry
  # of the next are separated like any other pair of entries.
  joined = ", ".join(entries)

  cleaned = [
      chunk.replace(" ", "-").replace(";", ",").replace(".", ",").replace("-,", "")
      for chunk in joined.split(", ")
  ]
  corpus = ", ".join(cleaned)
  # The "." -> "," rewrite above also hits decimals; restore the two handled here.
  return corpus.replace("4,0", "4.0").replace("5,0", "5.0")

In [None]:
def get_tokens(corpus, nlp=nlp):
  """Run the spaCy pipeline over the unique concepts found in ``corpus``.

  Args:
    corpus: String of ", "-separated chunks, as returned by ``get_corpus``.
    nlp: Callable applied to the re-joined text; defaults to the module-level
      spaCy pipeline.

  Returns:
    Whatever ``nlp`` returns for the ", "-joined concept text.
  """
  # Sort the de-duplicated chunks: plain set iteration order for strings can
  # differ between interpreter runs, which would reorder the text fed to `nlp`.
  unique_chunks = sorted({chunk for chunk in corpus.split(", ") if len(chunk) >= 2})
  concepts = [chunk.replace("--", "-").replace("-", " ") for chunk in unique_chunks]
  return nlp(", ".join(concepts))

In [None]:
def get_bow(tokens):
  """Count lemma occurrences over the tokens worth keeping.

  A token is skipped when it is a stop word, punctuation, a digit, or shorter
  than two characters (by ``len(token)``).

  Args:
    tokens: Iterable of objects exposing ``is_stop``, ``is_punct``,
      ``is_digit``, ``lemma_`` and ``len()``.

  Returns:
    dict mapping each lemma to the number of kept tokens carrying it.
  """
  bow_kw = {}
  for token in tokens:
    if token.is_stop or token.is_punct or token.is_digit or len(token) < 2:
      continue
    lemma = token.lemma_
    bow_kw[lemma] = bow_kw.get(lemma, 0) + 1
  return bow_kw

In [None]:
def get_bow_df(bow):
  """Turn a word->count mapping into a DataFrame ranked by descending count.

  Args:
    bow: Mapping of word to frequency.

  Returns:
    DataFrame with columns "Word" and "frq", sorted by "frq" from highest to
    lowest and re-indexed from 0.
  """
  table = pd.DataFrame({"Word": bow.keys(), "frq": bow.values()})
  ranked = table.sort_values("frq", ascending=False)
  return ranked.reset_index(drop=True)

$$
\large
C(n, r) = \binom{n}{r} = \frac{n!}{r!\,(n-r)!}
$$

In [None]:
def get_dict():
  """Download the custom stop-word list and return it as a sorted list.

  The file body is stripped, newlines are treated like ", " separators, and
  the text is split on ", ".

  Returns:
    Sorted list of the entries found in the file.

  Raises:
    requests.RequestException: If the request times out, fails, or the server
      answers with an HTTP error status.
  """
  url="https://raw.githubusercontent.com/edavgaun/ASEM-Analysis-App/main/Data/own_stopwords.txt"
  # Without a timeout a stalled connection blocks forever.
  response = requests.get(url, timeout=30)
  # Fail loudly on 4xx/5xx instead of parsing an error page as stop words.
  response.raise_for_status()
  content = response.text.strip().replace("\n", ", ").split(", ")
  content.sort()
  return content

In [None]:
def get_topN_word_bow_df(num_word, bow_df, own_stopwords=None):
  """Return the first ``num_word`` words of ``bow_df`` that are not stop words.

  Args:
    num_word: Maximum number of words to return.
    bow_df: DataFrame with a "Word" column; rows are taken in their existing
      order.
    own_stopwords: Collection of words to exclude. When omitted, the list is
      fetched with ``get_dict()`` on first use and reused afterwards.

  Returns:
    numpy array of the selected words.
  """
  if own_stopwords is None:
    # The default cannot be bound in the signature: no module-level
    # `own_stopwords` exists when this function is defined. Fetch lazily and
    # keep the result on the function so repeated calls do not re-download it.
    cached = getattr(get_topN_word_bow_df, "_default_stopwords", None)
    if cached is None:
      cached = get_dict()
      get_topN_word_bow_df._default_stopwords = cached
    own_stopwords = cached
  return bow_df[~bow_df.Word.isin(own_stopwords)].head(num_word).Word.values

In [None]:
def get_word_frq(bow_df, Top_KW):
  """Look up the frequency of each selected word.

  Args:
    bow_df: DataFrame with "Word" and "frq" columns.
    Top_KW: Collection of words to keep.

  Returns:
    dict mapping every word of ``Top_KW`` present in ``bow_df`` to its "frq".
  """
  selected = bow_df.Word.isin(Top_KW)
  by_word = bow_df[selected].set_index("Word")
  column_maps = by_word.to_dict()
  return column_maps["frq"]

In [None]:
# Count how often each pair of keywords appears in the same paper
def get_combinations(df, bow_df, KW_values):
  """Count, per keyword pair, the papers in which both keywords occur.

  A keyword "occurs" in a paper when it equals one of the whitespace-separated
  tokens of that paper's "Paper" text.
  NOTE(review): tokens keep attached punctuation (e.g. "lean,"), so such
  occurrences are not matched -- confirm whether that is intended.

  Args:
    df: DataFrame with a "Paper" text column; missing values are skipped.
    bow_df: Unused; kept so existing callers keep working.
    KW_values: Iterable of keywords. Pairs follow its order.

  Returns:
    DataFrame with columns "Count", "Word1", "Word2", one row per pair that
    co-occurs at least once. Empty (same columns) when no pair co-occurs.
  """
  keywords = list(KW_values)
  pair_counter = Counter()

  for paper in df["Paper"].dropna():
    words_in_paper = set(paper.split())
    # Only keywords present in this paper can form a co-occurring pair, so
    # pair up those instead of testing every possible pair.
    present = [word for word in keywords if word in words_in_paper]
    pair_counter.update(combinations(present, 2))

  # Building the rows directly also works when nothing was counted.
  rows = [(count, w1, w2) for (w1, w2), count in pair_counter.items()]
  return pd.DataFrame(rows, columns=["Count", "Word1", "Word2"])

In [None]:
# Build the per-year caches (raw frame, corpus, tokens, bag of words, ranked
# bag-of-words frame) for ten consecutive years beginning at `start`.
start=2015
dfs, corpuses, tokenses, bows, bow_dfs = {}, {}, {}, {}, {}
for year in range(start, start + 10):
  try:
    year_df = get_df(year)
    year_corpus = get_corpus(year_df, year)
    year_tokens = get_tokens(year_corpus)
    year_bow = get_bow(year_tokens)
    year_bow_df = get_bow_df(year_bow)
  except Exception as err:
    # Stay best-effort, but say which year was skipped and why.
    # print() is used because warnings are globally silenced in this notebook.
    print("Skipping {}: {}: {}".format(year, type(err).__name__, err))
    continue
  # Store only after the whole pipeline succeeded so that all five
  # dictionaries always share the same set of years.
  dfs[year] = year_df
  corpuses[year] = year_corpus
  tokenses[year] = year_tokens
  bows[year] = year_bow
  bow_dfs[year] = year_bow_df

# Network Graph

In [None]:
def draw_Network(data_year, num_word=10, random_loc=0):
  """Draw the co-occurrence network of the top words for one year.

  Nodes are words, sized by their frequency and coloured by their degree
  relative to the highest degree in the graph. Edges link words that share at
  least one paper; edge width grows with the pair count divided by
  ``num_word``.

  Args:
    data_year: Key into the module-level ``dfs`` / ``bow_dfs`` caches.
    num_word: Number of top words requested from ``get_topN_word_bow_df``.
    random_loc: Seed for the spring layout; any value ``int()`` rejects
      (e.g. a half-typed text box) falls back to 0.

  Returns:
    None. The figure is shown with ``plt.show()``; a message is printed
    instead when the year is not loaded or no pair co-occurs.
  """
  if data_year not in dfs or data_year not in bow_dfs:
    print("No data loaded for {}.".format(data_year))
    return

  df = dfs[data_year]
  bow_df = bow_dfs[data_year]
  KW = get_topN_word_bow_df(num_word, bow_df)
  word_frequencies = get_word_frq(bow_df, KW)
  df_comb = get_combinations(df, bow_df, KW)

  try:
    layout_seed = int(random_loc)
  except (TypeError, ValueError):
    layout_seed = 0

  # Build the graph: one weighted edge per co-occurring word pair
  G = nx.Graph()
  for w1, w2, count in zip(df_comb["Word1"], df_comb["Word2"], df_comb["Count"]):
    if count > 0:
      G.add_edge(w1, w2, weight=count / num_word)

  if G.number_of_nodes() == 0:
    print("No word co-occurrences found for {}.".format(data_year))
    return

  # Node degree, stored on the graph and scaled to [0, 1] for colouring
  node_degrees = dict(G.degree())
  nx.set_node_attributes(G, node_degrees, "degree")
  max_degree = max(node_degrees.values())
  node_colors = [node_degrees[node] / max_degree for node in G.nodes]

  # One norm shared by the nodes and the colorbar, so the bar describes the
  # colours that are actually drawn.
  cmap = cm.plasma
  norm = mcolors.Normalize(vmin=0, vmax=1)

  pos = nx.spring_layout(G, seed=layout_seed, k=0.7)

  # Node area follows word frequency (1 when the word has no recorded count)
  node_sizes = [word_frequencies.get(word, 1) * 5 for word in G.nodes()]

  fig, ax = plt.subplots(figsize=(12, 9))

  edge_weights = [data["weight"] * 7 for _, _, data in G.edges(data=True)]
  nx.draw_networkx_edges(G, pos, alpha=0.6, width=edge_weights, edge_color="gray", ax=ax)

  # draw_networkx_nodes takes vmin/vmax rather than a norm object
  nx.draw_networkx_nodes(
      G, pos, node_size=node_sizes, node_color=node_colors, cmap=cmap,
      vmin=norm.vmin, vmax=norm.vmax, edgecolors="black", alpha=0.9, ax=ax
  )

  nx.draw_networkx_labels(G, pos, font_size=10, font_weight="bold",
                          verticalalignment="top", ax=ax)

  sm = cm.ScalarMappable(cmap=cmap, norm=norm)
  sm.set_array([])  # Empty array for colorbar to work
  cbar = plt.colorbar(sm, ax=ax, fraction=0.02, pad=0.04)
  cbar.set_label("Degree Centrality", fontsize=12)

  plt.title("Word Co-occurrence Network, {} (Semantic Clustering)".format(data_year), fontsize=14)
  plt.axis("off")
  plt.show()

In [None]:
# Offer only the years that were actually loaded into `dfs`; the previous
# 2015-2025 slider could select a year with no data. Fall back to the ten
# years the loader attempts when nothing was loaded.
interact(draw_Network, data_year=sorted(dfs) or list(range(2015, 2025)),
                       num_word=(3,50,1),
                       random_loc="0")

# Word Cloud

In [None]:
def draw_word_cloud(data_year, num_word=10):
  """Show a logo-shaped word cloud of the most frequent words of one year.

  The data for ``data_year`` is downloaded and processed on every call.

  Args:
    data_year: Year passed to ``get_df`` / ``get_corpus``.
    num_word: Maximum number of words (taken in ``get_bow_df`` order, i.e.
      most frequent first) placed in the cloud.

  Returns:
    None. The figure is shown with ``plt.show()``; a message is printed
    instead when no word is left after filtering.

  Raises:
    requests.RequestException: If the logo download times out, fails, or
      returns an HTTP error status.
  """
  import imageio.v2 as imageio
  from io import BytesIO

  logo_url = "https://raw.githubusercontent.com/edavgaun/ASEM-Analysis-App/main/assets/asem_logo.png"
  df = get_df(data_year)
  corpus = get_corpus(df, data_year)
  tokens = get_tokens(corpus)
  bow = get_bow(tokens)
  bow_df = get_bow_df(bow)
  own_stopwords = get_dict()

  # Drop the custom stop words (same filter direction as
  # get_topN_word_bow_df) and keep the requested number of top words.
  kept = bow_df[~bow_df.Word.isin(own_stopwords)].head(num_word)
  word_freq = dict(zip(kept["Word"], kept["frq"]))
  if not word_freq:
    # WordCloud cannot be generated from an empty frequency table.
    print("No words to plot for {}.".format(data_year))
    return

  response = requests.get(logo_url, timeout=30)
  response.raise_for_status()
  mask = imageio.imread(BytesIO(response.content))
  mask = np.where(mask > 128, 255, 0)  # Apply a threshold to get a binary mask

  wordcloud = WordCloud(width=1000, height=700, mask=mask,
                        background_color='white',contour_width=0.5, contour_color='Blue'
                        ).generate_from_frequencies(word_freq)

  # Plot the word cloud
  plt.figure(figsize=(10, 5))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis('off')  # Turn off axis labels
  plt.show()

# Radar Chart

In [None]:
def radar_chart(year, word, topN_Words, ax, color):
  """Plot, on a polar axis, how strongly ``word`` co-occurs with other top words.

  Each spoke is a word that shares at least one paper with ``word``; the
  radius is that pair's count divided by the Euclidean norm of all counts.

  Args:
    year: Key into the module-level ``dfs`` / ``bow_dfs`` caches.
    word: Word of interest (case-insensitive).
    topN_Words: Number of top words considered as possible partners.
    ax: Polar matplotlib axis to draw on.
    color: Colour for the outline and the filled area.

  Raises:
    KeyError: If ``year`` is not in the caches.
    IndexError: If ``word`` does not appear in that year's bag of words.
  """
  word = word.lower()
  df = dfs[year]
  bow_df = bow_dfs[year]
  KW = get_topN_word_bow_df(topN_Words, bow_df)
  df_comb = get_combinations(df, bow_df, KW)

  matches = bow_df.index[bow_df["Word"] == word]
  if len(matches) == 0:
    # Same exception type as the bare positional lookup, with a usable message.
    raise IndexError("'{}' not found in the {} bag of words".format(word, year))
  rank = matches[0]

  # Work on a copy so adding a column never writes into a slice of df_comb.
  involves_word = (df_comb.Word1 == word) | (df_comb.Word2 == word)
  df_comb_word = df_comb[involves_word].copy()
  # The label is the partner: whichever side of the pair is not `word`.
  df_comb_word["label"] = np.where(df_comb_word.Word1 == word,
                                   df_comb_word.Word2, df_comb_word.Word1)
  df_comb_word = df_comb_word.sort_values("label").reset_index()
  labels = df_comb_word.label.values.tolist()
  values = df_comb_word.Count.values
  norm = np.linalg.norm(values)
  norm_values = list(values / norm) if norm else [0.0] * len(values)

  # Convert to radians for the radar chart
  num_vars = len(labels)
  angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()

  # Close the radar chart (connect last point to first)
  norm_values += norm_values[:1]
  angles += angles[:1]

  # Plot the data
  ax.fill(angles, norm_values, color=color, alpha=0.25)  # Fill area
  ax.plot(angles, norm_values, color=color, linewidth=2)  # Line plot

  # Add category labels
  ax.set_xticks(angles[:-1])
  ax.set_xticklabels(labels, fontsize=10)
  ax.set_yticks([0.1, 0.2, 0.3, 0.4, 0.5])
  ax.set_yticklabels([0.1, 0.2, 0.3, 0.4, 0.5])

  # Annotate the centre with the word, the year and its frequency rank
  ax.text(0,0,"{}\n{}\nrank{}".format(word.upper(), year,rank+1), ha='center', va="center")

In [None]:
def compare_radar(word, topN_Words, year1, year2):
  """Show the radar charts of ``word`` for two years side by side.

  Args:
    word: Word whose co-occurrences are plotted.
    topN_Words: Number of top words considered, forwarded to ``radar_chart``.
    year1: Year drawn on the left axis (red).
    year2: Year drawn on the right axis (blue).

  A panel whose chart cannot be drawn shows a "Missing data" note instead.
  """
  fig, ax = plt.subplots(1,2, figsize=(14, 6), subplot_kw=dict(polar=True))
  for panel, year, color in zip(ax, (year1, year2), ("red", "blue")):
    try:
      radar_chart(year, word, topN_Words, panel, color)
    except Exception:
      # Best-effort per panel; unlike a bare `except`, this lets
      # KeyboardInterrupt / SystemExit propagate.
      panel.text(0,0,"Missing data\nfor this topic\nthis year", ha="center", va="center")
  plt.suptitle('Relationship between "{}" and Other Topics'.format(
                                                        word.title() ),
                                                        fontsize=20,
                                                        y=1.025)
  plt.show()

In [None]:
# Widget for compare_radar: free-text word, a 50-100 slider (step 10) for the
# number of top words, and one dropdown per year covering 2015 through 2024.
interact(
    compare_radar,
    word="",
    topN_Words=(50, 100, 10),
    year1=list(range(2015, 2025)),
    year2=list(range(2015, 2025)),
)

# Getting wide and long data frames

In [None]:
def get_wide_df():
  """Download ``data_full.csv`` from the project repository as a DataFrame."""
  return pd.read_csv(
      "https://raw.githubusercontent.com/edavgaun/ASEM-Analysis-App/refs/heads/main/Data/data_full.csv"
  )

In [None]:
def get_long_df():
  """Download ``df_long.csv`` as a DataFrame indexed by its "Unnamed: 0" column."""
  return pd.read_csv(
      "https://raw.githubusercontent.com/edavgaun/ASEM-Analysis-App/refs/heads/main/Data/df_long.csv",
      index_col="Unnamed: 0",
  )

# Bubble chart

In [None]:
def Bubble_chart(*top_words):
  """Draw a year-by-word bubble chart of frequencies for the chosen words.

  Bubble size and colour follow the "frq" column of the long data frame, and
  each bubble is annotated with its value.

  Args:
    *top_words: Words to plot; matched against the "Word" column.

  Returns:
    None. The figure is shown with ``plt.show()``; a message is printed
    instead when nothing is selected, nothing matches, or drawing fails.
  """
  if not top_words:
    print("Select at least one word to draw the bubble chart.")
    return

  top_words = sorted(top_words)
  df_long = get_long_df()
  df_plot = df_long[df_long.Word.isin(top_words)]
  if df_plot.empty:
    print("No data found for the selected words.")
    return

  try:
    fig, axs = plt.subplots(figsize=(10, max(2, len(top_words) * 0.9)))

    sns.scatterplot(
        data=df_plot,
        x='year', y='Word', size='frq', hue='frq',
        palette='Blues', sizes=(100, 3000),
        edgecolor='k', ax=axs,
        legend=False
    )

    # Values above 3/5 of the maximum sit on dark bubbles and get white, bold
    # text. The threshold is the same for every row, so compute it once.
    threshold = int(df_plot.frq.max()*3/5)
    for year, label, frq in zip(df_plot['year'], df_plot['Word'], df_plot['frq']):
        on_dark = frq > threshold
        axs.text(year, label, str(frq),
                 ha='center', va='center', fontsize=8, alpha=0.7,
                 color="white" if on_dark else "black",
                 fontweight="bold" if on_dark else None)

    axs.set_ylim(-0.6, len(top_words) - 0.4)
    axs.grid(axis='both', linestyle='--', alpha=0.4)
    axs.spines['top'].set_visible(False)
    axs.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.show()
  except Exception as err:
    # Keep the widget alive, but report the failure instead of hiding it.
    print("Could not draw the bubble chart: {}: {}".format(type(err).__name__, err))

In [None]:
def WS():
  """Build the multi-select widget listing the "Word" column of the wide data frame.

  Returns:
    An ``ipywidgets.SelectMultiple`` showing five rows at half width.
  """
  words = get_wide_df().Word
  half_width = widgets.Layout(width='50%')
  return widgets.SelectMultiple(
      options=words,
      description='Words to choose:',
      rows=5,
      style={'description_width': 'initial'},
      layout=half_width,
  )

In [None]:
@interact(top_words=WS())
def update_bubble_chart(top_words):
    """Redraw the bubble chart for the words currently selected in the widget."""
    selection = top_words
    Bubble_chart(*selection)

# Scatterplot

In [None]:
def scatterplot3D(random_state=None):
  """Show a 3-D t-SNE embedding of the wide data frame, one point per word.

  Points are coloured by the "Cluster" column and sized by the word's total
  frequency in the long data frame.

  Args:
    random_state: Optional seed forwarded to ``TSNE``. The default (None)
      leaves the embedding unseeded, so it may differ between runs.
  """
  data_full = get_wide_df().set_index("Word")
  df_long = get_long_df()

  # NOTE(review): every column of data_full, including "Cluster", is fed to
  # t-SNE -- confirm the cluster label is meant to be a feature.
  X = data_full.values
  X_embedded = TSNE(n_components=3, learning_rate='auto',
                    init='random', perplexity=3,
                    random_state=random_state).fit_transform(X)

  # groupby returns the totals sorted by word; put them back in data_full's
  # row order so every point gets its own word's size. Words without any
  # frequency record get size 0.
  totals = df_long.groupby("Word")["frq"].sum()
  sizes = totals.reindex(data_full.index).fillna(0)

  fig = px.scatter_3d(x=X_embedded[:,0], y=X_embedded[:,1], z=X_embedded[:,2],
                color=data_full.Cluster, size=sizes)
  fig.show()