In [None]:
# Install the packages this notebook needs. sentence_transformers and huggingface-hub
# are pinned to exact versions; the other packages are installed unpinned.
!pip install sentence_transformers==2.2.0
!pip install pandas
!pip install chardet
!pip install detect_delimiter
!pip install keybert
!pip install huggingface-hub==0.10.1

In [2]:
import sys
import time
import sys
import pandas as pd
import chardet
import codecs
from detect_delimiter import detect
from keybert import KeyBERT
from google.colab import files
from sentence_transformers import SentenceTransformer, util

# Mounting Google drive

In [3]:
from google.colab import drive
# Mount Google Drive under /content/drive so the following cells can work with files stored there
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/MyDrive/ARDUOUS/articledata2

/content/drive/MyDrive/ARDUOUS/articledata2


Check whether the directory "results" exists: create it if it does not, or delete all the files in it if it does.

In [5]:
import os
import shutil

# Output folder that has to exist and must not contain files from an earlier run
dir_path = "results"

if os.path.exists(dir_path):
    # The folder is already there: remove every regular file directly inside it
    for filename in os.listdir(dir_path):
        file_path = os.path.join(dir_path, filename)
        try:
            if not os.path.isfile(file_path):
                # Sub-directories and other non-file entries are left alone
                continue
            os.remove(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}: {e}")
else:
    # Nothing there yet: create the folder
    os.mkdir(dir_path)

# Extract keywords from the papers

Loading relevant information from json files.

In [7]:
import json
import os

all_pubs = []

# Get all entries in the current directory. They are sorted so the publication
# order (and every table derived from it) is the same on every run.
all_files = sorted(os.listdir("."))

# Loop through all entries, skipping everything that is not a file
for file in all_files:
    # Skip every directory (the "results" folder and any other one):
    # open() cannot read a directory and would raise an error
    if os.path.isdir(file):
        continue

    # Open the file and load the data as JSON
    with open(file, encoding="utf-8") as f:
        data = json.load(f)

    # Extract relevant fields from the JSON data and add them to a dictionary.
    # The three keyword groups default to empty lists in case the file lacks them.
    pub = {
        'publicationYear': data['publicationYear'],
        'title': data['title'],
        'INSPEC: Controlled Indexing': [],
        'INSPEC: Non-Controlled Indexing': [],
        'IEEE Keywords': []
    }
    # Each keyword group is stored under its own 'type'; a file without a
    # 'keywords' entry simply keeps the empty defaults
    for keyword in data.get('keywords', []):
        key = keyword['type']
        pub[key] = keyword['kwd']
    pub['abstract'] = data['abstract']

    # Add the dictionary to the list of all publications
    all_pubs.append(pub)


## Using KeyBERT for keyword extraction

In [8]:
from keybert import KeyBERT

# Create a KeyBERT object
kw_model = KeyBERT()

# Settings shared by every extraction below
keybert_options = dict(stop_words=None, use_mmr=False, nr_candidates=20, top_n=5)

# (result key, source field, n-gram range) of each extraction, listed in the order
# in which the result keys are added to a publication.
# (1, 1) extracts single words, (1, 2) extracts phrases of one or two words.
extraction_plan = [
    ('keyMonoBERT', 'abstract', (1, 1)),
    ('keyBiBERT', 'abstract', (1, 2)),
    ('keyBiTitle', 'title', (1, 2)),
    ('keyMonoTitle', 'title', (1, 1)),
]

# Loop through all publications and store the keywords of every planned extraction.
# Only the four results that are stored on the publication are computed.
for pub in all_pubs:
    for result_key, source_field, ngram_range in extraction_plan:
        pub[result_key] = kw_model.extract_keywords(
            pub[source_field],
            keyphrase_ngram_range=ngram_range,
            **keybert_options
        )

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [9]:
# Fields copied from every publication, in the order they appear in the result
kept_fields = (
    'publicationYear',
    'title',
    'keyMonoTitle',
    'keyBiTitle',
    'IEEE Keywords',
    'keyMonoBERT',
    'keyBiBERT',
    'INSPEC: Controlled Indexing',
    'INSPEC: Non-Controlled Indexing',
)

# One dictionary per publication, holding its metadata and extracted keywords
keywords = []
for pub in all_pubs:
    dict_key = {field: pub[field] for field in kept_fields}
    keywords.append(dict_key)


## Using Distil-BERT for keyword extraction

In [10]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load a pre-trained Sentence-BERT model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# One row per publication, built from the dictionaries collected in all_pubs
df_sen = pd.DataFrame(all_pubs)

# Define a function to extract the highest-weighted TF-IDF terms of a text
def extract_keywords_bert_tfidf(text, n_keywords=20):
    """Return up to ``n_keywords`` terms of ``text``, highest TF-IDF weight first.

    The TF-IDF vectorizer is fitted on ``text`` alone, so the ranking reflects
    how the terms are weighted inside this single text.

    NOTE(review): despite the function name, the returned keywords never
    depended on the Sentence-BERT embeddings; the embedding, similarity and
    count-matrix computations had no effect on the result and were removed.
    Confirm whether an embedding-based ranking was intended.

    Parameters:
        text: the text to analyse.
        n_keywords: maximum number of terms to return.

    Returns:
        A list of terms; empty when ``text`` is not a string, is blank, or
        contains no token the vectorizer accepts.
    """
    # Nothing to extract from missing or blank text
    if not isinstance(text, str) or not text.strip():
        return []

    # Compute the TF-IDF weights of the text
    tfidf = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf.fit_transform([text])
    except ValueError:
        # Raised by the vectorizer when no token is left to build a vocabulary from
        return []

    # Look the vocabulary up once instead of once per selected index
    feature_names = tfidf.get_feature_names_out()

    # Indices of the top N weights, highest first
    keywords_idx = tfidf_matrix.toarray()[0].argsort()[-n_keywords:][::-1]
    keywords = [feature_names[i] for i in keywords_idx]

    return keywords

# Define a function to collect the keywords of one row's abstract and title
def extract_keywords(row):
    """Return the keywords of ``row['abstract']`` followed by those of ``row['title']``."""
    texts = [row['abstract'], row['title']]
    keywords = []
    for text in texts:
        keywords.extend(extract_keywords_bert_tfidf(text))
    return keywords

# Apply the function to each row of the DataFrame (axis=1 passes one row at a time)
# and store the returned keywords in the new 'keywords' column
df_sen['keywords'] = df_sen.apply(extract_keywords, axis=1)


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [None]:
# Display the publications table, which now includes the 'keywords' column
df_sen

## Putting all the keywords together

In [None]:
# One row per publication from the keyword dictionaries, plus the keyword lists
# of df_sen as the extra column 'keywords_sen' (rows are matched by index)
df = pd.DataFrame(keywords).assign(keywords_sen=df_sen['keywords'])

# define the operation to apply: keep the first element of every item
def my_operation(x):
    """Return a list holding the first element of each item in ``x``."""
    first_elements = []
    for item in x:
        first_elements.append(item[0])
    return first_elements

# Reduce every KeyBERT result list to the first element of each of its items.
# Series.map is applied column by column instead of DataFrame.applymap, which
# recent pandas releases deprecate, so the cell runs on old and new pandas alike.
keybert_columns = ['keyMonoTitle', 'keyBiTitle', 'keyMonoBERT', 'keyBiBERT']
df[keybert_columns] = df[keybert_columns].apply(lambda column: column.map(my_operation))


# define a function to merge the keyword columns of a row into a single list
def merge_columns(row):
    """Concatenate the keyword lists of ``row`` into one new list.

    The lists are joined in the order of ``merged_fields`` below.
    NOTE(review): 'IEEE Keywords' is not part of the merge - confirm this is intended.
    """
    merged_fields = (
        'keyMonoTitle',
        'keyBiTitle',
        'keyMonoBERT',
        'keyBiBERT',
        'INSPEC: Non-Controlled Indexing',
        'INSPEC: Controlled Indexing',
        'keywords_sen',
    )
    merged = row[merged_fields[0]]
    for field in merged_fields[1:]:
        merged = merged + row[field]
    return merged

# apply the function to each row using apply; the merged lists are stored in 'merged_Keywords'
df['merged_Keywords'] = df.apply(merge_columns, axis=1).tolist()

# Display the table ordered by publication year; the sorted result is not assigned, so df keeps its row order
df.sort_values(by=['publicationYear'])

## Removing unwanted columns

In [13]:
# The individual keyword columns are no longer needed once they have been merged
merged_source_columns = [
    'keyMonoTitle',
    'keyBiTitle',
    'keyMonoBERT',
    'keyBiBERT',
    'INSPEC: Non-Controlled Indexing',
    'INSPEC: Controlled Indexing',
    'IEEE Keywords',
    'keywords_sen',
]
df_mergedKeys = df.drop(columns=merged_source_columns)

# Pre-processing the keywords.


1. Removing stop words,
2. Lemmatizing, and
3. Removing duplicates.



In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# download necessary packages
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): newer NLTK releases appear to load the tokenizer data from
# 'punkt_tab' instead of 'punkt'; it is fetched as well so word_tokenize works on both
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# df is a second name for df_mergedKeys (no copy is made)
df = df_mergedKeys

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# a token is kept only if it contains a run of at least two word characters
word_pattern = re.compile(r'\b\w{2,}\b')


def preprocess_keywords(keyword_list):
    """Tokenize, filter, lemmatize and de-duplicate the keywords of one publication.

    The keywords are joined with spaces and split into tokens. Stop words and
    tokens rejected by ``word_pattern`` are dropped; the rest is lower-cased and
    lemmatized.

    Returns:
        The distinct resulting tokens as a sorted list. Sorting keeps the output
        identical between runs, which a plain set-to-list conversion does not.
    """
    cleaned = set()
    for token in word_tokenize(' '.join(keyword_list)):
        lowered = token.lower()
        if lowered in stop_words or not word_pattern.search(token):
            continue
        cleaned.add(lemmatizer.lemmatize(lowered))
    return sorted(cleaned)


# remove stop words, lemmatize and remove duplicate keywords in each list
df_mergedKeys['merged_Keywords'] = df_mergedKeys['merged_Keywords'].apply(preprocess_keywords)

# display the updated DataFrame ordered by publication year
df_mergedKeys.sort_values(by=['publicationYear'])

In [17]:
# Change into the results folder; relative paths used from here on point inside it
%cd results
# Save the publications with their cleaned keyword lists
df_mergedKeys.to_csv('keys_per_title.csv', index=False)

/content/drive/MyDrive/ARDUOUS/articledata2/results


## Get all the keywords together, remove duplicates and save keywords file.

In [18]:
# explode the column into one keyword per row. A publication with an empty keyword
# list explodes to a missing value, which is dropped so that only real keywords remain
exploded = df_mergedKeys['merged_Keywords'].explode().dropna()

# get the unique values
unique_values = exploded.unique().tolist()


In [19]:
# explode the column into one keyword per row. A publication with an empty keyword
# list explodes to a missing value, which is dropped so that only real keywords remain
exploded = df_mergedKeys['merged_Keywords'].explode().dropna()

# get the unique values as a list
unique_keywords = exploded.unique().tolist()

# create a DataFrame with the unique values
df_unique = pd.DataFrame({'Keywords': unique_keywords})

In [20]:
# Save the unique keywords (single column 'Keywords') in the current working directory
df_unique.to_csv('keywords.csv', index=False)

# Clustering Keywords

## Setting cluster accuracy and size

In [21]:
# Clustering settings. cluster_accuracy is given on a 0-100 scale and is divided by 100
# before it is passed to the clustering as the similarity threshold.
cluster_accuracy = 45 #50 # 0-100 (100 = very tight clusters, but higher percentage of no_cluster groups)
min_cluster_size = 4 #3  # set the minimum size of cluster groups. (Lower number = tighter groups)

## Choose a Sentence Transformer
Download Pre-Trained Models: https://www.sbert.net/docs/pretrained_models.html

In [22]:
# Name of the Sentence Transformer model that is loaded for the clustering
transformer = 'all-mpnet-base-v2'  # provides the best quality
#transformer = 'all-MiniLM-L6-v2'  # 5 times faster and still offers good quality

In [None]:
# Work on a copy of the unique-keyword table, so the in-place column rename and the
# columns added during clustering do not alter df_unique
df = df_unique.copy()
count_rows = len(df)
if count_rows > 50_000:
  print("WARNING: You May Experience Crashes When Processing Over 50,000 Keywords at Once. Please consider smaller batches!")
print("Uploaded Keyword CSV File Successfully!")
df

In [24]:
# standardise the keyword columns: every known column title is renamed to "Keyword"
df.rename(columns={"Search term": "Keyword", "keyword": "Keyword", "query": "Keyword", "Top queries": "Keyword", "queries": "Keyword", "Keywords": "Keyword", "keywords": "Keyword", "Search terms report": "Keyword"}, inplace=True)

# stop right here with a clear message; the lines below need the 'Keyword' column
if "Keyword" not in df.columns:
  raise ValueError("Error! Please make sure your csv file contains a column named 'Keyword!")

# store the data
cluster_name_list = []
corpus_sentences_list = []
df_all = []

# corpus_set is the set of keywords that still have to be clustered;
# corpus_set_all keeps the full set of keywords
corpus_set = set(df['Keyword'])
corpus_set_all = corpus_set
cluster = True


## Clustering Keywords - This can take a while!

In [None]:
# keep looping through until no more clusters are created

# Convert the 0-100 setting into the 0-1 similarity threshold without overwriting
# cluster_accuracy, so that re-running this cell does not divide it a second time
similarity_threshold = cluster_accuracy / 100
model = SentenceTransformer(transformer)

remaining = len(corpus_set)   # number of keywords not yet clustered (used for the final report)
next_cluster_id = 1           # running number, so cluster names stay unique across passes
clustered_keywords = set()    # every keyword that has been assigned to a cluster so far

while True:

    corpus_sentences = list(corpus_set)
    check_len = len(corpus_sentences)

    # Fewer keywords than the minimum cluster size cannot form a cluster; stop here
    # instead of running the detection on too little (or no) data
    if check_len < min_cluster_size:
        break

    corpus_embeddings = model.encode(corpus_sentences, batch_size=256, show_progress_bar=True, convert_to_tensor=True)
    clusters = util.community_detection(corpus_embeddings, min_community_size=min_cluster_size, threshold=similarity_threshold, init_max_size=len(corpus_embeddings))

    # rows found in this pass only
    pass_cluster_names = []
    pass_keywords = []

    for members in clusters:
        # The name is only a temporary, unique label; numbering continues across
        # passes so that clusters found in different passes never share a name
        cluster_label = "Cluster {}, #{} Elements ".format(next_cluster_id, len(members))
        next_cluster_id += 1
        print("\n" + cluster_label)

        for sentence_id in members:
            print("\t", corpus_sentences[sentence_id])
            pass_keywords.append(corpus_sentences[sentence_id])
            pass_cluster_names.append(cluster_label)

    # keep the cumulative lists up to date
    corpus_sentences_list.extend(pass_keywords)
    cluster_name_list.extend(pass_cluster_names)

    # store this pass as its own frame (earlier passes are already in df_all)
    if pass_keywords:
        df_new = pd.DataFrame({'Cluster Name': pass_cluster_names, 'Keyword': pass_keywords})
        df_all.append(df_new)
    clustered_keywords.update(pass_keywords)

    corpus_set = corpus_set_all - clustered_keywords
    remaining = len(corpus_set)
    print("Total Unclustered Keywords: ", remaining)
    # no keyword was clustered in this pass: nothing more to gain
    if check_len == remaining:
        break

# When no cluster was found at all, provide an empty, correctly typed frame so that
# the concatenation and merge that follow still have something to work with
if not df_all:
    df_all.append(pd.DataFrame({'Cluster Name': pd.Series(dtype=object), 'Keyword': pd.Series(dtype=object)}))

In [26]:
# make a new dataframe from the list of dataframes and merge it back into the original df
df_new = pd.concat(df_all)
# Left merge on 'Keyword': every keyword of df is kept, and keywords without a cluster get
# a missing 'Cluster Name'. Only the first row recorded for each keyword is merged.
df = df.merge(df_new.drop_duplicates('Keyword'), how='left', on="Keyword")

In [27]:
# name every cluster after the keyword that comes first when its rows are ordered by length
df['Length'] = df['Keyword'].astype(str).str.len()
df = df.sort_values(by="Length")

first_keyword_per_cluster = df.groupby('Cluster Name')['Keyword'].transform('first')
df['Cluster Name'] = first_keyword_per_cluster
df.sort_values(['Cluster Name', "Keyword"], inplace=True)

# keywords that belong to no cluster share one placeholder name
df['Cluster Name'] = df['Cluster Name'].fillna("zzz_no_cluster")

# the helper column is not needed any more
df.drop(columns='Length', inplace=True)

In [28]:
# move the keyword column to the front, then the cluster column in front of it
for front_column in ("Keyword", "Cluster Name"):
    col = df.pop(front_column)
    df.insert(0, col.name, col)

# order the rows by cluster and, inside a cluster, by keyword
df.sort_values(["Cluster Name", "Keyword"], inplace=True)

In [29]:
# share of keywords left without a cluster, and its complement, both in percent
unclustered_share = remaining / count_rows
uncluster_percent = unclustered_share * 100
clustered_percent = 100 - uncluster_percent
print(clustered_percent,"% of rows clustered successfully!")

68.33333333333334 % of rows clustered successfully!


## Save the cluster

In [30]:
# Save the keywords with their cluster names in the current working directory
df.to_csv('keywords_clustered.csv', index=False)

# Creating heatmap

In [31]:
import pandas as pd

# Creating a list of unique cluster names from the DataFrame
unique_cluster_names = df['Cluster Name'].unique().tolist()

# Adding 'title' and 'publicationYear' to the beginning of the list
unique_cluster_names.insert(0, 'title')
unique_cluster_names.insert(1, 'publicationYear')

# Creating an empty DataFrame with columns from the unique_cluster_names list
df_heatMap = pd.DataFrame(columns=unique_cluster_names)

# Filling 'title' and 'publicationYear' columns from df_mergedKeys DataFrame.
# This also gives df_heatMap the same row index as df_mergedKeys.
df_heatMap['title'] = df_mergedKeys['title']
df_heatMap['publicationYear'] = df_mergedKeys['publicationYear']

# Filling all other columns with 0
for names in unique_cluster_names:
    if names not in ['publicationYear', 'title']:
        df_heatMap[names] = 0

# (keyword, cluster name) of every row of the clustered keyword table, read once
keyword_cluster_pairs = list(zip(df['Keyword'], df['Cluster Name']))

total = 0
# Every publication is counted exactly once, using its own row index. (Matching rows
# by title would add the counts of all publications that share a title to each of them.)
for index, merged_keywords in df_mergedKeys['merged_Keywords'].items():
    # a set gives constant-time membership tests
    publication_keywords = set(merged_keywords)
    for keyword, cluster_name in keyword_cluster_pairs:
        # Count the keyword for its cluster when the publication contains it
        if keyword in publication_keywords:
            df_heatMap.at[index, cluster_name] += 1
            total = total + 1

# The cluster columns start at position 2, right after 'title' and 'publicationYear',
# so the row total covers every cluster column, including the first one
df_heatMap['total'] = df_heatMap.iloc[:, 2:].sum(axis=1)
df_heatMap.to_csv('heatmap.csv', index=False)

In [32]:
# number of keywords of every publication
df_mergedKeys['sum'] = df_mergedKeys['merged_Keywords'].map(len)

In [33]:
# Two views of the heatmap table: cluster counts per title and per publication year.
# 'zzz_no_cluster' only exists when at least one keyword stayed unclustered, so a
# missing column is tolerated instead of raising a KeyError.
df_heatmap_title = df_heatMap.drop(columns=['publicationYear', 'total', 'zzz_no_cluster'], errors='ignore')
df_heatmap_pub = df_heatMap.drop(columns=['title', 'total', 'zzz_no_cluster'], errors='ignore')

# find the maximum value in the DataFrame (column 0 is the title, so it is left out)
max_value = df_heatmap_title.iloc[:, 1:].max().max()
# number of columns of the per-title table
df_heatmap_title.shape[1]

67

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

# One row per publication year / per title. No aggfunc is passed, so rows that share
# an index value are combined with pandas' default aggregation (the mean, per the pandas docs).
pivot_data_pub = df_heatmap_pub.pivot_table(index='publicationYear')
pivot_data_title = df_heatmap_title.pivot_table(index='title')
sns.set(font_scale=1.2)
# Two stacked panels: per publication year on top, per title below
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(40, 20))
# Both panels share one logarithmic colour scale from 0.01 up to max_value
sns.heatmap(pivot_data_pub, ax=ax1, cmap='coolwarm', norm=LogNorm(vmin=0.01, vmax=max_value))
sns.heatmap(pivot_data_title, ax=ax2, cmap='coolwarm', norm=LogNorm(vmin=0.01, vmax=max_value))

# Save the heatmaps as SVG images
fig.savefig('heatmap.svg', format='svg')

plt.show()


# Word cloud visualization of all clusters

In [None]:
# map every cluster name to the list of its keywords
cluster_dict = {}
for cluster_name, cluster_rows in df.groupby('Cluster Name'):
    cluster_dict[cluster_name] = cluster_rows['Keyword'].tolist()

# print the resulting dictionary
print(cluster_dict)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# create word cloud for each cluster. The loop uses its own variable names, so the
# notebook-level `clusters` and `keywords` objects of earlier cells are not overwritten
for cluster_title, cluster_keywords in cluster_dict.items():
    wordcloud = WordCloud(background_color="white", width=800, height=400).generate(" ".join(cluster_keywords))

    # display word cloud with cluster title
    plt.figure(figsize=(10, 20))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(cluster_title)
    # optional: also save each cloud as an SVG file
    #plt.savefig(f"{cluster_title}.svg", dpi=300, bbox_inches="tight", format="svg")
    plt.show()

# Semantic Graph

In [37]:
from gensim.models import KeyedVectors
# Load pre-trained word embedding model (binary word2vec format) from Google Drive
word_vectors = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ARDUOUS/word2vec.bin', binary=True)

In [None]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# Two keywords are connected when their cosine similarity exceeds this value
edge_threshold = 0.2

# Keep only purely alphanumeric keywords; entries that are not strings
# (for example missing values) are skipped
clean_keywords = [word for word in unique_values if isinstance(word, str) and word.isalnum()]

# Convert keywords to vectors; keywords unknown to the embedding model are
# collected in keyerror_list and left out of the graph
vectors = []
keyerror_list = []
known_keywords = []

for word in clean_keywords:
    try:
        vectors.append(word_vectors[word])
    except KeyError:
        keyerror_list.append(word)
    else:
        known_keywords.append(word)

# Only keywords with a vector remain, in the same order as `vectors`
clean_keywords = known_keywords

if not vectors:
    raise ValueError("None of the keywords has a vector in the word embedding model")

# Calculate pairwise cosine similarity: dot products divided by both vector norms
sims = np.dot(vectors, np.transpose(vectors))
sims /= np.linalg.norm(vectors, axis=1)[:, np.newaxis]
sims /= np.linalg.norm(vectors, axis=1)[np.newaxis, :]


# Create graph: one node per keyword, one weighted edge per sufficiently similar pair
G = nx.Graph()
for i, word in enumerate(clean_keywords):
    G.add_node(i, label=word)
    # only the columns j > i are inspected, so every pair is handled once
    for offset in np.flatnonzero(sims[i, i + 1:] > edge_threshold):
        j = i + 1 + int(offset)
        G.add_edge(i, j, weight=sims[i, j])

# Draw graph and save as SVG
pos = nx.spring_layout(G, seed=42, k=0.15, iterations=20)
nx.draw_networkx_nodes(G, pos, node_size=5, node_color='lightblue', linewidths = 1)
nx.draw_networkx_edges(G, pos, width=0.2, edge_color='red', arrows=True)
nx.draw_networkx_labels(G, pos, labels=nx.get_node_attributes(G, 'label'), font_size=0.001, font_color='black')
nx.draw_networkx_edge_labels(G, pos, edge_labels=nx.get_edge_attributes(G, 'weight'), font_size=0.00001, font_color='black')
plt.axis('off')
plt.savefig('semantic_graph.svg', format='svg', dpi=1200,bbox_inches='tight')