## Dataset cleaning and Meta Analysis


In [None]:
# Import libraries 
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sbs

# Plot defaults: 10x8 figures, seaborn configured with 'paper'
plt.rcParams.update({'figure.figsize': (10, 8)})
sbs.set('paper')

# Put the parent directory on sys.path (once) before importing litstudy
path = os.path.abspath('..')
if not any(entry == path for entry in sys.path):
    sys.path.append(path)
import litstudy

In [4]:
# Load the raw Scopus export and write a cleaned copy
df = pd.read_csv("scopus.csv")

# De-duplicate on DOI but keep every record that has no DOI.
# drop_duplicates('DOI') treats all missing DOIs as equal to each other and
# would therefore keep only the first DOI-less record.
missing_doi = df['DOI'].isna()
df = df[missing_doi | ~df.duplicated('DOI')]

# Drop export columns that the later cells do not reference
df = df.drop(['Author full names', 'Author(s) ID', 'Link', 'Source'], axis=1)
df.to_csv('cleanedscopus.csv', index=False)

In [5]:
# Load the cleaned dataset (cleanedscopus.csv) into df
df = pd.read_csv("cleanedscopus.csv")

In [None]:
# Citation-count distribution per publication year (2015-2024), one panel per year

# .copy() gives an independent frame, so adding the CitationRange column below
# does not write into a slice of df (avoids pandas' SettingWithCopyWarning).
df_filtered = df[(df['Year'] >= 2015) & (df['Year'] <= 2024)].copy()

#citation count ranges
citation_ranges = [(0, 0), (1, 5), (6, 10), (11, 20), (21, 30), (31, 50), (51, float('inf'))]
range_labels = ['0', '1-5', '6-10', '11-20', '21-30', '31-50', '51+']
colors = ['skyblue', 'lightgreen', 'salmon', 'gold', 'lightblue', 'orchid', 'lightcoral']

# Bin each paper's citation count. The edges sit 0.1 below each range's lower
# bound so that, with pd.cut's right-closed intervals, whole-number counts fall
# into the intended range.
bin_edges = [low - 0.1 for low, _ in citation_ranges] + [float('inf')]
df_filtered['CitationRange'] = pd.cut(df_filtered['Cited by'], bins=bin_edges, labels=range_labels)

#Papers by year
grouped = df_filtered.groupby('Year')

# Create subplots
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(12, 5))
flat_axes = axes.flatten()

# Bar charts for each year
for (year, data), ax in zip(grouped, flat_axes):
    citation_counts = data['CitationRange'].value_counts().sort_index()
    ax.bar(citation_counts.index, citation_counts, color=colors)
    ax.set_title(f' {year}', fontsize=10)
    ax.set_xlabel('Citation Range', fontsize=10)
    ax.set_ylabel('Number of Papers', fontsize=10)
    ax.tick_params(axis='both', which='major', labelsize=10)  # Increase tick label font size
    # Rotate the existing tick labels; set_xticklabels() would overwrite the
    # label texts without pinning the tick positions they belong to.
    ax.tick_params(axis='x', labelrotation=45)

# Hide panels that have no year to show (fewer than 10 years present in the data)
for ax in flat_axes[grouped.ngroups:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Number of publications per year, for papers with Year > 2015
df_filtered = df.loc[df['Year'] > 2015]

papers_per_year = (
    df_filtered
    .groupby('Year')
    .size()
    .reset_index(name='Paper_Count')
)

# One colour per bar, sampled from the Blues colormap
colors = plt.cm.Blues(np.linspace(0.3, 1, len(papers_per_year)))

# Bar chart of the yearly paper counts
plt.figure(figsize=(6, 4))
plt.bar(
    papers_per_year['Year'],
    papers_per_year['Paper_Count'],
    color=colors,
)
plt.xticks(papers_per_year['Year'])
plt.xlabel('Year')
plt.ylabel('Number of Papers')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Journals with the most papers (journal articles only)

# Single source of truth for the cut-off: the original took head(20) while the
# comment and the title both said "Top 10".
top_n = 10

# Filter journal articles
journal_df = df[df['Document Type'] == 'Article']

# Count papers per journal and keep the top_n most frequent
top_journals = journal_df['Source title'].value_counts().head(top_n)

fig, ax = plt.subplots(figsize=(8,6))
top_journals.plot(kind='bar', ax=ax)
ax.set_title(f'Top {top_n} Journals by Number of Papers')
ax.set_xlabel('Journal')
ax.set_ylabel('Number of Papers')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Conferences with the most papers (conference papers only)

# Single source of truth for the cut-off: the original took head(20) while the
# comment and the title both said "Top 10".
top_n = 10

# Filter conference papers
conf_df = df[df['Document Type'] == 'Conference paper']

# Count papers per conference and keep the top_n most frequent
top_confs = conf_df['Source title'].value_counts().head(top_n)
plt.figure(figsize=(8,6))
top_confs.plot(kind='bar')
plt.title(f'Top {top_n} Conferences by Number of Papers')
plt.xlabel('Conference')
plt.ylabel('Number of Papers')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Build a flat list of author-affiliation entries from the dataset

# Each cell holds several entries separated by ';': split, trim, drop blanks
author_affil_list = [
    piece.strip()
    for cell in df["Authors with affiliations"].dropna()
    for piece in cell.split(';')
    if piece.strip()
]

total_authors = len(author_affil_list)

print(f"Total number of authors: {total_authors}\n")
print("Sample author-affiliation pairs:")
for rank, sample in enumerate(author_affil_list[:5], start=1):
    print(f"{rank}. {sample}")

In [None]:
# getting top 10 most common countries
from collections import Counter

# Function to extract country
def extract_country(affiliation):
    """Return the text after the last comma of ``affiliation``, stripped.

    Naive heuristic: the country is taken to be the final comma-separated
    token of the affiliation string (the whole string when it has no comma).

    Returns None for non-string input such as None or NaN. An explicit type
    check replaces the former bare ``except:``, which would also have
    swallowed unrelated errors such as KeyboardInterrupt.
    """
    if not isinstance(affiliation, str):
        return None
    return affiliation.rpartition(',')[2].strip()
# Country of every author-affiliation entry, skipping empty/None results
countries = [c for c in map(extract_country, author_affil_list) if c]
country_counts = Counter(countries)

# Report the ten most frequent countries
top_10 = country_counts.most_common(10)
print("Top 10 Countries by Frequency:\n")
for name, n_entries in top_10:
    print(f"{name}: {n_entries}")

In [None]:
def extract_country(affiliation):
    """Return the text after the last comma of ``affiliation``, stripped.

    Naive heuristic: the country is taken to be the final comma-separated
    token of the affiliation string (the whole string when it has no comma).

    Returns None for non-string input such as None or NaN. An explicit type
    check replaces the former bare ``except:``, which would also have
    swallowed unrelated errors such as KeyboardInterrupt.
    """
    if not isinstance(affiliation, str):
        return None
    return affiliation.rpartition(',')[2].strip()
# Country of every author-affiliation entry, skipping empty/None results
countries = [c for c in map(extract_country, author_affil_list) if c]

country_counts = Counter(countries)
top_10 = country_counts.most_common(10)

# Unzip the (country, count) pairs into two parallel lists
country_names = [pair[0] for pair in top_10]
frequencies = [pair[1] for pair in top_10]

# One colour per bar, sampled from the Blues colormap from 1 down to 0.5
colors = plt.cm.Blues(np.linspace(1, 0.5, len(frequencies)))

# Horizontal bar chart with the most frequent country at the top
plt.figure(figsize=(5, 6))
plt.barh(country_names, frequencies, color=colors)
plt.gca().invert_yaxis()
plt.title('Top 10 Countries by Author Affiliation Frequency')
plt.xlabel('Number of Authors')
plt.ylabel('Country')
plt.tight_layout()
plt.show()

In [None]:
import networkx as nx
from itertools import combinations
import matplotlib.pyplot as plt

# ---- CONFIG ----
POSSIBLE_AFFIL_COLS = ["Authors with affiliations", "Affiliations", "Author Affiliations"]
SPLIT_CHAR = ';'            # separator between affiliations inside one cell
TOP_N_DEFAULT = 15          # upper bound on the number of nodes that get drawn

# ---- PICK THE AFFILIATION COLUMN SAFELY ----
# First candidate column that exists in df, or None when none of them does
affil_col = next((name for name in POSSIBLE_AFFIL_COLS if name in df.columns), None)
if affil_col is None:
    # No known column found: fall back to the first column and say so
    affil_col = df.columns[0]
    print(f"⚠️ Using first column '{affil_col}' as affiliations (expected one of {POSSIBLE_AFFIL_COLS}).")

# ---- OPTIONAL: AUTHOR–AFFIL LIST PREVIEW ----
# Rebuilt from scratch; stays empty when the column is absent
author_affil_list = []
if "Authors with affiliations" in df.columns:
    for cell in df["Authors with affiliations"].dropna():
        author_affil_list.extend(
            piece.strip()
            for piece in str(cell).split(SPLIT_CHAR)
            if piece.strip()
        )
    total_authors = len(author_affil_list)
    print(f"Total number of authors: {total_authors}\n")
    print("Sample author-affiliation pairs:")
    for rank, sample in enumerate(author_affil_list[:5], start=1):
        print(f"{rank}. {sample}")

# ---- COUNTRY EXTRACTION ----
def extract_country(affiliation: str) -> str | None:
    if not isinstance(affiliation, str) or not affiliation.strip():
        return None
    # naive heuristic: take the last comma-separated token
    parts = [p.strip() for p in affiliation.split(',') if p.strip()]
    return parts[-1] if parts else None

# ---- BUILD COUNTRY–COUNTRY COLLAB NETWORK ----
G = nx.Graph()

for record in df[affil_col].dropna():
    # Distinct countries named in this record
    record_countries = set()
    for chunk in str(record).split(SPLIT_CHAR):
        chunk = chunk.strip()
        if not chunk:
            continue
        country = extract_country(chunk)
        if country:
            record_countries.add(country)

    # Every unordered country pair of the record adds 1 to that edge's weight
    for left, right in combinations(sorted(record_countries), 2):
        seen_so_far = G.get_edge_data(left, right, default={}).get("weight", 0)
        G.add_edge(left, right, weight=seen_so_far + 1)

# Draw the collaboration network, restricted to the most connected countries.
if G.number_of_edges() == 0:
    print("Not enough collaborations to draw a network (need at least one edge).")
else:
    # pick top-N nodes by weighted degree
    N = min(TOP_N_DEFAULT, G.number_of_nodes())
    top_nodes = sorted(G.degree(weight='weight'), key=lambda x: x[1], reverse=True)[:N]
    top_nodes = [n for n, _ in top_nodes]
    # .copy() turns the subgraph view into an independent graph
    G_top = G.subgraph(top_nodes).copy()

    if G_top.number_of_edges() == 0:
        print(f"Top-{N} subgraph has no edges; consider increasing N or revising parsing.")
    else:
        # Line width grows with edge weight but never drops below 1.0
        edge_weights = [d.get('weight', 1) for _, _, d in G_top.edges(data=True)]
        width_scale = 0.25
        widths = [max(1.0, w * width_scale) for w in edge_weights]

        # layout & draw (fixed seed keeps the layout reproducible between runs)
        plt.figure(figsize=(9, 7))
        pos = nx.spring_layout(G_top, k=0.7, seed=42)

        nx.draw_networkx_nodes(
            G_top, pos, node_size=1200, node_color='lightblue',
            edgecolors='gray', linewidths=1.2
        )
        # edges scaled by weight (keeps it readable)
        nx.draw_networkx_edges(G_top, pos, width=widths, alpha=0.9)

        # Node labels are drawn by hand so each gets a white background box
        for node, (x, y) in pos.items():
            plt.text(
                x, y, node,
                fontsize=10, fontweight='bold',
                ha='center', va='center',
                bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.25')
            )

        plt.title("Top Countries Collaboration Network")
        plt.axis('off')
        plt.show()

# Abstract Preprocessing and Analysis with LDA


## Content analysis

In [None]:
import pandas as pd
import re
import spacy
import en_core_web_sm
import nltk
import gensim
from nltk.corpus import wordnet
import gensim.corpora as corpora
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Phrases, CoherenceModel
from gensim.models import TfidfModel, LdaModel, CoherenceModel
from gensim.utils import lemmatize
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Reload the cleaned dataset and collect the non-missing abstracts as a list
df = pd.read_csv('cleanedscopus.csv')
print(df.columns)
data = df['Abstract'].dropna().tolist()

In [None]:
def preprocess(text):
    """Lower-case ``text`` and blank out punctuation.

    Every character that is neither alphanumeric nor whitespace is replaced
    by a single space. Replacing instead of deleting keeps the parts of
    hyphenated or slash-joined terms separate ("privacy-preserving" becomes
    "privacy preserving"), so they can still be counted as individual words.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string, the same length as the input.
    """
    return ''.join(
        ch if ch.isalnum() or ch.isspace() else ' '
        for ch in text.lower()
    )

# Terms shown in the co-word heatmap. Each inner list is one word group
# (all groups here hold a single word).
topic_words = [
    ['network'],
    ['blockchain'],
    ['information'],
    ['systems'],
    ['users'],
    ['challenges'],
    ['security'], ['privacy'],
    ['management'],
    ['data'],
    ['integrity'],
    ['protection'],
    ['sharing'],
    ['process'],
    ['access']
]

# Generate the word co-occurrence matrix
def generate_co_word_heatmap(df, topic_words):
    """Plot a heatmap of pairwise co-occurrence scores for the given terms.

    Args:
        df: DataFrame with an 'Abstract' column. A 'processed_text' column
            holding the cleaned abstracts is added to it in place.
        topic_words: list of word lists; flattened and de-duplicated (first
            occurrence wins) to form the fixed vocabulary.

    The score of a term pair is the sum over abstracts of the product of the
    two terms' occurrence counts; the diagonal is set to 0.
    """
    # Local import: the file binds seaborn to `sbs`, so the `sns` name used
    # here originally was undefined and raised NameError.
    import seaborn as sns

    # Missing abstracts are NaN (a float); treat them as empty text so that
    # preprocess() does not fail on them.
    df['processed_text'] = df['Abstract'].fillna('').astype(str).apply(preprocess)

    vocabulary = list(dict.fromkeys(word for topic in topic_words for word in topic))
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    x_counts = vectorizer.fit_transform(df['processed_text'])
    terms = vectorizer.get_feature_names_out()

    # (terms x docs) @ (docs x terms) -> terms x terms score matrix
    co_occurrence = (x_counts.T @ x_counts).toarray()
    np.fill_diagonal(co_occurrence, 0)

    # Create a heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(co_occurrence, xticklabels=terms, yticklabels=terms, cmap="YlGnBu", annot=True, annot_kws={"size": 6})
    plt.title("Co-Word Heatmap for Selected Topics")
    plt.show()

# Generate the heatmap for the loaded dataset and the terms in topic_words
generate_co_word_heatmap(df, topic_words)

In [None]:
# Preprocessing of data

def preprocess_texts(texts):
    """Tokenize, phrase-merge and lemmatize a collection of documents.

    Args:
        texts: iterable of documents. Missing values (None / NaN) are
            treated as empty documents; other non-string values are
            converted with str().

    Returns:
        A list with one token list per input document, in input order.
    """
    def _as_text(doc):
        # str(NaN) is "nan", which would otherwise survive tokenization and
        # end up as a bogus word in every document with a missing abstract.
        if doc is None or (isinstance(doc, float) and doc != doc):
            return ""
        return str(doc)

    # 1. Tokenize & remove stopwords
    base = [
        [w for w in simple_preprocess(_as_text(doc), deacc=True)
         if w not in STOPWORDS]
        for doc in texts
    ]

    # 2. Build bigram & trigram models and apply them to every document
    bigram = Phrases(base, min_count=5, threshold=100)
    trigram = Phrases(bigram[base], threshold=100)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    phrased = [trigram_mod[bigram_mod[doc]] for doc in base]

    # 3. Lemmatization using gensim. Each result is decoded from bytes and
    #    everything from the first '/' on is dropped.
    # NOTE(review): gensim.utils.lemmatize was removed in gensim 4.0 and needs
    # the `pattern` package - confirm the gensim version this notebook targets.
    lemmatized = []
    for doc in phrased:
        lemmas = [w.decode('utf-8').split('/')[0] for w in lemmatize(" ".join(doc))]
        # Drop stopwords again and discard tokens of one or two characters
        lemmatized.append([w for w in lemmas if w not in STOPWORDS and len(w) > 2])

    return lemmatized

processed_abstracts = preprocess_texts(df['Abstract'])

# Vocabulary, bag-of-words corpus and its TF-IDF re-weighted version
dictionary = Dictionary(processed_abstracts)
corpus_bow = list(map(dictionary.doc2bow, processed_abstracts))
tfidf = TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

# Train LDA on the TF-IDF corpus
num_topics = 40
lda_settings = dict(passes=20, alpha='auto', eta='auto', random_state=42)
lda_model = LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=num_topics,
    **lda_settings,
)

# Score the model with c_v coherence against the tokenized abstracts
coherence_model = CoherenceModel(
    model=lda_model,
    texts=processed_abstracts,
    dictionary=dictionary,
    coherence='c_v',
)
print("Coherence Score (c_v):", coherence_model.get_coherence())

In [None]:
# Generate word clouds for the topics of an NMF topic model trained with litstudy
# and save them to cloud.pdf.
# NOTE(review): litstudy.build_corpus is documented as taking a DocumentSet; here it
# receives token lists - confirm this works with the installed litstudy version.
processed_abstracts = preprocess_texts(df['Abstract']) 
corpus = litstudy.build_corpus(processed_abstracts, ngram_threshold=0.45)
num_topics = 30
topic_model = litstudy.train_nmf_model(corpus, num_topics, max_iter=300)
# NOTE(review): if plot_topic_clouds opens its own figure, this figsize has no
# effect on the saved plot - confirm.
plt.figure(figsize=(60, 30))
litstudy.plot_topic_clouds(topic_model, ncols=5)
plt.savefig('cloud.pdf', bbox_inches='tight', pad_inches=0.1)

In [None]:
# Generating the trending topics graph for merged topics from the word cloud


# The topic assignment below reads the 'Title', 'Abstract' and 'Year' columns
# and adds new ones, so it needs the publication table itself. The previous
# preprocess_texts(...) call returned a plain list of token lists, which has
# neither .apply nor columns. Work on a copy so df stays untouched.
df_relevant = df.copy()

# Selected topics (chosen by coherence score) and their associated keywords.
# Keys are topic names; values are keyword lists. Multi-word keywords are
# written with underscores.
topics_keywords = {
    'Topic 1': ['medical', 'emrs', 'records', 'electronic_medical', 'medical_records'],
    'Topic 2': ['ehr', 'health', 'electronic_health', 'records', 'electronic'],
    'Topic 3': ['bitcoin', 'education', 'transactions', 'block', 'payment'],
    'Topic 4': ['hyperledger', 'fabric', 'hyperledger_fabric', 'permissioned', 'file'],
    'Topic 5': ['crowdsourcing', 'reputation', 'task', 'workers', 'mobile'],
    'Topic 6': ['detection', 'video', 'image', 'deep', 'intrusion'],
    'Topic 7': ['edge', 'edge_computing', 'computing', 'mec', 'resource'],
    'Topic 8': ['encryption', 'attribute', 'scheme', 'abe', 'attribute_encryption'],
    'Topic 9': ['access_control', 'access', 'control', 'policy', 'attribute'],
    'Topic 10': ['research', 'review', 'survey', 'future', 'applications'],
    'Topic 11': ['voting', 'electronic_voting', 'voter', 'election', 'electronic'],
    'Topic 12': ['supply', 'supply_chain', 'chain', 'food', 'product'],
    'Topic 13': ['learning', 'federated', 'federated_learning', 'training', 'model'],
    'Topic 14': ['authentication', 'cross', 'vanets', 'domain', 'cross_domain'],
    'Topic 15': ['iot', 'devices', 'iot_devices', 'internet_things', 'things'],
    'Topic 16': ['iiot', 'industry', 'industrial_internet', 'things_iiot', 'internet'],
    'Topic 17': ['healthcare', 'patient', 'health', 'records', 'care'],
    'Topic 18': ['vehicles', 'iov', 'internet_vehicles', 'vehicular', 'transportation'],
    'Topic 19': ['energy', 'energy_trading', 'renewable', 'evs', 'renewable_energy'],
    'Topic 20': ['grid', 'smart_grid', 'smart', 'power', 'aggregation'],
    'Topic 21': ['location', 'knowledge', 'zero', 'zero_knowledge', 'proof'],
    'Topic 22': ['home', 'smart_home', 'smart', 'iot', 'devices'],
    'Topic 23': ['contracts', 'smart_contracts', 'protocol', 'smart', 'transactions'],
    'Topic 24': ['iomt', 'medical', 'things_iomt', 'devices', 'healthcare'],
    'Topic 25': ['cloud', 'cloud_computing', 'computing', 'fog', 'storage'],
    'Topic 26': ['identity', 'identity_management', 'ssi', 'sovereign', 'self_sovereign'],
    'Topic 27': ['cities', 'smart_cities', 'smart', 'infrastructure', 'transportation'],
    'Topic 28': ['sharing', 'personal', 'information', 'management', 'trust'],
    'Topic 29': ['metaverse', 'virtual', 'social', 'virtual_world', 'reality'],
    'Topic 30': ['trading', 'energy_trading', 'electricity', 'peer', 'market']
}

# Manually curated merged topics: each key is a merged theme and each value
# lists the original topics that are folded into it.
merged_topics = {
    'Privacy of personal information in Healthcare': ['Topic 1', 'Topic 2', 'Topic 17',  'Topic 24', 'Topic 28', 'Topic 4', 'Topic 5', 'Topic 10'],
    'Identity Management in E-education and E-voting systems': ['Topic 11'],
    'Security Measures using IDS': ['Topic 6'],
    'Data privacy on Cloud and Edge computing': ['Topic 7', 'Topic 25'],
    'Access-Control and authentication management in supply chain using smart contracts': ['Topic 9','Topic 3', 'Topic 14', 'Topic 12', 'Topic 23', 'Topic 26'],
    'Advanced encryption protocols using ZKP': ['Topic 8', 'Topic 21'],
    'Privacy protection in Distributed Systems using FL': ['Topic 13'],
    'Security and privacy in IoT-driven systems and smart cities': ['Topic 15', 'Topic 16', 'Topic 18', 'Topic 22', 'Topic 27'],
    'Data privacy in Energy trading over smart grid': ['Topic 19', 'Topic 20', 'Topic 30'],
    'Privacy of real identity in Metaverse': ['Topic 29'],
}

# Function to assign topic to a publication
def assign_topic(title, abstract, topics_keywords):
    """Return the topic whose keywords occur most often in title + abstract.

    Args:
        title: publication title; ignored when it is not a string (e.g. NaN).
        abstract: publication abstract; ignored when it is not a string.
        topics_keywords: mapping of topic name -> list of keywords. Keywords
            written with underscores ('smart_grid') are matched as
            space-separated phrases ('smart grid'); the underscore form can
            never occur in raw text, so it used to score zero.

    Returns:
        The topic name with the highest total keyword count. Counting is by
        substring. Ties, including the case where no keyword occurs at all,
        go to the topic listed first in ``topics_keywords``.

    Raises:
        ValueError: if ``topics_keywords`` is empty.
    """
    text_parts = [part for part in (title, abstract) if isinstance(part, str)]
    # Normalise underscores on both sides so either spelling matches
    combined_text = " ".join(text_parts).lower().replace('_', ' ')

    topic_scores = {
        topic: sum(
            combined_text.count(keyword.lower().replace('_', ' '))
            for keyword in keywords
        )
        for topic, keywords in topics_keywords.items()
    }
    return max(topic_scores, key=topic_scores.get)

# Function to assign merged topic to a publication based on the original topic
def assign_merged_topic(assigned_topic, merged_topics):
    """Map an original topic to the merged topic that contains it.

    Returns the key of the first entry in ``merged_topics`` whose value
    contains ``assigned_topic``; when no entry does, ``assigned_topic`` is
    returned unchanged.
    """
    containing = (
        merged_name
        for merged_name, members in merged_topics.items()
        if assigned_topic in members
    )
    return next(containing, assigned_topic)

# Label every publication with its best-matching keyword topic
df_relevant['Assigned_Topic'] = df_relevant.apply(
    lambda record: assign_topic(record['Title'], record['Abstract'], topics_keywords),
    axis=1,
)

# Collapse the original topics into the merged themes
df_relevant['Merged_Topic'] = df_relevant['Assigned_Topic'].map(
    lambda topic: assign_merged_topic(topic, merged_topics)
)

# Keep publications from 2016 through 2024 (both bounds inclusive)
df_filtered = df_relevant[df_relevant['Year'].between(2016, 2024)]

# Publications per (year, merged topic); combinations that never occur count as 0
yearly_topic_counts = (
    df_filtered
    .groupby(['Year', 'Merged_Topic'])
    .size()
    .unstack(fill_value=0)
)

# Display the aggregated data
print(yearly_topic_counts)

# Visualization: one line per merged topic
plt.figure(figsize=(14, 8))
for topic_name, counts in yearly_topic_counts.items():
    plt.plot(counts.index, counts, marker='o', label=topic_name)

plt.title('Topic Trend Analysis Over the Years', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Publications', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Legend is placed outside the axes, to the right of the plot
plt.legend(title='Merged Topic', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
plt.tight_layout()
plt.savefig('trending.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [46]:
# ===== Dendrogram of Topics =====
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from typing import List


def topic_labels_from_lda(lda_model, dictionary, topn: int = 6) -> List[str]:
    """Build one space-joined label of top terms for every topic of the model.

    For each topic index below ``lda_model.num_topics`` the terms returned by
    ``lda_model.show_topic(index, topn=topn)`` are joined with spaces.
    ``dictionary`` is accepted but not used.
    """
    return [
        " ".join(term for term, _prob in lda_model.show_topic(topic_id, topn=topn))
        for topic_id in range(lda_model.num_topics)
    ]

def dendrogram_from_topic_matrix(topic_term_matrix: np.ndarray,
                                labels: List[str],
                                out_path: str = "dendrogram.pdf",
                                title: str = "Hierarchical Clustering Dendrogram",
                                leaf_font_size: int = 10,
                                figsize=(12, 16)):
    """
    Cluster topic vectors hierarchically and save the dendrogram to a file.

    topic_term_matrix: shape (n_topics, vocab_size), one topic vector per row.
    labels: n_topics human-readable topic labels, used as leaf labels.
    out_path: file the figure is written to.
    title, leaf_font_size, figsize: figure appearance.
    """
    # Scale every row to unit length; all-zero rows are divided by 1 instead
    # of 0 and therefore stay zero
    norms = np.linalg.norm(topic_term_matrix, axis=1, keepdims=True)
    safe_norms = np.where(norms == 0, 1.0, norms)
    unit_rows = topic_term_matrix / safe_norms

    # Average-linkage clustering on pairwise cosine distances
    tree = linkage(pdist(unit_rows, metric="cosine"), method="average")

    # Leaves on the right-hand side leave room for long labels
    plt.figure(figsize=figsize)
    dendrogram(
        tree,
        labels=labels,
        orientation='right',
        leaf_font_size=leaf_font_size,
        color_threshold=None,
        above_threshold_color='k',
    )
    plt.title(title)
    plt.xlabel("Distance")
    plt.ylabel("Topics")
    plt.tight_layout()
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0.1)
    plt.close()
    print(f"Saved dendrogram to {out_path}")

def labels_from_components(components: np.ndarray,
                           id2term: List[str],
                           topn: int = 6) -> List[str]:
    """Return one label per row of ``components``.

    A label is the space-joined ``id2term`` entries of the row's ``topn``
    largest values, largest first.
    """
    def _label(weights):
        # argsort is ascending: reverse it, then keep the first topn indices
        ranked = np.argsort(weights)[::-1][:topn]
        return " ".join(id2term[idx] for idx in ranked)

    return [_label(components[row]) for row in range(components.shape[0])]
try:
    # Vocabulary in id order, read from the `dictionary` variable
    id2term = [dictionary[i] for i in range(len(dictionary))]

    if 'topic_model' in globals():
        components = np.asarray(topic_model.components_)
        # Labels are looked up by column index, so the vocabulary must line up
        # with the matrix columns. A size mismatch would otherwise give
        # silently wrong labels or an IndexError deep inside the helper.
        if components.ndim != 2 or components.shape[1] != len(id2term):
            raise ValueError(
                f"components has shape {components.shape} but the dictionary "
                f"holds {len(id2term)} terms"
            )
        nmf_labels = labels_from_components(components, id2term, topn=6)
        dendrogram_from_topic_matrix(components, nmf_labels, out_path="dendrogram.pdf")
    else:
        # Previously this case was skipped without any message
        print("No 'topic_model' variable found; dendrogram was not created.")
except Exception as e:
    print("Could not derive NMF labels automatically. Ensure id2term matches components columns.\n", e)
