In [93]:
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances


In [94]:
# Download the NLTK resources used below (a no-op when they are already up to date).
# nltk.word_tokenize needs the Punkt models: older NLTK releases load 'punkt',
# recent releases (3.9+) look up 'punkt_tab' instead, so both are requested.
# On a release that does not know a resource, nltk.download reports it and returns False.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
# Instantiate the tokenizer and the encoder from the same pre-trained BERT checkpoint
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [142]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # The third argument of str.maketrans names the characters to drop.
    return text.translate(str.maketrans('', '', string.punctuation))

def lowercase_text(text):
    """Return the result of ``text.lower()``."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Tokenize ``text`` with NLTK and drop English stop words.

    Tokens are compared case-insensitively against NLTK's English stop-word
    list; the surviving tokens are returned joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def get_bert_embeddings(text):
    """Encode ``text`` with the notebook-level BERT ``tokenizer`` and ``model``.

    Returns the last-layer hidden state at sequence position 0 (the [CLS]
    token) converted to a nested Python list, one inner list per input sequence.
    """
    # Tokenize into PyTorch tensors, with padding and truncation enabled
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Inference only, so gradient tracking is switched off for the forward pass
    with torch.no_grad():
        model_output = model(**encoded)
    # Keep only position 0 of every sequence; averaging over all token
    # positions would be an alternative sentence representation
    cls_vectors = model_output.last_hidden_state[:, 0, :]
    return cls_vectors.tolist()


In [143]:
# Load the input records from CSV, decoding the file as Latin-1
data = pd.read_csv(
    'FinalDataF.csv',
    encoding='latin-1',
)

In [140]:
def preprocess_and_get_embeddings(row):
    """Clean the title and abstract of one row and embed both with BERT.

    Each text has punctuation removed, is lowercased and has stop words
    removed before it is passed to ``get_bert_embeddings``.

    Returns a ``pd.Series`` with the entries 'Title_Embeddings' and
    'Abstract_Embeddings'.
    """
    column_pairs = (
        ('Title', 'Title_Embeddings'),
        ('Abstract', 'Abstract_Embeddings'),
    )
    embeddings = {}
    for source_column, target_column in column_pairs:
        raw_text = row[source_column]
        # Missing values are replaced by an empty string, which is still embedded
        text = "" if pd.isna(raw_text) else raw_text
        cleaned = remove_stop_words(lowercase_text(remove_punctuation(text)))
        embeddings[target_column] = get_bert_embeddings(cleaned)
    return pd.Series(embeddings)

# Preview the first rows of `data`.
# NOTE(review): the table captured below already shows embedding and cluster columns
# that are only added by later cells, so this output presumably stems from an
# out-of-order run — confirm with Restart & Run All.
data.head()

Unnamed: 0,Title,Abstract,Title_Embeddings,Abstract_Embeddings,Title_Cluster,Abstract_Cluster,Title_Cluster_Labels,Abstract_Cluster_Labels
0,A CASE STUDY OF DRIVER'S LICENSE PROCESSES ON ...,A Case Study on Driver's License Processes was...,"[[-0.05910889804363251, 0.26362770795822144, -...","[[-0.296672523021698, 0.3988223075866699, 0.31...",0,0,Cluster 0,Cluster 0
1,A CASE STUDY ON POULTRY EGG PRODUCTION BUSINESS,A Case Study on Poultry Egg Production Busines...,"[[-0.2676674425601959, 0.0976429432630539, -0....","[[-0.22526228427886963, 0.14748013019561768, 0...",0,4,Cluster 0,Cluster 0
2,DESIGN AND DEVELOPMENT OF A FIRE DETECTION AND...,A fire detection and alarm system prototype ba...,"[[-0.41528111696243286, -0.09206049889326096, ...","[[-0.302248477935791, 0.34940415620803833, 0.2...",1,2,Cluster 0,Cluster 0
3,"A KNOWLEDGE-BASED SYSTEM N MATCHING TREE, PLAN...","A Knowledge-Based System on Matching of Tree, ...","[[-0.7902029752731323, 0.06406101584434509, -0...","[[-0.29129481315612793, 0.33515429496765137, 0...",0,2,Cluster 0,Cluster 0
4,DESIGN AND DEVELOPMENT OF MEMORANDUM PRIORITIZ...,A memorandum is a means of inter-office corres...,"[[-0.7163332104682922, 0.11507435888051987, -0...","[[-0.695845901966095, 0.15343907475471497, 0.0...",0,0,Cluster 0,Cluster 0


In [100]:
# Run the preprocessing + embedding pipeline row by row; every row yields a
# 'Title_Embeddings' / 'Abstract_Embeddings' pair
embeddings_data = data.apply(preprocess_and_get_embeddings, axis='columns')


In [101]:
# Attach the embedding columns to the original DataFrame.
# Embedding columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not produce duplicate column names.
data = pd.concat(
    [data.drop(columns=embeddings_data.columns, errors='ignore'), embeddings_data],
    axis=1,
)


In [109]:
# Build the 2D (n_rows, n_features) arrays used for clustering from the embedding
# columns of `data`. Each cell holds the nested list returned by get_bert_embeddings,
# so the column is first stacked into an array and then flattened per row.
# (Previously these names were used without ever being defined, which fails on a fresh kernel.)
title_embeddings = np.array(data['Title_Embeddings'].tolist())
title_embeddings = title_embeddings.reshape(title_embeddings.shape[0], -1)
abstract_embeddings = np.array(data['Abstract_Embeddings'].tolist())
abstract_embeddings = abstract_embeddings.reshape(abstract_embeddings.shape[0], -1)



In [144]:
# Hierarchical clustering of the title embeddings with Ward linkage.
# Note: AgglomerativeClustering builds the hierarchy bottom-up (agglomerative), not divisively.
n_clusters = 5  # Adjust the number of clusters as needed
title_hierarchical_clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
title_cluster_labels = title_hierarchical_clustering.fit(title_embeddings).labels_


In [145]:
# Hierarchical clustering of the abstract embeddings with Ward linkage.
# Note: AgglomerativeClustering builds the hierarchy bottom-up (agglomerative), not divisively.
abstract_hierarchical_clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
abstract_cluster_labels = abstract_hierarchical_clustering.fit(abstract_embeddings).labels_


In [126]:
# Store the cluster ids of the title and abstract embeddings as DataFrame columns
for cluster_column, cluster_ids in (
    ('Title_Cluster', title_cluster_labels),
    ('Abstract_Cluster', abstract_cluster_labels),
):
    data[cluster_column] = cluster_ids


In [146]:
# Silhouette score plus summed intra-cluster (cohesion) and inter-cluster (separation)
# Euclidean distances for the title embeddings
silhouette_avg_title = silhouette_score(title_embeddings, title_cluster_labels)
pairwise_dist_title = pairwise_distances(title_embeddings, metric='euclidean')
cohesion_title = 0
separation_title = 0
for i in range(n_clusters):
    cluster_i_indices = (title_cluster_labels == i)
    # Select the rows of cluster i once and reuse them for both column selections
    cluster_i_rows = pairwise_dist_title[cluster_i_indices]
    # Every within-cluster pair appears twice in the distance matrix, hence the halving
    cohesion_title += cluster_i_rows[:, cluster_i_indices].sum() / 2.0
    # Not halved: a cross-cluster pair is added once for each of its two clusters
    separation_title += cluster_i_rows[:, ~cluster_i_indices].sum()


In [128]:
# Express cohesion and separation of the title embeddings per sample.
# The current values are divided in place, so this cell must run exactly once
# after the sums have been computed.
num_samples_title = len(title_embeddings)
cohesion_title = cohesion_title / num_samples_title
separation_title = separation_title / num_samples_title

In [148]:
def _cohesion_and_separation(pairwise_dist, cluster_labels, n_clusters):
    """Return per-sample cohesion and separation for one clustering.

    Args:
        pairwise_dist: square matrix of pairwise distances between the samples.
        cluster_labels: cluster id of every sample, aligned with the matrix rows.
        n_clusters: cluster ids ``0 .. n_clusters - 1`` are evaluated.

    Returns:
        ``(cohesion, separation)``, each divided by the number of samples.
        Cohesion counts every within-cluster pair once; separation adds a
        cross-cluster pair once for each of its two clusters.
    """
    # Coerce to an array so that `== cluster_id` is element-wise for list/Series input too
    cluster_labels = np.asarray(cluster_labels)
    cohesion = 0
    separation = 0
    for cluster_id in range(n_clusters):
        in_cluster = (cluster_labels == cluster_id)
        cluster_rows = pairwise_dist[in_cluster]
        cohesion += cluster_rows[:, in_cluster].sum() / 2.0  # Divide by 2 to avoid double counting
        separation += cluster_rows[:, ~in_cluster].sum()
    num_samples = len(pairwise_dist)
    return cohesion / num_samples, separation / num_samples


# Silhouette score, cohesion, and separation for title embeddings
silhouette_avg_title = silhouette_score(title_embeddings, title_cluster_labels)
pairwise_dist_title = pairwise_distances(title_embeddings, metric='euclidean')
num_samples_title = len(title_embeddings)
cohesion_title, separation_title = _cohesion_and_separation(
    pairwise_dist_title, title_cluster_labels, n_clusters
)

# Silhouette score, cohesion, and separation for abstract embeddings
silhouette_avg_abstract = silhouette_score(abstract_embeddings, abstract_cluster_labels)
pairwise_dist_abstract = pairwise_distances(abstract_embeddings, metric='euclidean')
num_samples_abstract = len(abstract_embeddings)
cohesion_abstract, separation_abstract = _cohesion_and_separation(
    pairwise_dist_abstract, abstract_cluster_labels, n_clusters
)

print(f"Silhouette Score for Title Embeddings: {silhouette_avg_title}")
print(f"Cohesion for Title Embeddings: {cohesion_title}")
print(f"Separation for Title Embeddings: {separation_title}")

print(f"Silhouette Score for Abstract Embeddings: {silhouette_avg_abstract}")
print(f"Cohesion for Abstract Embeddings: {cohesion_abstract}")
print(f"Separation for Abstract Embeddings: {separation_abstract}")

# Save the DataFrame with embeddings and clusters to a new CSV file
data.to_csv('thesis_dataset_with_embeddings_and_clusters.csv', index=False)


Silhouette Score for Title Embeddings: 0.18864330649375916
Cohesion for Title Embeddings: 330.10525384200247
Separation for Title Embeddings: 3014.6395970394738
Silhouette Score for Abstract Embeddings: 0.1572137176990509
Cohesion for Abstract Embeddings: 249.62188977693256
Separation for Abstract Embeddings: 2865.4959292763156


In [137]:
# Read back the CSV that holds the embeddings and cluster ids
data = pd.read_csv(
    'thesis_dataset_with_embeddings_and_clusters.csv',
)

In [133]:
# Pull the cluster id columns for titles and abstracts out of the DataFrame
title_cluster_labels, abstract_cluster_labels = (
    data['Title_Cluster'],
    data['Abstract_Cluster'],
)


In [134]:
def generate_cluster_labels(cluster_labels):
    """Map integer cluster ids to strings of the form 'Cluster <id>'.

    Returns a list with one label per entry of ``cluster_labels``, in order.
    """
    # np.bincount yields one bin per id from 0 up to the largest id present
    n_ids = len(np.bincount(cluster_labels))
    names_by_id = {}
    for cluster_id in range(n_ids):
        names_by_id[cluster_id] = f'Cluster {cluster_id}'
    return [names_by_id[cluster_id] for cluster_id in cluster_labels]

In [135]:
# Turn the numeric cluster ids of titles and abstracts into readable labels
title_cluster_labels, abstract_cluster_labels = (
    generate_cluster_labels(title_cluster_labels),
    generate_cluster_labels(abstract_cluster_labels),
)


In [136]:
# Attach the readable label columns, then write the result to a new CSV file
for label_column, readable_labels in (
    ('Title_Cluster_Labels', title_cluster_labels),
    ('Abstract_Cluster_Labels', abstract_cluster_labels),
):
    data[label_column] = readable_labels

data.to_csv('thesis_dataset_with_cluster_labels.csv', index=False)