In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch
import re
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt

In [2]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

In [3]:
# Load a pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.to(device)  # Move the model to the GPU if available

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
# Check if GPU is available, and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Wrap the model in the DataParallel wrapper
model = torch.nn.DataParallel(model)

In [6]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Function to remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    text_without_punct = text.translate(translator)
    return re.sub(r'\s+', ' ', text_without_punct).strip()  # Replace consecutive spaces with a single space


# Function to lowercase text
def lowercase_text(text):
    return text.lower()

# Function to lemmatize text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    lemmatized_words = []
    for word in word_tokenize(text):
        if word not in stop_words:
            lemmatized_words.append(lemmatizer.lemmatize(word))
        else:
            lemmatized_words.append(word)  # Keep stopwords unchanged
    return ' '.join(lemmatized_words)

# Function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to preprocess text
def preprocess_text(text):
    # Add more preprocessing steps as needed
    text = remove_punctuation(text)
    text = lowercase_text(text)
    text = remove_stop_words(text)
    text = lemmatize_text(text)
    return text

# Function to get BERT embeddings
def get_bert_embeddings(text):
    # This function should be defined with the appropriate model and tokenizer setup.
    # Make sure to define and load the model and tokenizer before calling this function.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.cpu().numpy()


In [8]:
# Define a function to display BERT embeddings and preprocessed text
def display_bert_embedding_and_preprocessing(data, row_idx):
    # Get BERT embeddings for the specified row
    embedding = data.iloc[row_idx, -768:]  # Assuming the embeddings have 768 dimensions
    preprocessed_text = data.iloc[row_idx]['Text']

    # Display BERT embeddings
    print("BERT Embeddings:")
    print(embedding)
    
    # Display preprocessed text
    print("\nPreprocessed Text:")
    print(preprocessed_text)
    
def preprocess_and_get_embeddings(row):
    text = row.get('Text', '')  # Use get() to handle missing values
    embeddings = get_bert_embeddings(text)
    return embeddings

In [39]:
# Read the CSV file
data = pd.read_csv('DataFinal_1.csv', encoding='latin-1')

# Print the list of column names
print(data.columns)

Index(['ï»¿Title', 'Abstract'], dtype='object')


In [40]:
# Merge the 'Title' and 'Abstract' columns into a single column called 'Text'
data['Text'] = data['ï»¿Title'] + ',' + data['Abstract']

In [41]:
# Assuming 'data' is your DataFrame
data['Text'] = data['Text'].apply(preprocess_text)

In [60]:
# Process and obtain embeddings for each row
embeddings = data.apply(preprocess_and_get_embeddings, axis=1)
embeddings = np.vstack(embeddings.to_numpy())  # Convert to a NumPy array


In [61]:
# Concatenate the embeddings with the original DataFrame
data = pd.concat([data, pd.DataFrame(embeddings)], axis=1)

In [62]:
# Check the shape of text_embeddings
#text_embeddings = embeddings  # Assign the embeddings to text_embeddings
print(embeddings.shape)  # Should be (number_of_samples, embedding_dimension)

(326,)


In [45]:
# Choose a row to display (change row_idx to the desired row)
row_idx = 200 # Change this to the index of the row you want to display
display_bert_embedding_and_preprocessing(data, row_idx)

BERT Embeddings:
0     -0.208345
1      0.235173
2      0.290928
3      0.064327
4      0.105875
         ...   
763     -0.0644
764   -0.435931
765   -0.227162
766    0.120141
767    0.546278
Name: 200, Length: 768, dtype: object

Preprocessed Text:
medical supply inventory distribution system pnp hospitalthe pharmacy pnp regional xiii health service hospital us traditional way inventory medical supply equipment performing daily transaction paper pen used recording supply thus result poor inventory management product availability monitoring deemed necessary response pnp working environment medical supply inventory system pnp hospital designed help pnp hospital improve staff work efficiency computerizing part business process automation inventory monitoring medicine product done properly cope high demand pnp keywords information system inventory inventory system laravel postgresql


In [46]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics import silhouette_score


In [57]:
# Computes the distance matrix between all pairs of data points.
def compute_distance_matrix(data):
    distance_matrix = np.zeros((data.shape[0], data.shape[0]))
    for i in range(data.shape[0]):
        for j in range(i + 1, data.shape[0]):
            distance_matrix[i][j] = np.linalg.norm(data[i] - data[j])
    return distance_matrix

# Performs DIANA hierarchical clustering.

def perform_diana_clustering(distance_matrix):
    linkage_matrix = linkage(distance_matrix, method="ward")
    cluster_assignments = np.array(dendrogram(linkage_matrix)["leaves"])
    return cluster_assignments

def calculate_silhoutte_score(embeddings, cluster_assignments):
    silhouette_score = silhouette_score(embeddings, cluster_assignments)
    return silhouette_score

def calculate_cohesion(embeddings, cluster_assignments):
    cohesion_score = 0
    for cluster_id in range(1, np.max(cluster_assignments) + 1):
      cluster_indices = np.where(cluster_assignments == cluster_id)[0]
      cluster_embeddings = embeddings[cluster_indices]

      cluster_centroid = np.mean(cluster_embeddings, axis=0)

      cohesion_score += np.sum(np.linalg.norm(cluster_embeddings - cluster_centroid))

    return cohesion_score

def calculate_separation(embeddings, cluster_assignments):
    separation_score = 0
    for cluster_id in range(1, np.max(cluster_assignments) + 1):
      cluster_indices = np.where(cluster_assignments == cluster_id)[0]
      cluster_embeddings = embeddings[cluster_indices]

      cluster_centroid = np.mean(cluster_embeddings, axis=0)

      other_cluster_indices = np.where(cluster_assignments != cluster_id)[0]
      other_cluster_embeddings = embeddings[other_cluster_indices]

      separation_score += np.sum(np.linalg.norm(cluster_embeddings - other_cluster_embeddings))

    return separation_score

In [58]:
# Print unique values in each column
for column in data.columns:
    print(f"Unique values in {column}: {data[column].unique()}")


Unique values in ï»¿Title: ["A CASE STUDY OF DRIVER'S LICENSE PROCESSES ON LAND TRANSPORTATION OFFICE"
 'A CASE STUDY ON POULTRY EGG PRODUCTION BUSINESS'
 'DESIGN AND DEVELOPMENT OF A FIRE DETECTION AND ALARM SYSTEM PROTOTYPE BASED ON A WIRELESS SENSOR NETWORK'
 'A KNOWLEDGE-BASED SYSTEM N MATCHING TREE, PLANT AND LEGUMES ON MINERAL-BEARING SITE'
 'DESIGN AND DEVELOPMENT OF MEMORANDUM PRIORITIZATION SYSTEM A DSS TOOL FOR SANGGUNIANG PANLALAWIGAN MANAGEMENT'
 'WEB-BASED STUDENT SUBJECT EVALUATION SYSTEM FOR NORMISIST'
 'COMPUTERIZATION OF ENTRANCE EXAM OF NORMISIST GUIDANCE OFFICE'
 'A SYSTEM DESIGN FOR UNIFIED TVET PROGRAM REGISTRATION MANAGEMENT'
 'MICROFINANCE COLLECTION MONITORING SYSTEM AND CLIENT MAPPING WITH ANDROID APPLICATION'
 'ABACADA DIAGNOSTIC AGENT IN PEST MANAGEMENT'
 'WEB-BASED SCHOLARSHIP FINDER IN CARAGA STATE UNIVERSITY'
 'AGRICULTURAL COMPREHENSIVE ASSESSMENT OF LANDSCAPE AND MODELING FOR SUITANABILITY ANALYSIS AND FORECASTING EVENTS (CALM-SAFE AGRICULTURE) PROGRAM: 

In [59]:
# Convert the data to a NumPy array.
embeddings = data.to_numpy()

In [56]:


# Perform DIANA hierarchical clustering.
cluster_assignments = perform_diana_clustering(embeddings)

# Calculate the silhouette score, cohesion, and separation
silhouette_score = calculate_silhoutte_score(embeddings, cluster_assignments)
cohesion_score = calculate_cohesion(embeddings, cluster_assignments)
separation_score = calculate_separation(embeddings, cluster_assignments)

# Print the results
print("Silhouette Score:", silhouette_score)
print("Cohesion Score:", cohesion_score)
print("Separation Score:", separation_score)


ValueError: could not convert string to float: "A CASE STUDY OF DRIVER'S LICENSE PROCESSES ON LAND TRANSPORTATION OFFICE"