In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import torch
import re
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt

In [2]:
# Download NLTK data if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

In [4]:
# Load a pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.to(device)  # Move the model to the GPU if available

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [5]:
# Check if GPU is available, and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Wrap the model in the DataParallel wrapper
model = torch.nn.DataParallel(model)

In [54]:
# Functions to remove numbers and double "##", remove punctuation, lowercase all text, remove stop words, stem words, and lemmatize words
def remove_numbers_and_double_hash(text):
  text = re.sub(r"[0-9]+", "", text)
  text = re.sub(r"##", "", text)
  return text

def remove_punctuation(text):
  return re.sub('[^a-zA-Z0-9]', ' ', text)

def lowercase_text(text):
  return text.lower()

def remove_stop_words(text):
  stop_words = set(nltk.corpus.stopwords.words('english'))
  return [word for word in text if word not in stop_words]

def stem_words(text):
  stemmer = nltk.PorterStemmer()
  return [stemmer.stem(word) for word in text]

def lemmatize_words(text):
  lemmatizer = nltk.WordNetLemmatizer()
  return [lemmatizer.lemmatize(word) for word in text]

# Functions to preprocess text and get BERT embeddings
def preprocess_text(text):
  # Add more preprocessing steps as needed
  text = remove_numbers_and_double_hash(text)
  text = remove_punctuation(text)
  text = lowercase_text(text)
  text = remove_stop_words(text)
  text = stem_words(text)
  text = lemmatize_words(text)
  return text

def get_bert_embeddings(text):
  # This function should be defined with the appropriate model and tokenizer setup.
  # Make sure to define and load the model and tokenizer before calling this function.
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
  with torch.no_grad():
    outputs = model(**inputs)
  embeddings = outputs.last_hidden_state[:, 0, :]
  return embeddings.cpu().numpy()

# Functions to handle both pre-processing and BERT embeddings
def preprocess_and_get_embeddings(row):
  # Get text from the row
  text = row['Text']

  # Preprocess text
  preprocessed_text = preprocess_text(text)

  # Get BERT embeddings
  embeddings = get_bert_embeddings(preprocessed_text)

  return embeddings

In [55]:
# Define a function to display BERT embeddings and preprocessed text
def display_bert_embedding_and_preprocessing(data, row_idx):
  # Get BERT embedding for the specified row
  embedding = data.iloc[row_idx, -768:]

  # Get preprocessed text for the specified row
  preprocessed_text = data.iloc[row_idx]['Text']

  # Display BERT embedding
  print("BERT Embeddings:")
  print(embedding)

  # Display preprocessed text
  print("\nPreprocessed Text:")
  print(preprocessed_text)

In [56]:
# Read the CSV file
data = pd.read_csv('DataFinal_1.csv', encoding='latin-1')

In [57]:
# Print the list of column names
print(data.columns)


Index(['ï»¿Title', 'Abstract'], dtype='object')


In [63]:
# Merge the 'Title' and 'Abstract' columns into a single column called 'Text'
data['Text'] = data['ï»¿Title'].fillna('') + ' ' + data['Abstract'].fillna('')

In [64]:
# Process and obtain embeddings for each row
embeddings = data.apply(preprocess_and_get_embeddings, axis=1)
embeddings = np.vstack(embeddings.to_numpy())  # Convert to a NumPy array

TypeError: expected string or bytes-like object

In [60]:
# Concatenate the embeddings with the original DataFrame
data = pd.concat([data, pd.DataFrame(embeddings)], axis=1)

In [61]:
# Check the shape of text_embeddings
#text_embeddings = embeddings  # Assign the embeddings to text_embeddings
print(embeddings.shape)  # Should be (number_of_samples, embedding_dimension)

(251014, 768)


In [62]:
# Choose a row to display (change row_idx to the desired row)
row_idx = 200 # Change this to the index of the row you want to display
display_bert_embedding_and_preprocessing(data, row_idx)

KeyError: 'text'

In [27]:
# Save the DataFrame to a CSV file
data.to_csv('sample_data.csv', index=False)

In [55]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import fcluster, linkage

In [60]:
def perform_cluster_analysis(embeddings, linkage_matrix, min_clusters, max_clusters, word_list):

    results = pd.DataFrame(columns=['Num_Clusters', 'Silhouette_Score', 'Cohesion_Score', 'Separation_Score', 'Cluster_Assignments'])

    for num_clusters in range(min_clusters, max_clusters + 1):
        # Perform hierarchical clustering and assign clusters
        cluster_assignments = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

        # Calculate silhouette score
        silhouette_avg = silhouette_score(embeddings, cluster_assignments)

        # Calculate cohesion and separation scores
        cluster_centers = []
        max_cluster_size = 0

        for cluster_id in range(1, num_clusters + 1):
            cluster_indices = np.where(cluster_assignments == cluster_id)[0]
            cluster_words = [word_list[i] for i in cluster_indices]
            cluster_centers.append(cluster_words)

            # Track the maximum cluster size
            max_cluster_size = max(max_cluster_size, len(cluster_indices))

        # Pad shorter clusters with a placeholder value (e.g., '')
        cluster_centers = [cluster + [''] * (max_cluster_size - len(cluster)) for cluster in cluster_centers]

        # Convert the cluster_centers list to a two-dimensional NumPy array
        cluster_centers = np.array(cluster_centers, dtype='object')

        # Replace '' with a placeholder numeric value (e.g., 0) for numerical calculations
        cluster_centers[cluster_centers == ''] = np.NAN

        # Convert the dtype to a numeric type
        cluster_centers = cluster_centers.astype(float, casting='unsafe')

        # Calculate pairwise distances using squareform to handle the new numeric placeholder
        pairwise_distances = pdist(cluster_centers, metric='euclidean')
        pairwise_distances = squareform(pairwise_distances)

        cohesion_score = np.mean(pairwise_distances)
        separation_score = np.min(pairwise_distances)

        # Create a new row for the results DataFrame
        new_row = {'Num_Clusters': num_clusters, 'Silhouette_Score': silhouette_avg, 'Cohesion_Score': cohesion_score, 'Separation_Score': separation_score, 'Cluster_Assignments': cluster_assignments}

        # Append the new row to the results DataFrame
        results = results.append(new_row, ignore_index=True)

    return results


In [61]:
# Tokenize the text in your dataset
stop_words = set(stopwords.words('english'))

data['Tokens'] = data['Text'].apply(lambda x: [token for token in tokenizer.tokenize(x) if token not in stop_words])

# Create a list of words based on your tokens
word_list = [word for tokens in data['Tokens'] for word in tokens]

# Calculate the pairwise distances between the data points
pairwise_distances = pdist(embeddings, metric='euclidean')

# Perform hierarchical clustering using DIANA
linkage_matrix = linkage(pairwise_distances, method='ward', metric='euclidean')

# Example usage
min_clusters = 2
max_clusters = 10

# Assume embeddings and linkage_matrix are defined
results = perform_cluster_analysis(embeddings, linkage_matrix, min_clusters, max_clusters, word_list)

# Print the cluster analysis results
print(results.to_string())

# Choose the optimal cluster assignments
cluster_assignments = results.loc[(results['Cohesion_Score'] > 0.1) & (results['Separation_Score'] > 0.2), 'Cluster_Assignments'].values[0]

# Add cluster assignments to the DataFrame
data['Cluster'] = cluster_assignments

# Print the cluster analysis results
print(results.to_string())

ValueError: could not convert string to float: 'driver'