In [1]:
import sys
sys.path.insert(0, 'D:\\My Work\\Final Year Project\\Transformer\\')

In [2]:
import numpy as np
import pandas as pd
import nltk
import os
import warnings
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics import f1_score 
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
import pickle
import torch
import random

from utils.Encoder import Encoder

os.environ['OMP_NUM_THREADS'] = '1'
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.cluster._kmeans")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.feature_extraction.text")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [25]:
doc_content = []  # all the content in the document
doc_name = []  # name of the document
files_path = []  # path to the documents
lexical_chain = []  # list of lexical chains from each document
total_features = []  # total number of features. 1652
final_training_Features = []
corpus = []
doc_list_sequence = []

In [26]:
def reset_random_seeds(seed):
   os.environ['PYTHONHASHSEED']=str(seed)
   torch.manual_seed(seed)
   np.random.seed(seed)
   random.seed(seed)


def ReadDocuments(dir_name):
    for Path in os.listdir(dir_name):
        file_p = os.path.join(dir_name, Path)
        with open(file_p, "r") as file:
            FileContents = file.read()
            doc_content.append(FileContents.lower())
            doc_name.append(Path)
            files_path.append(file_p)

def Purity_Score(label_seq, pred_labels):
    # Calculate the confusion matrix to compare true labels and cluster assignments
    confusion = confusion_matrix(label_seq, pred_labels)
    # Calculate the purity
    purity = np.sum(np.max(confusion, axis=0)) / np.sum(confusion)
    return purity

def Evaluate(X, true_labels, predicted_labels):
    purity = Purity_Score(true_labels, predicted_labels)
    silhouette = silhouette_score(X, predicted_labels, metric='euclidean')
    ari = ari_score(true_labels, predicted_labels)
    nmi = nmi_score(true_labels, predicted_labels)
    
    print(f"Purity: {purity}")
    print(f"Silhouette Score: {silhouette}")
    print(f"ARI Score: {ari}")
    print(f"NMI Score: {nmi}")

def SaveFeatures(X, file_name):
    pickle_path = open(file_name, 'wb')
    pickle.dump(X, pickle_path)
    pickle_path.close()

def ReadFeatures(file_name):
    pickle_read = open(file_name, 'rb')
    x = pickle.load(pickle_read)
    pickle_read.close()
    return x

In [5]:
file_name = "DOC50_Features/DOC50_TFIDF_Features.pkl"
x = ReadFeatures(file_name)

In [6]:
x.shape

(50, 3885)

In [7]:
x = x.toarray()

In [8]:
x = torch.tensor(x, dtype=torch.float32)

In [9]:
x = x.reshape(shape=(1, x.size()[0], x.size()[1]))

In [10]:
x.size()

torch.Size([1, 50, 3885])

In [28]:
d_model = 3885
num_heads = 1
drop_prob = 0.1
batch_size = 1
max_sequence_length = 50
ffn_hidden = 2048
num_layers = 30

In [41]:
reset_random_seeds(42)
encoder = Encoder(d_model=d_model, ffn_hidden=ffn_hidden, num_heads=num_heads, drop_prob=drop_prob, num_layers=num_layers)

In [42]:
out = encoder(x)

x.size(): torch.Size([1, 50, 3885])
tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0265, 0.0000,  ..., 0.0339, 0.0000, 0.0339],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]])
qkv.size(): torch.Size([1, 50, 11655])
1, 50, 1, 3885
qkv.size(): torch.Size([1, 50, 1, 11655])
q.size(): torch.Size([1, 1, 50, 3885]), k.size(): torch.Size([1, 1, 50, 3885]), v.size(): torch.Size([1, 1, 50, 3885])
values.size(): torch.Size([1, 1, 50, 3885]), attention.size(): torch.Size([1, 1, 50, 50])
x.size(): torch.Size([1, 50, 3885])
tensor([[[-0.1081, -0.8454,  1.0193,  ..., -0.7751,  0.0484,  0.4439],
         [-0.3943,  1.0203,  0.8510,  ...,  1.1748,  0.1908,  1.3690],
         [ 0.0945, -0.4769,  1.2139,  ..., -0.3050,  0.1550,  0.4473],
    

In [14]:
def KMeans_Labels(X, n, rstate_limit, true_labels):
    print(X)
    # Specify the number of clusters (you can choose an appropriate value)
    num_clusters = n
    
    # find centoids which give maximum purity
    purity_collection = {}
    for i in range(rstate_limit):
        clusters = KMeans(n_init='auto', n_clusters=num_clusters, random_state=i, init='k-means++').fit(X).labels_
        purity_collection[i] = Purity_Score(true_labels, clusters)
    
    max_rand_state = max(purity_collection, key=purity_collection.get)
    print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

    # Create a KMeans model
    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, random_state=max_rand_state, init='k-means++')
    # Fit the KMeans model to the TF-IDF data
    kmeans.fit(X)
    # Get the cluster assignments for each document
    cluster_assignments = kmeans.labels_
    
    return cluster_assignments

def Actual_Labels():
    ReadDocuments(os.getcwd() + "\Doc50")
    actual_labels = {} # dictionary to store true assignments for each document | read sequence not followed
    label_path = os.getcwd() + '\\Doc50 GT\\'
    for labels_directory in os.listdir(label_path): # for each assignment folder
        actual_cluster = int(labels_directory[1]) # extract cluster label from directory name
        doc_labels = os.listdir(label_path + f"\\{labels_directory}") # for all document ids assigned to this cluster
        for doc in doc_labels:
            actual_labels[doc] = actual_cluster-1 # save cluster label
    
    label_seq = [] # save labels in order of documents read
    for doc in doc_name:
        label_seq.append(actual_labels[doc])
    return label_seq

def print_results(true_labels, predicted_labels, X):
    print("RESULTS:")
    print(f"Purity: {Purity_Score(true_labels, predicted_labels)}")
    print(f"Silhouette Score: {silhouette_score(X, predicted_labels)}")


def wrapperFunction():
    # ReadDocuments('Doc50')
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', preprocessor=custom_preprocessor)
    X = vectorizer.fit_transform(doc_content)

    SaveFeatures(X, 'DOC50_TFIDF_Features.pkl')
    
    true_labels = Actual_Labels()
    predicted_labels = KMeans_Labels(X, 5, 1500, true_labels)
    Evaluate(X, true_labels, predicted_labels)
    return predicted_labels, X


In [43]:
enhanced_x = out.detach().numpy()

In [44]:
enhanced_x[0].shape

(50, 3885)

In [45]:
doc_content = []  # all the content in the document
doc_name = []  # name of the document
files_path = []  # path to the documents
lexical_chain = []  # list of lexical chains from each document
total_features = []  # total number of features. 1652
final_training_Features = []
corpus = []
doc_list_sequence = []

In [46]:
true_labels = Actual_Labels()
pred_lables = KMeans_Labels(enhanced_x[0], 5, 700, true_labels)

[[-0.07043584 -0.4605227   0.3678442  ...  1.117732   -0.37223384
   0.02894631]
 [ 0.79605615 -0.6738356   0.48651254 ...  0.63497174  0.433036
  -0.7865405 ]
 [-0.13338403 -0.6083789  -0.11183209 ... -0.24193646 -1.0200937
  -0.51656073]
 ...
 [-0.35077035 -0.10367109 -0.7955016  ... -0.84621     0.03775709
  -2.2199671 ]
 [-0.641953   -1.2674987  -0.7170792  ... -0.22117291 -0.24665022
  -1.6835128 ]
 [-0.8173722  -0.8955151  -1.0286976  ... -1.082792    0.6256592
  -2.005139  ]]
Maximum purity of 0.78 found on random state 142


In [39]:
len(true_labels)

50

In [47]:
Evaluate(enhanced_x[0], true_labels, pred_lables)

Purity: 0.78
Silhouette Score: 0.01145158987492323
ARI Score: 0.6034407559970923
NMI Score: 0.7665762273140948
