In [23]:
import os
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from pdfminer.high_level import extract_text
import pickle
from rake_nltk import Rake
import nltk
import shutil
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


False

In [11]:
num_clusters = 3

pdf_folder = 'Final_Dataset/'
pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith(".pdf")]
pdf_texts = []
pdf_labels = []
for pdf_file in pdf_files:
    pdf_text = extract_text(pdf_file)
    if pdf_text is not None:
        pdf_texts.append(pdf_text)
        pdf_labels.append(os.path.basename(pdf_file))

if pdf_texts:
    print(f"PDF texts extracted: {len(pdf_texts)}")
else:
    print("PDF error")

PDF texts extracted: 56


In [14]:
# Tokenize and preprocess text
tokenized_docs = [gensim.utils.simple_preprocess(text) for text in pdf_texts]



In [16]:
# Train a Word2Vec model
word2vec_model = Word2Vec(tokenized_docs, vector_size=300, window=10, min_count=2, workers=4, sg=1) #epochs=100)
with open('Models/word2vec_model.pkl', 'wb') as f:
    pickle.dump(word2vec_model, f)
    print("Doc2vec model created; word2vec_model.pkl created")

Doc2vec model created; word2vec_model.pkl created


In [19]:
# Cluster the documents using K-means clustering
# doc_vectors = [np.mean([word2vec_model.wv[token] for token in doc], axis=0) for doc in tokenized_docs]
keys = list(word2vec_model.wv.key_to_index.keys())
doc_vectors = []
for doc in tokenized_docs:
    vec = np.zeros(word2vec_model.vector_size)
    count = 0
    for word in doc:
        if word in keys:
            vec += word2vec_model.wv[word]
            count += 1
    if count != 0:
        vec /= count
    doc_vectors.append(vec)

kmeans_model = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=1)
kmeans_model.fit(doc_vectors)
with open('Models/kmeans_model_word2vec.pkl', 'wb') as f:
    pickle.dump(kmeans_model, f)
    print("Kmeans Doc2vec model creation successful; kmeans_model_word2vec.pkl created")

Kmeans Doc2vec model creation successful; kmeans_model_word2vec.pkl created


In [20]:
# Identify the category using centroid analysis
centroids = kmeans_model.cluster_centers_
similarity_matrix = cosine_similarity(doc_vectors, centroids)
category_labels = np.argmax(similarity_matrix, axis=1)
print(f"Category Labels: {category_labels}")

Category Labels: [1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 0 0 0 0 1 1
 1 1 0 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 0]


In [21]:
# Assign the document to the corresponding category
result_df = pd.DataFrame({'Document': pdf_files, 'Category': category_labels})

In [22]:
# Extract keywords from documents using RAKE
r = Rake(stopwords=nltk.corpus.stopwords.words('english'), min_length=1, max_length=3)
keywords = []
for text in pdf_texts:
    r.extract_keywords_from_text(text)
    keywords.append(r.get_ranked_phrases())
    
# Map the category labels to keywords
category_keywords = [keywords[label][0] for label in category_labels]
result_df['Category Keyword'] = category_keywords

In [25]:
result_path = "Final_Results/Bag_Words_Approach/"
# Create directories with names from category_keyword
for idx, row in result_df.iterrows():
    category_keyword = row['Category Keyword']
    folder_name = os.path.join(result_path, category_keyword)
    os.makedirs(folder_name, exist_ok=True)

# Copy the corresponding document in that folder
for idx, row in result_df.iterrows():
    source_file = row['Document']
    category_keyword = row['Category Keyword']
    folder_name = os.path.join(result_path, category_keyword)
    shutil.copy(source_file, folder_name)
