In [53]:
# pip install rake-nltk

In [64]:
import os
import numpy as np
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from pdfminer.high_level import extract_text
import pickle
import nltk
from rake_nltk import Rake
import shutil
# Fetch the NLTK resources requested by this notebook. The English stopword
# list is read through nltk.corpus.stopwords when the RAKE extractor is built.
# NOTE(review): the recorded output of this cell shows both downloads failing
# with an SSL certificate error and the final call evaluating to False, so the
# notebook presumably relies on a copy of the data that is already installed
# locally -- confirm the resources are available before running the later cells.
nltk.download('stopwords')
# presumably needed for sentence tokenisation inside rake-nltk -- TODO confirm
nltk.download('punkt')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


False

In [55]:
# Number of K-means clusters (document categories) to form.
num_clusters = 3

# Folder that is scanned (non-recursively) for input PDFs.
pdf_folder = 'Final_Dataset/'

# os.listdir returns entries in arbitrary order. Every later cell identifies a
# document by its position in this list, so sort it to keep document indices
# stable between runs and machines. The extension test is case-insensitive so
# files named "*.PDF" are picked up as well.
pdf_files = sorted(
    os.path.join(pdf_folder, f)
    for f in os.listdir(pdf_folder)
    if f.lower().endswith(".pdf")
)

# Text of each PDF, in the same order as pdf_files.
pdf_texts = [extract_text(pdf_file) for pdf_file in pdf_files]

if pdf_texts:
    print(f"PDF texts extracted: {len(pdf_texts)}")
else:
    print("PDF error")


PDF texts extracted: 56


In [56]:
# Build the Doc2Vec training corpus: each extracted text is tokenised with
# gensim's simple_preprocess and tagged with its position in pdf_texts.
documents = []
for doc_id, raw_text in enumerate(pdf_texts):
    tokens = gensim.utils.simple_preprocess(raw_text)
    documents.append(TaggedDocument(tokens, [doc_id]))

In [57]:
# Train a Doc2Vec model on the tagged documents.
# A fixed seed pins the random initialisation. NOTE: gensim documents that a
# fully deterministic run additionally needs workers=1 and a fixed
# PYTHONHASHSEED; workers=4 is kept here for training speed.
doc2vec_model = Doc2Vec(
    documents,
    vector_size=300,
    window=10,
    min_count=2,
    workers=4,
    epochs=100,
    seed=42,
)

# Make sure the output directory exists so the cell also works on a fresh
# checkout instead of failing with FileNotFoundError.
os.makedirs('Models', exist_ok=True)
with open('Models/doc2vec_model.pkl', 'wb') as f:
    pickle.dump(doc2vec_model, f)
# Report success only after the file has been written and closed.
print("Doc2vec model created; doc2vec_model.pkl created")

Doc2vec model created; doc2vec_model.pkl created


In [58]:
# Cluster the documents using K-means clustering.
# Each document is embedded by inferring a vector from its token list.
doc_vectors = [doc2vec_model.infer_vector(doc.words) for doc in documents]

# With n_init=1 there is a single k-means++ initialisation, so without a fixed
# random_state the resulting clusters (and every label derived from them)
# change from run to run. Pin the seed to make the assignment repeatable.
kmeans_model = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=42,
)
kmeans_model.fit(doc_vectors)

In [59]:
# Persist the fitted K-means model.
# Create the output directory first so this cell does not fail with
# FileNotFoundError when 'Models/' is missing.
os.makedirs('Models', exist_ok=True)
with open('Models/kmeans_model_doc2vec.pkl', 'wb') as f:
    pickle.dump(kmeans_model, f)
# Report success only after the file has been written and closed.
print("Kmeans Doc2vec model creation successful; kmeans_model_doc2vec.pkl created")

Kmeans Doc2vec model creation successful; kmeans_model_doc2vec.pkl created


In [60]:
# Identify the category of every document by comparing it with the cluster
# centroids: one row per document, one column per centroid.
centroids = kmeans_model.cluster_centers_
similarity_matrix = cosine_similarity(X=doc_vectors, Y=centroids)

# A document's category is the column holding its largest cosine similarity.
category_labels = similarity_matrix.argmax(axis=1)
print(f"Category Labels: {category_labels}")

Category Labels: [1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 2 0 1 1 1 1 1 1 1 1 0 1 2 1 2 1 1 2 1 1 1
 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1]


In [61]:
# Pair every PDF path with the category label computed for it.
result_df = pd.DataFrame({'Document': pdf_files}).assign(Category=category_labels)


In [62]:
# Extract keywords from documents using RAKE.
# keywords[i] holds the ranked phrases (1 to 3 words each) of pdf_texts[i].
r = Rake(stopwords=nltk.corpus.stopwords.words('english'), min_length=1, max_length=3)
keywords = []
for text in pdf_texts:
    r.extract_keywords_from_text(text)
    keywords.append(r.get_ranked_phrases())

# Map the category labels to keywords.
# `keywords` is indexed by document position, not by cluster id, so a cluster
# label must not be used as an index into it. Instead, each cluster is named
# after the top-ranked phrase of its most representative member: the document
# with the highest cosine similarity to that cluster's centroid. Members that
# produced no phrases (e.g. empty text) are skipped, and a generic name is used
# if no member of the cluster has any phrase.
cluster_keywords = {}
for label in np.unique(category_labels):
    member_ids = np.flatnonzero(category_labels == label)
    # Members of this cluster, most centroid-like first.
    by_similarity = member_ids[np.argsort(-similarity_matrix[member_ids, label])]
    cluster_keywords[int(label)] = next(
        (keywords[doc_id][0] for doc_id in by_similarity if keywords[doc_id]),
        f"category_{int(label)}",
    )

category_keywords = [cluster_keywords[int(label)] for label in category_labels]
result_df['Category Keyword'] = category_keywords
# result_df

In [63]:
# Root folder under which one sub-folder per category keyword is created.
result_path = "Final_Results/Doc2vec_Approach/"


def _safe_folder_name(keyword):
    """Return `keyword` reduced to characters that are safe in a directory name.

    Letters, digits, spaces, '_' and '-' are kept; every other character
    (path separators, dots, quotes, ...) becomes '_', so a keyword can never
    create nested directories or point outside `result_path`. Surrounding
    whitespace is stripped, and an empty result falls back to 'uncategorized'.
    """
    cleaned = "".join(
        ch if ch.isalnum() or ch in " _-" else "_" for ch in str(keyword)
    ).strip()
    return cleaned or "uncategorized"


# Copy every document into the folder named after its category keyword,
# creating the folder on first use. One pass over the two columns replaces the
# two separate iterrows() loops.
for source_file, category_keyword in zip(result_df['Document'], result_df['Category Keyword']):
    folder_name = os.path.join(result_path, _safe_folder_name(category_keyword))
    os.makedirs(folder_name, exist_ok=True)
    shutil.copy(source_file, folder_name)
