In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Downloading Required Packages

In [None]:
!pip install pytesseract pdf2image pillow opencv-python matplotlib pymupdf && \
pip install nltk PyPDF2 pdfplumber pycryptodome && \
pip install pdfplumber && \
pip install nltk sentence-transformers scikit-learn && \
pip install python-Levenshtein

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pymupdf
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.10 (from pymupdf)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytesseract, P


# Project IA-Cognition

This notebook is part of the **A content based recommender system on Risk Mangement** project. The goal of this project is to analyze textual data, extract meaningful features, and evaluate the relevance of extracted concepts using various natural language processing techniques. The project includes cleaning data, extracting attributes and relations from text, calculating term frequencies, and evaluating the performance of feature extraction.

This notebook is organized as follows:
1. **Data Extraction**
2. **Data Segmentation**
3. **Concept and Relationship Identification**
4. **Process Evaluation**


# Imports

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# PDF Processing
import pdfplumber
import fitz
from pdf2image import convert_from_path

# Image Processing
from PIL import Image, ImageEnhance, ImageFilter
import cv2
import pytesseract

# Text Processing
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import RegexpParser
from nltk.probability import FreqDist
from collections import Counter, defaultdict

# Natural Language Processing
import spacy
from sentence_transformers import SentenceTransformer
from nltk.parse.corenlp import CoreNLPDependencyParser

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

# String Distance
import Levenshtein
from Levenshtein import distance as levenshtein_distance

# Concurrent Processing
import concurrent.futures

# Visualization
import matplotlib.pyplot as plt


  from tqdm.autonotebook import tqdm, trange



# Data Extraction



In [None]:
# Download NLTK data (if not already downloaded)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)


True

In [None]:

# Define the path to your PDF file
pdf_path = '/content/drive/MyDrive/PI/practice-standard-project-risk-management.pdf'

In [None]:
# Initialize a list to store text from all pages
extracted_text = []

In [None]:
nltk.download('stopwords')

# Load the default English stopwords
stop_words = set(stopwords.words('english'))

# Define your custom stop words
custom_stop_words = {
    # Articles
    'a', 'an', 'the',

    # Prepositions
    'in', 'on', 'at', 'by', 'for', 'with', 'of',

    # Conjunctions
    'and', 'or', 'but', 'if',

    # Pronouns
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'this', 'that',

    # Common Verbs
    'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'do', 'does',

    # Adverbs
    'very', 'so', 'just',

    # Quantifiers
    'all', 'some', 'many', 'most',

    # Additional Words
    'other', 'chapter', 'appendix',

    # Single Letters
    'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    # Specific Words and Numbers
    '2009', 'to', 'can', 'may', 'of','.'
}

# Update the NLTK stopwords with the custom stopwords
stop_words.update(custom_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
    # Normalize unicode characters (remove accents)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
    # Convert text to lowercase
    text = text.lower()
    # Remove ellipses (sequences of three or more periods)
    text = re.sub(r'\.{3,}', ' ', text)
    # Remove any other unwanted patterns
    # Remove digits
    # text = re.sub(r'\d+', '', text)
    # Remove punctuation and special characters except spaces
    text = re.sub(r'[^\w\s]', '', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Removes any non-ASCII characters
    # Strip leading and trailing spaces
    text = text.strip()

    return text

In [None]:

start_page = 13  # Start from page 13
end_page = 118   # End at page 117 (inclusive)
# Open the PDF file
with pdfplumber.open(pdf_path) as pdf:
    # Iterate through each page

    for page_number, page in enumerate(pdf.pages[start_page - 1:end_page], start=start_page):## to verify
        text = page.extract_text()
        if text:
            # Clean the text
            text = clean_text(text)
            # Tokenize the text into words
            tokens = word_tokenize(text)
            # Lemmatize the tokens
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
            # Remove stop words
            tokens_without_stopwords = [word for word in tokens if word not in stop_words]
            # Reconstruct cleaned text
            cleaned_text = ' '.join(tokens_without_stopwords)
            # Append the cleaned text to the list
            extracted_text.append(cleaned_text)
        else:
            print(f"--- Page {page_number} ---\nNo text found on this page.\n")

# Now you can join all the page texts into a single string
full_text = ' '.join(extracted_text)

# Tokenize the text into sentences
sentences = sent_tokenize(full_text)

# (Optional) Print the sentences
for idx, sentence in enumerate(sentences, start=1):
    print(f"Sentence {idx}: {sentence}\n")


--- Page 20 ---
No text found on this page.

--- Page 30 ---
No text found on this page.

--- Page 36 ---
No text found on this page.

--- Page 42 ---
No text found on this page.

--- Page 48 ---
No text found on this page.

--- Page 68 ---
No text found on this page.

--- Page 70 ---
No text found on this page.

--- Page 80 ---
No text found on this page.

--- Page 118 ---
No text found on this page.




# Data Segmentation

NLTK’s lemmatizer requires WordNet POS tags for more accurate results due to the following reasons:

NLTK POS Tags: These tags, generated by NLTK's pos_tag function, classify words into parts of speech such as nouns, verbs, adjectives, and adverbs (e.g., NN for nouns, VB for verbs).

WordNet Lemmatizer: To effectively lemmatize words, it is crucial to inform the lemmatizer of each word's part of speech. NLTK's WordNetLemmatizer performs optimally when provided with WordNet's POS tags (e.g., wordnet.NOUN, wordnet.VERB).

In summary, this step prepares the environment for extracting, processing, and lemmatizing text, which are fundamental tasks in NLP workflows. This organization ensures that the correct POS tags are utilized, allowing the lemmatized text to be stored for future use.

In [None]:
def get_wordnet_pos(nltk_pos_tag):
    """Map NLTK POS tags to WordNet POS tags."""
    if nltk_pos_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif nltk_pos_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif nltk_pos_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif nltk_pos_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN  # Default to noun

In [None]:
# Initialize a list to store all lemmatized tokens with POS tags
all_lemmatized_pos_tags = []

In [None]:
extracted_text = []

In [None]:
# Open the PDF file
tokens=[]
# Initialize a list to store text from all pages
extracted_text = []
with pdfplumber.open(pdf_path) as pdf:
    # Iterate through each page
    for page_number, page in enumerate(pdf.pages[start_page - 1:end_page], start=start_page):
        text = page.extract_text()
        if text:
            # Clean the text
            text = clean_text(text)
            extracted_text.append(text)
            # Tokenize the text into words
            tokens = word_tokenize(text)
            # Remove stop words
            tokens = [token for token in tokens if token not in stop_words]
            # Perform POS tagging
            pos_tags = nltk.pos_tag(tokens)
            # Lemmatize tokens with POS tags
            lemmatized_tokens = []
            for token, tag in pos_tags:
                wordnet_pos = get_wordnet_pos(tag)
                lemmatized_token = lemmatizer.lemmatize(token, wordnet_pos)
                lemmatized_tokens.append((lemmatized_token, tag))
            # Collect all lemmatized tokens with POS tags
            all_lemmatized_pos_tags.extend(lemmatized_tokens)
        else:
            pass  # Skip pages with no text

In [None]:
# Create a DataFrame from all_lemmatized_pos_tags
df_tokens = pd.DataFrame(all_lemmatized_pos_tags, columns=['Word', 'POS Tag'])

# Calculate frequency of each word and POS Tag
df_freq = df_tokens.groupby(['Word', 'POS Tag']).size().reset_index(name='Frequency')

# Sort by frequency
df_freq = df_freq.sort_values(by='Frequency', ascending=False)

# Rearrange columns
df_freq = df_freq[['Word', 'POS Tag', 'Frequency']]

# Display combined table
print("Combined Table of Words with POS Tags:")
print(df_freq.head(20).to_string(index=False))


Combined Table of Words with POS Tags:
       Word POS Tag  Frequency
       risk      NN        878
    project      NN        782
 management      NN        632
       risk     NNS        335
   analysis      NN        189
        pmp      NN        166
   practice      NN        155
    process      NN        151
   standard      NN        149
       plan      NN        115
  institute      NN         99
   response     NNS         94
    project     VBP         93
       risk     VBP         87
stakeholder     NNS         83
   response      NN         79
  technique     NNS         77
information      NN         73
     action     NNS         70
    project     NNS         66


# Concept and Relationship Identification

In this process, we identify key concepts from the text by focusing on high-frequency words, specifically nouns and proper nouns. We begin by defining relevant part-of-speech (POS) tags for nouns and proper nouns and then filter the tokenized data to retain only these words. Next, we calculate the frequency of each concept by grouping the filtered data and counting occurrences, storing the results in a new DataFrame. The frequency data is then sorted in descending order to highlight the most common concepts, with the top 20 concepts extracted and displayed for further analysis. This method effectively highlights essential concepts within the text for subsequent tasks.

In [None]:
# ------------------- Concept Identification ------------------- #
# Identify concepts based on high-frequency words (nouns and proper nouns)
concept_pos_tags = ['NN', 'NNS', 'NNP', 'NNPS']
df_concepts = df_tokens[df_tokens['POS Tag'].isin(concept_pos_tags)]

# Calculate frequency of each concept
df_concept_freq = df_concepts.groupby('Word').size().reset_index(name='Frequency')

# Sort by frequency
df_concept_freq = df_concept_freq.sort_values(by='Frequency', ascending=False)

# Display the most pertinent concepts
most_common_concepts = df_concept_freq.head(20)

print("\nMost Pertinent Concepts:")
print(most_common_concepts.to_string(index=False))


Most Pertinent Concepts:
       Word  Frequency
       risk       1213
    project        848
 management        633
   analysis        192
   response        173
   standard        169
    process        168
        pmp        166
   practice        163
       plan        137
  technique        120
  institute         99
     action         98
stakeholder         91
      level         81
    example         78
information         73
       cost         72
       time         64
     impact         62


In [None]:
# ------------------- Attribute and Relation Extraction ------------------- #
# Process text at the sentence level using more comprehensive patterns
attributes = []
relations = []

# Reconstruct the full text
full_text = ' '.join(extracted_text)

# Tokenize text into sentences
sentences = sent_tokenize(full_text)

for sentence in sentences:
    # Tokenize and POS tag
    tokens = word_tokenize(sentence)
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    pos_tags = nltk.pos_tag(tokens)
    # Lemmatize tokens with POS tags
    lemmatized_tokens = []
    for token, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)
        lemmatized_token = lemmatizer.lemmatize(token, wordnet_pos)
        lemmatized_tokens.append((lemmatized_token.lower(), tag))
    # Update tokens and pos_tags with lemmatized tokens
    tokens = [token for token, tag in lemmatized_tokens]
    pos_tags = [(token, tag) for token, tag in lemmatized_tokens]
    # Build chunk grammar to capture more patterns
    grammar = r"""
        ATTRIBUTE: {<NN.*><IN><NN.*>}          # Noun + Preposition + Noun
                  {<JJ.*><NN.*>}               # Adjective(s) + Noun
                  {<NN.*><NN.*>}               # Noun + Noun compounds
        RELATION: {<NN.*><VB.*><NN.*>}         # Noun + Verb + Noun
                 {<NN.*><VB.*><IN><NN.*>}      # Noun + Verb + Preposition + Noun
    """
    # Create chunk parser
    cp = RegexpParser(grammar)
    tree = cp.parse(pos_tags)
    # Extract attributes
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'ATTRIBUTE'):
        words, tags = zip(*subtree.leaves())
        attribute = ' '.join(words)
        attributes.append(attribute)
    # Extract relations
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'RELATION'):
        words, tags = zip(*subtree.leaves())
        relation = ' '.join(words)
        relations.append(relation)

In [None]:
# Calculate frequencies of attributes
attribute_freq = FreqDist(attributes)
most_common_attributes = attribute_freq.most_common(20)

In [None]:
# Display attributes
print("\nAttributes Linked to Concepts:")
print("{:<50}{}".format("Attribute Phrase", "Frequency"))
print("-" * 65)
for attribute, freq in most_common_attributes:
    print("{:<50}{}".format(attribute, freq))


Attributes Linked to Concepts:
Attribute Phrase                                  Frequency
-----------------------------------------------------------------
risk management                                   207
project risk                                      135
project management                                96
standard project                                  66
practice standard                                 65
institute practice                                53
management institute                              46
quantitative risk                                 45
risk response                                     38
qualitative risk                                  35
individual risk                                   35
overall project                                   33
management process                                32
identifi cation                                   31
plan risk                                         26
identify risk                                

In [None]:
# Calculate frequencies of relations
relation_freq = FreqDist(relations)
most_common_relations = relation_freq.most_common(20)

In [None]:
# Display relations
print("\nRelations Between Concepts:")
print("{:<60}{}".format("Relation Phrase", "Frequency"))
print("-" * 75)
for relation, freq in most_common_relations:
    print("{:<60}{}".format(relation, freq))



Relations Between Concepts:
Relation Phrase                                             Frequency
---------------------------------------------------------------------------
technique examples template                                 3
strength weaknesses application                             3
management process defi                                     2
project follow element                                      2
project objectives risk                                     2
risk occur project                                          2
plan risk response                                          2
project templates example                                   2
program portfolios project                                  1
textbook handbooks course                                   1
management identify risk                                    1
process describe address                                    1
refl ecting business                                        1
principle describe 

In [None]:

# Initialize a list to store all sentences
sentences = []

# Initialize the WordNet lemmatizer
# lemmatizer = WordNetLemmatizer()

# Get the list of English stop words
# stop_words = set(stopwords.words('english'))

# def clean_text(text):
#     # Normalize unicode characters (remove accents)
#     text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
#     # Convert text to lowercase
#     text = text.lower()
#     # Remove punctuation except hyphens (useful for compound words) and periods (useful for abbreviations)
#     text = re.sub(r'[^\w\s\.-]', '', text)
#     # Remove multiple spaces
#     text = re.sub(r'\s+', ' ', text)
#     # Strip leading and trailing spaces
#     text = text.strip()
#     return text

def get_wordnet_pos(nltk_pos_tag):
    if nltk_pos_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif nltk_pos_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif nltk_pos_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif nltk_pos_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN  # Default to noun

# Open the PDF file
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            # Clean the text
            text = clean_text(text)
            # Tokenize text into sentences
            raw_sentences = sent_tokenize(text)
            for raw_sentence in raw_sentences:
                # Tokenize the sentence into words
                tokens = word_tokenize(raw_sentence)
                # Remove stop words and lemmatize tokens
                filtered_tokens = []
                pos_tags = nltk.pos_tag(tokens)
                for token, tag in pos_tags:
                    if token not in stop_words:
                        wordnet_pos = get_wordnet_pos(tag)
                        lemmatized_token = lemmatizer.lemmatize(token, wordnet_pos)
                        filtered_tokens.append(lemmatized_token)
                if filtered_tokens:
                    sentences.append(filtered_tokens)
        else:
            pass  # Skip pages with no text

# Flatten the list of sentences to create a list of all tokens
all_tokens = [token for sentence in sentences for token in sentence]

# Create a DataFrame from all_tokens
df_tokens = pd.DataFrame(all_tokens, columns=['Word'])

# Calculate frequency of each word
df_word_freq = df_tokens['Word'].value_counts().reset_index()
df_word_freq.columns = ['Word', 'Frequency']

# Sort by frequency
df_word_freq = df_word_freq.sort_values(by='Frequency', ascending=False)

# Get the top N words as the most pertinent concepts
N = 10  # You can change N to get more or fewer concepts
most_common_concepts = df_word_freq.head(N)

print("\nMost Pertinent Concepts:")
print(most_common_concepts.to_string(index=False))

# Get unique words for embedding
unique_words = df_tokens['Word'].unique()

# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each unique word
word_embeddings = model.encode(unique_words, show_progress_bar=True)

# Create a dictionary to map words to their embeddings
word_to_embedding = dict(zip(unique_words, word_embeddings))

# Function to find top similar words for a given word
def find_similar_words(target_word, word_to_embedding, top_n=5):
    if target_word not in word_to_embedding:
        return []
    target_embedding = word_to_embedding[target_word].reshape(1, -1)
    # Exclude the target word from the list of words to compare
    words = [word for word in unique_words if word != target_word]
    embeddings = np.array([word_to_embedding[word] for word in words])
    similarities = cosine_similarity(target_embedding, embeddings)[0]
    # Get indices of top similar words
    similar_indices = similarities.argsort()[::-1][:top_n]
    similar_words = [(words[idx], similarities[idx]) for idx in similar_indices]
    return similar_words

# Identify synonyms within the extracted text
print("\nSynonyms Within Extracted Text:")
for word in most_common_concepts['Word']:
    similar_words = find_similar_words(word, word_to_embedding, top_n=5)
    if similar_words:
        print(f"\nWord: {word}")
        print("Similar Words:")
        for sim_word, score in similar_words:
            print(f"  {sim_word} (Similarity: {score:.4f})")
    else:
        print(f"\nWord: {word}")
        print("  No similar words found in the extracted text.")



Most Pertinent Concepts:
      Word  Frequency
      risk       1707
   project       1154
management        751
   process        297
  analysis        259
      plan        233
  standard        223
  response        215
  practice        191
       pmp        188




Batches:   0%|          | 0/101 [00:00<?, ?it/s]


Synonyms Within Extracted Text:

Word: risk
Similar Words:
  risks (Similarity: 0.9197)
  riskrelated (Similarity: 0.7320)
  riskguidance (Similarity: 0.7187)
  safety (Similarity: 0.6168)
  threats (Similarity: 0.6008)

Word: project
Similar Words:
  task (Similarity: 0.5643)
  idea (Similarity: 0.5636)
  need (Similarity: 0.5422)
  attempt (Similarity: 0.5378)
  research (Similarity: 0.5348)

Word: management
Similar Words:
  managing (Similarity: 0.8936)
  managerial (Similarity: 0.8539)
  manage (Similarity: 0.8503)
  manager (Similarity: 0.8017)
  organisation (Similarity: 0.7037)

Word: process
Similar Words:
  processrelated (Similarity: 0.6600)
  processesrisk (Similarity: 0.6553)
  execution (Similarity: 0.5894)
  procedure (Similarity: 0.5794)
  production (Similarity: 0.5476)

Word: analysis
Similar Words:
  analyze (Similarity: 0.7923)
  evaluation (Similarity: 0.6173)
  analytic (Similarity: 0.5817)
  examine (Similarity: 0.5607)
  research (Similarity: 0.5536)

Word: pla

# Process Evaluation

In this section, we perform two key analyses: calculating the Levenshtein distance for synonyms and computing the TF-IDF scores for the extracted text. First, we define a function to calculate the Levenshtein distance, which measures the similarity between a given word and its similar words, as identified through a previously defined method (find_similar_words). For each word in the list of most common concepts, we find the top five similar words and print the Levenshtein distance along with cosine similarity values. If no similar words are found, a message is displayed accordingly.

Next, we proceed to calculate TF-IDF scores to assess the importance of words in the context of the extracted text. We rebuild the cleaned and lemmatized sentences into a list, which is then transformed into a TF-IDF matrix using TfidfVectorizer. This matrix quantifies the importance of each word across all sentences. We extract the feature names (terms) and their corresponding mean TF-IDF scores, storing them in a DataFrame. Finally, we sort this DataFrame by TF-IDF score in descending order and print the top 20 words with the highest scores, providing insights into the most significant terms within the text.

In [None]:
def clean_text(text):
    # Normalize unicode characters (remove accents)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation except hyphens and periods
    text = re.sub(r'[^\w\s\.-]', '', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    text = text.strip()
    return text

def get_wordnet_pos(nltk_pos_tag):
    if nltk_pos_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif nltk_pos_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif nltk_pos_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif nltk_pos_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN  # Default to noun

# Open the PDF file
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            # Clean the text
            text = clean_text(text)
            # Tokenize text into sentences
            raw_sentences = sent_tokenize(text)
            for raw_sentence in raw_sentences:
                # Tokenize the sentence into words
                tokens = word_tokenize(raw_sentence)
                # Remove stop words and lemmatize tokens
                filtered_tokens = []
                pos_tags = nltk.pos_tag(tokens)
                for token, tag in pos_tags:
                    if token not in stop_words:
                        wordnet_pos = get_wordnet_pos(tag)
                        lemmatized_token = lemmatizer.lemmatize(token, wordnet_pos)
                        filtered_tokens.append(lemmatized_token)
                if filtered_tokens:
                    sentences.append(filtered_tokens)
        else:
            pass  # Skip pages with no text

# Flatten the list of sentences to create a list of all tokens
all_tokens = [token for sentence in sentences for token in sentence]

# Create a DataFrame from all_tokens
df_tokens = pd.DataFrame(all_tokens, columns=['Word'])

# Calculate frequency of each word
df_word_freq = df_tokens['Word'].value_counts().reset_index()
df_word_freq.columns = ['Word', 'Frequency']

# Sort by frequency
df_word_freq = df_word_freq.sort_values(by='Frequency', ascending=False)

# Get the top N words as the most pertinent concepts
N = 10  # You can change N to get more or fewer concepts
most_common_concepts = df_word_freq.head(N)

print("\nMost Pertinent Concepts:")
print(most_common_concepts.to_string(index=False))

# Get unique words for embedding
unique_words = df_tokens['Word'].unique()

# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each unique word
word_embeddings = model.encode(unique_words, show_progress_bar=True)

# Create a dictionary to map words to their embeddings
word_to_embedding = dict(zip(unique_words, word_embeddings))

# Function to find top similar words for a given word
def find_similar_words(target_word, word_to_embedding, top_n=5):
    if target_word not in word_to_embedding:
        return []
    target_embedding = word_to_embedding[target_word].reshape(1, -1)
    # Exclude the target word from the list of words to compare
    words = [word for word in unique_words if word != target_word]
    embeddings = np.array([word_to_embedding[word] for word in words])
    similarities = cosine_similarity(target_embedding, embeddings)[0]
    # Get indices of top similar words
    similar_indices = similarities.argsort()[::-1][:top_n]
    similar_words = [(words[idx], similarities[idx]) for idx in similar_indices]
    return similar_words

# Function to calculate Levenshtein distance
def calculate_levenshtein_distance(word, similar_words):
    print(f"\nLevenshtein Distances for the word '{word}':")
    for similar_word, similarity in similar_words:
        distance = Levenshtein.distance(word, similar_word)
        print(f"  {word} -> {similar_word}: Levenshtein Distance = {distance}, Cosine Similarity = {similarity:.4f}")

# Identify synonyms within the extracted text and calculate Levenshtein distance
print("\nSynonyms and Levenshtein Distances Within Extracted Text:")
for word in most_common_concepts['Word']:
    similar_words = find_similar_words(word, word_to_embedding, top_n=5)
    if similar_words:
        print(f"\nWord: {word}")
        print("Similar Words and Their Distances:")
        calculate_levenshtein_distance(word, similar_words)
    else:
        print(f"\nWord: {word}")
        print("  No similar words found in the extracted text.")



Most Pertinent Concepts:
      Word  Frequency
      risk       3414
   project       2306
management       1502
   process        594
  analysis        518
      plan        459
  standard        446
  response        432
  practice        382
       pmp        376




Batches:   0%|          | 0/116 [00:00<?, ?it/s]


Synonyms and Levenshtein Distances Within Extracted Text:

Word: risk
Similar Words and Their Distances:

Levenshtein Distances for the word 'risk':
  risk -> risks: Levenshtein Distance = 1, Cosine Similarity = 0.9197
  risk -> risk-related: Levenshtein Distance = 8, Cosine Similarity = 0.8580
  risk -> riskrelated: Levenshtein Distance = 7, Cosine Similarity = 0.7320
  risk -> riskguidance: Levenshtein Distance = 8, Cosine Similarity = 0.7187
  risk -> non-risks: Levenshtein Distance = 5, Cosine Similarity = 0.7116

Word: project
Similar Words and Their Distances:

Levenshtein Distances for the word 'project':
  project -> project-: Levenshtein Distance = 1, Cosine Similarity = 0.8335
  project -> project-specific: Levenshtein Distance = 9, Cosine Similarity = 0.6084
  project -> project-related: Levenshtein Distance = 8, Cosine Similarity = 0.5837
  project -> project-wide: Levenshtein Distance = 5, Cosine Similarity = 0.5824
  project -> task: Levenshtein Distance = 7, Cosine Simi

In [None]:

# ------------- TF-IDF Calculation ------------- #

# Rebuild the cleaned, lemmatized sentences for TF-IDF vectorization
cleaned_sentences = [' '.join(sentence) for sentence in sentences]

# Create a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_sentences)

# Store the vocab (terms) and their importance scores (TF-IDF)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()  # Mean TF-IDF across all documents
tfidf_df = pd.DataFrame({'Word': tfidf_feature_names, 'TF-IDF Score': tfidf_scores})

# Sort the words by their TF-IDF score (descending order)
tfidf_df = tfidf_df.sort_values(by='TF-IDF Score', ascending=False)

print("\nTop 20 Words by TF-IDF Score:")
print(tfidf_df.head(20).to_string(index=False))


Top 20 Words by TF-IDF Score:
       Word  TF-IDF Score
       risk      0.109692
    project      0.100289
 management      0.087584
   standard      0.040488
  institute      0.039991
   practice      0.038371
    process      0.033015
       plan      0.028025
   analysis      0.026495
   response      0.025326
        use      0.019927
     action      0.018914
  objective      0.018394
    example      0.017402
   identifi      0.017350
  technique      0.017025
     figure      0.015023
stakeholder      0.014704
     impact      0.014166
   identify      0.014025


In [None]:
# ------------- F-Measure Evaluation ------------- #

# Assuming 'most_common_concepts' from your previous code are True Positives
tp_words = most_common_concepts['Word'].tolist()

# Consider top N words by TF-IDF as predicted relevant concepts
N = 10
predicted_concepts = tfidf_df.head(N)['Word'].tolist()

# Create a binarizer
mlb = MultiLabelBinarizer(classes=list(set(tp_words) | set(predicted_concepts)))

# Binarize the true and predicted categories
y_true = mlb.fit_transform([tp_words])[0]  # Actual (True Relevant Concepts)
y_pred = mlb.fit_transform([predicted_concepts])[0]  # Predicted Concepts (TF-IDF Top N)

# Calculate Precision, Recall and F1-Score
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

In [None]:

# Output Precision, Recall and F1-Score
print("\n--- F-Measure Evaluation ---")
print(f'Precision: {precision:.4f}')
print(f'Recall:    {recall:.4f}')
print(f'F1-Score: {f1:.4f}')


--- F-Measure Evaluation ---
Precision: 0.9000
Recall:    0.9000
F1-Score: 0.9000
