Extraction de text

In [1]:
import pdfplumber
import re
import os

def extract_figure_pages(pdf_path, start_page=1, end_page=12):
    """
    Extracts the list of pages that contain figures from the first few pages of the PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): Starting page number (1-indexed).
        end_page (int): Ending page number (1-indexed).

    Returns:
        List[int]: List of page numbers that contain figures.
    """
    figure_pages = []
    figure_keywords = ["Figure", "Fig.", "Illustration", "Diagram", "Graph", "Schéma", "Tableau"]
    list_of_figures_found = False
    figure_page_pattern = re.compile(r'^(Figure|Fig\.|Illustration|Diagram|Graph|Schéma|Tableau)\s*\d+', re.IGNORECASE)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num in range(start_page - 1, end_page):
            page = pdf.pages[page_num]
            text = page.extract_text()
            if not text:
                continue

            lines = text.split('\n')

            for i, line in enumerate(lines):
                if re.search(r'List of Figures|Liste des Figures|Table des Figures|Liste des Illustrations', line, re.IGNORECASE):
                    list_of_figures_found = True
                    figure_start_index = i + 1
                    break

            if list_of_figures_found:
                for line in lines[figure_start_index:]:
                    if figure_page_pattern.match(line):
                        page_number_match = re.search(r'\.\s*(\d+)$', line)
                        if page_number_match:
                            page_number = int(page_number_match.group(1))
                            figure_pages.append(page_number)
                        else:
                            numbers = re.findall(r'\d+', line)
                            if numbers:
                                page_number = int(numbers[-1])
                                figure_pages.append(page_number)
                    elif re.match(r'^\s*$', line):
                        break
            if list_of_figures_found:
                break

    figure_pages = sorted(list(set(figure_pages)))
    return figure_pages

def extract_chapters(pdf_path, start_page=13, end_page=124, num_chapters=9):
    """
    Extracts text from the PDF and divides it into specified chapters.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): Starting page number for chapter extraction (1-indexed).
        end_page (int): Ending page number for chapter extraction (1-indexed).
        num_chapters (int): Number of chapters to divide the text into.

    Returns:
        Dict[str, str]: Dictionary with chapter titles as keys and their corresponding text as values.
    """
    chapters_content = {}
    current_chapter = None

    chapter_heading_pattern = re.compile(
        r'^(CHAPTER|CHAPITRE)\s+(\d+)\s*[\-–—−:]\s*(.+)$',
        re.IGNORECASE
    )
    chapter_heading_multiline_pattern = re.compile(
        r'^(CHAPTER|CHAPITRE)\s+(\d+)\s*$',
        re.IGNORECASE
    )

    with pdfplumber.open(pdf_path) as pdf:
        lines_buffer = []
        for page_num in range(start_page - 1, end_page):
            page = pdf.pages[page_num]
            text = page.extract_text()
            if not text:
                continue
            lines = text.split('\n')
            for line in lines:
                line = line.strip()
                if not line:
                    continue

                chapter_match = chapter_heading_pattern.match(line)
                if chapter_match:
                    chapter_num = int(chapter_match.group(2))
                    chapter_title = chapter_match.group(3).strip()
                    if 1 <= chapter_num <= num_chapters:
                        current_chapter = f"Chapitre_{chapter_num}_{sanitize_filename(chapter_title)}"
                        if current_chapter not in chapters_content:
                            chapters_content[current_chapter] = ""
                        continue

                chapter_multiline_match = chapter_heading_multiline_pattern.match(line)
                if chapter_multiline_match:
                    chapter_num = int(chapter_multiline_match.group(2))
                    if 1 <= chapter_num <= num_chapters:
                        lines_buffer.append((page_num, line))
                        next_line_index = lines.index(line) + 1
                        if next_line_index < len(lines):
                            next_line = lines[next_line_index].strip()
                            if next_line:
                                chapter_title = next_line
                                current_chapter = f"Chapitre_{chapter_num}_{sanitize_filename(chapter_title)}"
                                if current_chapter not in chapters_content:
                                    chapters_content[current_chapter] = ""
                                continue

                if current_chapter:
                    if re.match(r'^\d+$', line):
                        continue
                    chapters_content[current_chapter] += line + "\n"

    return chapters_content

def sanitize_filename(name):
    """
    Sanitizes a string to be used as a filename by removing or replacing invalid characters.

    Args:
        name (str): The string to sanitize.

    Returns:
        str: A sanitized string suitable for filenames.
    """
    return re.sub(r'[\\/*?:"<>|]', "_", name.lower().replace(" ", "_"))

def extract_first_sentence(text):
    """
    Extracts the first sentence from a block of text.

    Args:
        text (str): The text to extract the sentence from.

    Returns:
        str: The first sentence.
    """
    sentence_endings = re.compile(r'[.!?]')
    match = sentence_endings.search(text)
    if match:
        end = match.end()
        return text[:end].strip()
    else:
        return text[:100].strip()

def save_chapters(chapters, output_dir="output/chapters"):
    """
    Saves each chapter's content to a separate text file and prints a snippet from each chapter.

    Args:
        chapters (Dict[str, str]): Dictionary with chapter titles and their text.
        output_dir (str): Directory where chapter files will be saved.
    """
    os.makedirs(output_dir, exist_ok=True)
    for chapter, content in chapters.items():
        filename = f"{chapter}.txt"
        file_path = os.path.join(output_dir, filename)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        first_sentence = extract_first_sentence(content)
        print(f"--- {chapter.replace('_', ' ').title()} Snippet ---")
        print(first_sentence)
        print("--------------------------\n")
    print(f"Chapters saved to '{output_dir}' directory.")

def save_figure_pages(figure_pages, output_file="output/figure_pages.txt"):
    """
    Saves the list of figure pages to a text file.

    Args:
        figure_pages (List[int]): List of page numbers containing figures.
        output_file (str): Path to the output text file.
    """
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        for page in figure_pages:
            f.write(f"Page {page}\n")
    print(f"Figure pages saved to '{output_file}'.")

def save_full_text(text, output_file="output/extracted_text.txt"):
    """
    Saves the extracted full text to a text file.

    Args:
        text (str): The full extracted text.
        output_file (str): Path to the output text file.
    """
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Full text saved to '{output_file}'.")

def extract_full_text(pdf_path, start_page=13, end_page=124):
    """
    Extracts the full text from specified pages.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): Starting page number (1-indexed).
        end_page (int): Ending page number (1-indexed).

    Returns:
        str: The extracted full text.
    """
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page_num in range(start_page - 1, end_page):
            if page_num < 0 or page_num >= len(pdf.pages):
                continue
            page = pdf.pages[page_num]
            text = page.extract_text()
            if text:
                full_text += text + "\n"
    return full_text

def main():
    pdf_path = "practice-standard-project-risk-management.pdf"

    if not os.path.isfile(pdf_path):
        print(f"Error: PDF file '{pdf_path}' not found.")
        return

    print("Extracting figure pages from pages 1 to 12...")
    figure_pages = extract_figure_pages(pdf_path, start_page=1, end_page=12)
    if not figure_pages:
        print("No figure pages found.")
    else:
        print(f"Figure pages found: {figure_pages}")
        save_figure_pages(figure_pages, output_file="output/figure_pages.txt")

    print("\nExtracting full text from pages 13 to 124...")
    full_text = extract_full_text(pdf_path, start_page=13, end_page=124)
    save_full_text(full_text, output_file="output/extracted_text.txt")

    print("\nDividing text into 9 chapters...")
    chapters = extract_chapters(pdf_path, start_page=13, end_page=124, num_chapters=9)
    save_chapters(chapters, output_dir="output/chapters")

if __name__ == "__main__":
    main()


Extracting figure pages from pages 1 to 12...
Figure pages found: [2, 6, 17, 23, 27, 29, 32, 33, 38, 41, 44, 49, 53]
Figure pages saved to 'output/figure_pages.txt'.

Extracting full text from pages 13 to 124...
Full text saved to 'output/extracted_text.txt'.

Dividing text into 9 chapters...
--- Chapitre 1 Introduction Snippet ---
INTRODUCTION
P roject Management Institute (PMI) practice standards are guides to the use of a tool, technique, or process
identifi ed in A Guide to the Project Management Body of Knowledge ( PMBOK ® G uide – Fourth Edition) or
other PMI standards.
--------------------------

--- Chapitre 2 Principles And Concepts Snippet ---
PRINCIPLES AND CONCEPTS
2.
--------------------------

--- Chapitre 3 Introduction To Project Risk Management Processes Snippet ---
INTRODUCTION TO PROJECT RISK MANAGEMENT PROCESSES
3.
--------------------------

--- Chapitre 4 Plan Risk Management Snippet ---
PLAN RISK MANAGEMENT
4.
--------------------------

--- Chapitre 5 Identify R

Cleaning Text

In [2]:
# Import necessary libraries
import os
import re
import pdfplumber

def sanitize_filename(name):
    """
    Sanitizes a string to be used as a filename by removing or replacing invalid characters.

    Args:
        name (str): The string to sanitize.

    Returns:
        str: A sanitized string suitable for filenames.
    """
    return re.sub(r'[\\/*?:"<>|]', "_", name.lower().replace(" ", "_"))

def clean_text(text):
    """
    Cleans the input text by removing standalone numbers and normalizing whitespace.

    Args:
        text (str): The text to clean.

    Returns:
        str: Cleaned text.
    """
    # Remove all standalone numbers
    text = re.sub(r'\b\d+\b', '', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    text = text.strip()
    return text

def get_chapter_name_from_page(pdf, page_num):
    """
    Extracts the chapter name from the top of the specified page.

    Args:
        pdf (pdfplumber.PDF): The opened PDF object.
        page_num (int): The 1-indexed page number.

    Returns:
        str: Sanitized chapter name if found, else None.
    """
    try:
        page = pdf.pages[page_num - 1]  # 0-indexed
        text = page.extract_text()
        if not text:
            return None
        lines = text.split('\n')
        for line in lines[:5]:  # Assume chapter name is within the first 5 lines
            line = line.strip()
            # Match chapter headings similar to extract_chapters function
            chapter_heading_pattern = re.compile(
                r'^(CHAPTER|CHAPITRE)\s+(\d+)\s*[\-–—−:]\s*(.+)$',
                re.IGNORECASE
            )
            chapter_multiline_pattern = re.compile(
                r'^(CHAPTER|CHAPITRE)\s+(\d+)\s*$',
                re.IGNORECASE
            )
            match = chapter_heading_pattern.match(line)
            if match:
                chapter_num = int(match.group(2))
                chapter_title = match.group(3).strip()
                sanitized_title = sanitize_filename(chapter_title)
                chapter_name = f"chapitre_{chapter_num}_{sanitized_title}".lower()
                return chapter_name
            else:
                match = chapter_multiline_pattern.match(line)
                if match:
                    chapter_num = int(match.group(2))
                    # Assume next line is the title
                    current_index = lines.index(line)
                    if current_index + 1 < len(lines):
                        next_line = lines[current_index + 1].strip()
                        if next_line:
                            chapter_title = next_line
                            sanitized_title = sanitize_filename(chapter_title)
                            chapter_name = f"chapitre_{chapter_num}_{sanitized_title}".lower()
                            return chapter_name
        return None
    except Exception as e:
        print(f"Error extracting chapter name from page {page_num}: {e}")
        return None

def main():
    # Define paths
    pdf_path = "content/practice-standard-project-risk-management.pdf"
    chapters_dir = "output/chapters"
    cleaned_chapters_dir = "output/cleaned_chapters"
    final_chapters_dir = "output/final_chapters"

    # Create directories if they don't exist
    os.makedirs(cleaned_chapters_dir, exist_ok=True)
    os.makedirs(final_chapters_dir, exist_ok=True)

    # Step 1: Clean the text for each chapter and save
    print("Cleaning chapters...")
    chapters = {}
    for filename in os.listdir(chapters_dir):
        if filename.endswith(".txt"):
            chapter_path = os.path.join(chapters_dir, filename)
            with open(chapter_path, "r", encoding="utf-8") as f:
                text = f.read()
            cleaned = clean_text(text)
            # Save cleaned text
            cleaned_filename = filename.replace("chapitre_", "cleaned_chapitre_")
            cleaned_path = os.path.join(cleaned_chapters_dir, cleaned_filename)
            with open(cleaned_path, "w", encoding="utf-8") as f:
                f.write(cleaned)
            # Store in dict for later use
            chapter_key = filename.replace(".txt", "").lower()
            chapters[chapter_key] = cleaned
            print(f"Cleaned and saved {filename} to {cleaned_chapters_dir}")
    print("All chapters cleaned.\n")

    # Step 2: Save the combined text for each chapter
    print("Saving final chapters...")
    for chapter, text in chapters.items():
        final_path = os.path.join(final_chapters_dir, f"{chapter}.txt")
        with open(final_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"Saved final chapter to {final_path}")
    print("All final chapters saved.\n")

    print("Processing completed successfully.")

if __name__ == "__main__":
    main()


Cleaning chapters...
Cleaned and saved Chapitre_1_introduction.txt to output/cleaned_chapters
Cleaned and saved Chapitre_2_principles_and_concepts.txt to output/cleaned_chapters
Cleaned and saved Chapitre_3_introduction_to_project_risk_management_processes.txt to output/cleaned_chapters
Cleaned and saved Chapitre_4_plan_risk_management.txt to output/cleaned_chapters
Cleaned and saved Chapitre_5_identify_risks.txt to output/cleaned_chapters
Cleaned and saved Chapitre_6_perform_qualitative_risk_analysis.txt to output/cleaned_chapters
Cleaned and saved Chapitre_7_perform_quantitative_risk_analysis.txt to output/cleaned_chapters
Cleaned and saved Chapitre_8_plan_risk_responses.txt to output/cleaned_chapters
Cleaned and saved Chapitre_9_monitor_and_control_risks.txt to output/cleaned_chapters
All chapters cleaned.

Saving final chapters...
Saved final chapter to output/final_chapters\chapitre_1_introduction.txt
Saved final chapter to output/final_chapters\chapitre_2_principles_and_concepts.

Preprocessing


In [3]:
import nltk
from nltk.corpus import stopwords
import os
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

def remove_stop_words(text, stop_words, max_removed=5):
    """
    Removes stop words from the text and returns the cleaned text along with the first few stop words removed.

    Args:
        text (str): The text to process.
        stop_words (set): A set of stop words to remove.
        max_removed (int): Maximum number of stop words to record.

    Returns:
        Tuple[str, List[str]]: Cleaned text and list of first stop words removed.
    """
    words = re.findall(r'\b\w+\b', text)
    cleaned_words = []
    removed_stop_words = []
    for word in words:
        word_lower = word.lower()
        if word_lower in stop_words:
            if len(removed_stop_words) < max_removed:
                removed_stop_words.append(word_lower)
            continue
        cleaned_words.append(word)
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text, removed_stop_words

def main():
    # Define paths
    final_chapters_dir = "output/final_chapters"  # Directory with final chapters
    cleaned_stopwords_chapters_dir = "output/cleaned_stopwords_chapters"  # Directory to save cleaned chapters

    # Create the output directory if it doesn't exist
    os.makedirs(cleaned_stopwords_chapters_dir, exist_ok=True)

    # Initialize stop words (English and French)
    try:
        stop_words_en = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words_en = set(stopwords.words('english'))
        
    try:
        stop_words_fr = set(stopwords.words('french'))
    except LookupError:
        nltk.download('stopwords')
        stop_words_fr = set(stopwords.words('french'))
        
    stop_words = stop_words_en.union(stop_words_fr)

    # Initialize a list to hold mapping information
    mapping_info = []

    # Iterate through each final chapter file
    for filename in os.listdir(final_chapters_dir):
        if filename.endswith(".txt"):
            chapter_path = os.path.join(final_chapters_dir, filename)
            with open(chapter_path, "r", encoding="utf-8") as f:
                text = f.read()

            # Remove stop words from the text and collect first 5 removed
            cleaned_text, first_5_stop_words = remove_stop_words(text, stop_words, max_removed=5)

            # Save the cleaned chapter
            cleaned_filename = filename.replace("chapitre_", "cleaned_chapitre_")
            cleaned_path = os.path.join(cleaned_stopwords_chapters_dir, cleaned_filename)
            with open(cleaned_path, "w", encoding="utf-8") as f:
                f.write(cleaned_text)

            # Extract chapter name without extension for readability
            chapter_name = filename.replace(".txt", "").replace("_", " ").title()

            # Record the mapping information
            mapping_info.append((filename, chapter_name, first_5_stop_words))

            # Print the first 5 stop words removed
            print(f"Processed '{filename}':")
            if first_5_stop_words:
                print(f" - First 5 stop words removed: {', '.join(first_5_stop_words)}")
            else:
                print(" - No stop words were removed.")
            print()

    print("All chapters have been processed and cleaned.")
    print("\nSummary of stop words removed for each chapter:")
    for desc_file, chap_name, stop_words_removed in mapping_info:
        if stop_words_removed:
            print(f" - {chap_name}: {', '.join(stop_words_removed)}")
        else:
            print(f" - {chap_name}: No stop words removed.")

    print("\nCleaned chapters saved to 'output/cleaned_stopwords_chapters' directory.")

if __name__ == "__main__":
    main()


Processed 'chapitre_1_introduction.txt':
 - First 5 stop words removed: are, to, the, of, a

Processed 'chapitre_2_principles_and_concepts.txt':
 - First 5 stop words removed: and, t, his, the, to

Processed 'chapitre_3_introduction_to_project_risk_management_processes.txt':
 - First 5 stop words removed: to, and, all, are, is

Processed 'chapitre_4_plan_risk_management.txt':
 - First 5 stop words removed: and, of, the, t, he

Processed 'chapitre_5_identify_risks.txt':
 - First 5 stop words removed: and, of, the, a, be

Processed 'chapitre_6_perform_qualitative_risk_analysis.txt':
 - First 5 stop words removed: and, of, the, t, he

Processed 'chapitre_7_perform_quantitative_risk_analysis.txt':
 - First 5 stop words removed: and, of, the, the, a

Processed 'chapitre_8_plan_risk_responses.txt':
 - First 5 stop words removed: t, he, that, are, to

Processed 'chapitre_9_monitor_and_control_risks.txt':
 - First 5 stop words removed: and, the, of, the, the

All chapters have been processed a

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Lemmatization

In [8]:
import os
import re
import spacy
from langdetect import detect, LangDetectException
from spacy.cli import download as spacy_download

# Install spaCy models if they are not already installed
def install_spacy_models():
    """
    Installs the required spaCy language models if they are not already installed.
    """
    try:
        spacy.load('en_core_web_sm')
    except OSError:
        print("Downloading 'en_core_web_sm' model...")
        spacy_download('en_core_web_sm')
    
    try:
        spacy.load('fr_core_news_sm')
    except OSError:
        print("Downloading 'fr_core_news_sm' model...")
        spacy_download('fr_core_news_sm')

# Detect the language of the text
def detect_language(text):
    """
    Detects the language of the given text using langdetect.

    Args:
        text (str): The text to detect language for.

    Returns:
        str: Detected language code ('en' for English, 'fr' for French).
             Returns 'unknown' if detection fails or the language is neither English nor French.
    """
    try:
        lang = detect(text)
        return lang if lang in ['en', 'fr'] else 'unknown'
    except LangDetectException:
        return 'unknown'

# Lemmatize the text using spaCy
def lemmatize_text(text, nlp):
    """
    Lemmatizes the given text using the provided spaCy NLP model.

    Args:
        text (str): The text to lemmatize.
        nlp (spacy.lang.*.Language): The spaCy NLP model.

    Returns:
        Tuple[str, List[Tuple[str, str]]]: The lemmatized text and a list of tuples containing original and lemmatized words.
    """
    doc = nlp(text)
    lemmatized_words = []
    cleaned_tokens = []
    
    for token in doc:
        if token.is_alpha:  # Check if the token is alphabetic
            lemma = token.lemma_
            if lemma != token.text.lower():
                lemmatized_words.append((token.text, lemma))  # Track lemmatization changes
            cleaned_tokens.append(lemma)
        else:
            cleaned_tokens.append(token.text)
    
    lemmatized_text = ' '.join(cleaned_tokens)
    return lemmatized_text, lemmatized_words

# Split the chapter text into main text and figure descriptions
def split_chapter_text(text):
    """
    Splits the chapter text into main text and figure descriptions.

    Args:
        text (str): The complete chapter text.

    Returns:
        Tuple[str, List[str]]: A tuple containing the main text and a list of figure descriptions.
    """
    pattern = re.compile(r'\n\nFigure Description from page \d+:\n', re.IGNORECASE)
    parts = pattern.split(text)
    main_text = parts[0].strip()
    figure_descriptions = parts[1:]  # The remaining parts are figure descriptions
    return main_text, figure_descriptions

# Main function to process chapters
def main():
    # Install spaCy models if not already installed
    install_spacy_models()
    
    # Load spaCy models
    nlp_en = spacy.load('en_core_web_sm')
    nlp_fr = spacy.load('fr_core_news_sm')
    
    # Define paths
    cleaned_stopwords_chapters_dir = "output/cleaned_stopwords_chapters"  # Directory with stop words removed
    lemmatized_chapters_dir = "output/lemmatized_chapters"  # Directory to save lemmatized chapters
    
    # Create the output directory if it doesn't exist
    os.makedirs(lemmatized_chapters_dir, exist_ok=True)
    
    # Initialize a list to hold mapping information
    mapping_info = []
    
    # Iterate through each cleaned chapter file
    for filename in os.listdir(cleaned_stopwords_chapters_dir):
        if filename.endswith(".txt"):
            chapter_path = os.path.join(cleaned_stopwords_chapters_dir, filename)
            with open(chapter_path, "r", encoding="utf-8") as f:
                text = f.read()
    
            # Split the text into main text and figure descriptions
            main_text, figure_descriptions = split_chapter_text(text)
    
            # Detect language of the main text
            language = detect_language(main_text)
            nlp = nlp_en if language == 'en' else nlp_fr if language == 'fr' else None
            
            if nlp is None:
                print(f"Could not detect language for '{filename}'. Skipping lemmatization.")
                continue
    
            # Lemmatize the main text and record word transformations
            lemmatized_main_text, lemmatized_words = lemmatize_text(main_text, nlp)
    
            # Reassemble the lemmatized text with figure descriptions
            lemmatized_text = lemmatized_main_text
            for desc in figure_descriptions:
                lemmatized_text += f"\n\nFigure Description:\n{desc.strip()}"
    
            # Save the lemmatized chapter
            lemmatized_filename = filename.replace("cleaned_chapitre_", "lemmatized_chapitre_")
            lemmatized_path = os.path.join(lemmatized_chapters_dir, lemmatized_filename)
            with open(lemmatized_path, "w", encoding="utf-8") as f:
                f.write(lemmatized_text)
    
            # Extract chapter name without extension for readability
            chapter_name = filename.replace(".txt", "").replace("cleaned_chapitre_", "").replace("_", " ").title()
    
            # Select first five unique lemmatization examples
            unique_lemmatizations = []
            seen = set()
            for original, lemma in lemmatized_words:
                if original.lower() != lemma.lower() and original.lower() not in seen:
                    unique_lemmatizations.append((original, lemma))
                    seen.add(original.lower())
                if len(unique_lemmatizations) >= 5:
                    break
    
            # Record the mapping information
            mapping_info.append((filename, chapter_name, unique_lemmatizations))
    
            # Print the first five lemmatization examples
            print(f"Processed '{filename}':")
            if unique_lemmatizations:
                print(" - First 5 lemmatization examples:")
                for orig, lem in unique_lemmatizations:
                    print(f"    * {orig} -> {lem}")
            else:
                print(" - No lemmatization changes were made.")
            print()
    
    print("All chapters have been lemmatized and saved.")
    print("\nSummary of lemmatization applied to each chapter:")
    for desc_file, chap_name, lemmatizations in mapping_info:
        if lemmatizations:
            examples = ', '.join([f"{orig} -> {lem}" for orig, lem in lemmatizations])
            print(f" - {chap_name}: {examples}")
        else:
            print(f" - {chap_name}: No lemmatization changes.")
    
    print("\nLemmatized chapters saved to 'output/lemmatized_chapters' directory.")

if __name__ == "__main__":
    main()


Processed 'cleaned_chapitre_1_introduction.txt':
 - First 5 lemmatization examples:
    * standards -> standard
    * guides -> guide
    * targeted -> target
    * audiences -> audience
    * projects -> project

Processed 'cleaned_chapitre_2_principles_and_concepts.txt':
 - First 5 lemmatization examples:
    * introduces -> introduce
    * ideas -> idea
    * required -> require
    * projects -> project
    * following -> follow

Processed 'cleaned_chapitre_3_introduction_to_project_risk_management_processes.txt':
 - First 5 lemmatization examples:
    * projects -> project
    * undertakings -> undertaking
    * based -> base
    * assumptions -> assumption
    * constraints -> constraint

Processed 'cleaned_chapitre_4_plan_risk_management.txt':
 - First 5 lemmatization examples:
    * objectives -> objective
    * processes -> process
    * executed -> execute
    * activities -> activity
    * requires -> require

Processed 'cleaned_chapitre_5_identify_risks.txt':
 - First 5 lem

PostTag

In [6]:
!pip install langdetect


Defaulting to user installation because normal site-packages is not writeable
Collecting langdetect
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (pyproject.toml): started
  Building wheel for langdetect (pyproject.toml): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993251 sha256=d9d4e8141b8f16eec528800ab7d53f7bc4fd898a85f0c6f0c050ac4cf433fd56
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\c1\67\88\e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collec

In [9]:
# Import necessary libraries
import os
import re
import spacy
import pandas as pd
from langdetect import detect, LangDetectException
from spacy.cli import download as spacy_download
from collections import defaultdict

def install_spacy_models():
    """
    Installs the required spaCy language models if they are not already installed.
    """
    try:
        spacy.load('en_core_web_sm')
    except OSError:
        print("Downloading 'en_core_web_sm' model...")
        spacy_download('en_core_web_sm')
    
    try:
        spacy.load('fr_core_news_sm')
    except OSError:
        print("Downloading 'fr_core_news_sm' model...")
        spacy_download('fr_core_news_sm')

def detect_language(text):
    """
    Detects the language of the given text using langdetect.

    Args:
        text (str): The text to detect language for.

    Returns:
        str: Detected language code ('en' for English, 'fr' for French).
             Returns 'unknown' if detection fails or the language is neither English nor French.
    """
    try:
        lang = detect(text)
        if lang in ['en', 'fr']:
            return lang
        else:
            return 'unknown'
    except LangDetectException:
        return 'unknown'

def split_chapter_text(text):
    """
    Splits the chapter text into main text and figure descriptions.

    Args:
        text (str): The complete chapter text.

    Returns:
        Tuple[str, List[str]]: A tuple containing the main text and a list of figure descriptions.
    """
    # Split the text by figure descriptions
    pattern = re.compile(r'\n\nFigure Description from page \d+:\n', re.IGNORECASE)
    parts = pattern.split(text)
    # The first part is the main text
    main_text = parts[0].strip()
    # The remaining parts are figure descriptions
    figure_descriptions = parts[1:] if len(parts) > 1 else []
    return main_text, figure_descriptions

def get_all_words(doc):
    """
    Extracts all words along with their POS tags and frequencies.

    Args:
        doc (spacy.lang.*.Doc): The spaCy processed document.

    Returns:
        List[Tuple[str, str, int]]: A list of tuples containing word, POS tag, and frequency.
    """
    word_pos_freq = defaultdict(int)
    
    for token in doc:
        if token.is_alpha:  # Consider alphabetic tokens only
            lemma = token.lemma_.lower()
            pos = token.pos_
            word_pos_freq[(lemma, pos)] += 1

    # Convert to a list of tuples: word, POS tag, frequency
    all_words = [(word, pos, freq) for (word, pos), freq in word_pos_freq.items()]
    
    return all_words

def get_top_words(all_words, top_n=5):
    """
    Extracts the top N most frequent words from the list.

    Args:
        all_words (List[Tuple[str, str, int]]): List of all words with their POS tags and frequencies.
        top_n (int): Number of top words to extract.

    Returns:
        List[Tuple[str, str, int]]: A list of tuples containing the top N words, POS tags, and frequency.
    """
    # Sort by frequency in descending order and take the top N
    sorted_words = sorted(all_words, key=lambda x: x[2], reverse=True)
    return sorted_words[:top_n]

def main():
    # Install spaCy models if not already installed
    install_spacy_models()
    
    # Load spaCy models
    nlp_en = spacy.load('en_core_web_sm')
    nlp_fr = spacy.load('fr_core_news_sm')
    
    # Define paths
    lemmatized_chapters_dir = "output/lemmatized_chapters"  # Directory with lemmatized chapters
    pos_tables_dir = "output/pos_tables"  # Directory to save POS tables
    
    # Create the output directory if it doesn't exist
    os.makedirs(pos_tables_dir, exist_ok=True)
    
    # Initialize a list to hold mapping information
    mapping_info = []
    
    # Iterate through each lemmatized chapter file
    for filename in os.listdir(lemmatized_chapters_dir):
        if filename.endswith(".txt"):
            chapter_path = os.path.join(lemmatized_chapters_dir, filename)
            with open(chapter_path, "r", encoding="utf-8") as f:
                text = f.read()
    
            # Split the text into main text and figure descriptions
            main_text, figure_descriptions = split_chapter_text(text)
    
            if not main_text:
                print(f"Chapter '{filename}' has no main text. Skipping POS tagging.")
                continue
    
            # Detect language of the main text
            language = detect_language(main_text)
            if language == 'en':
                nlp = nlp_en
            elif language == 'fr':
                nlp = nlp_fr
            else:
                print(f"Could not detect language for '{filename}'. Skipping POS tagging.")
                continue
    
            # Process the main text with spaCy
            doc = nlp(main_text)
    
            # Get all words with their POS tags and frequencies
            all_words = get_all_words(doc)
    
            # Get top 5 words for display
            top_words = get_top_words(all_words, top_n=5)
    
            # Create a pandas DataFrame for all words
            df_all_words = pd.DataFrame(all_words, columns=['Word', 'POS Tag', 'Frequency'])
    
            # Save all words to a CSV file
            chapter_name = filename.replace(".txt", "").replace("_", " ").title()
            table_filename = f"POS_Table_{chapter_name.replace(' ', '_')}.csv"
            table_path = os.path.join(pos_tables_dir, table_filename)
            df_all_words.to_csv(table_path, index=False)
    
            # Append to mapping information
            mapping_info.append((filename, chapter_name, top_words))
    
            # Display the top 5 words in the notebook
            print(f"--- {chapter_name} ---")
            df_top_words = pd.DataFrame(top_words, columns=['Word', 'POS Tag', 'Frequency'])
            display(df_top_words)
            print("\n")
    
    print("All chapters have been POS tagged and tables have been saved.")
    print("\nSummary of top 5 words for each chapter:")
    for desc_file, chap_name, top_words in mapping_info:
        print(f" - {chap_name}:")
        for word, pos, freq in top_words:
            print(f"    * {word} ({pos}) - {freq} times")
        print()
    
    print(f"POS tables saved to '{pos_tables_dir}' directory.")

if __name__ == "__main__":
    main()


--- Lemmatized Chapitre 1 Introduction ---


Unnamed: 0,Word,POS Tag,Frequency
0,management,PROPN,83
1,project,PROPN,80
2,project,NOUN,76
3,risk,PROPN,72
4,process,NOUN,37




--- Lemmatized Chapitre 2 Principles And Concepts ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,69
1,project,NOUN,61
2,project,PROPN,36
3,management,PROPN,32
4,risk,PROPN,29




--- Lemmatized Chapitre 3 Introduction To Project Risk Management Processes ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,63
1,project,NOUN,49
2,project,PROPN,47
3,management,PROPN,44
4,risk,PROPN,43




--- Lemmatized Chapitre 4 Plan Risk Management ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,84
1,management,NOUN,66
2,project,NOUN,57
3,management,PROPN,43
4,project,PROPN,32




--- Lemmatized Chapitre 5 Identify Risks ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,69
1,project,NOUN,32
2,identifi,PROPN,29
3,process,NOUN,17
4,risks,PROPN,16




--- Lemmatized Chapitre 6 Perform Qualitative Risk Analysis ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,85
1,project,NOUN,31
2,risk,PROPN,19
3,may,AUX,18
4,analysis,NOUN,16




--- Lemmatized Chapitre 7 Perform Quantitative Risk Analysis ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,87
1,project,NOUN,60
2,quantitative,ADJ,32
3,analysis,NOUN,29
4,cost,NOUN,21




--- Lemmatized Chapitre 8 Plan Risk Responses ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,116
1,response,NOUN,67
2,project,NOUN,45
3,plan,NOUN,29
4,action,NOUN,28




--- Lemmatized Chapitre 9 Monitor And Control Risks ---


Unnamed: 0,Word,POS Tag,Frequency
0,risk,NOUN,389
1,project,NOUN,230
2,pmp,PROPN,186
3,project,PROPN,179
4,management,PROPN,167




All chapters have been POS tagged and tables have been saved.

Summary of top 5 words for each chapter:
 - Lemmatized Chapitre 1 Introduction:
    * management (PROPN) - 83 times
    * project (PROPN) - 80 times
    * project (NOUN) - 76 times
    * risk (PROPN) - 72 times
    * process (NOUN) - 37 times

 - Lemmatized Chapitre 2 Principles And Concepts:
    * risk (NOUN) - 69 times
    * project (NOUN) - 61 times
    * project (PROPN) - 36 times
    * management (PROPN) - 32 times
    * risk (PROPN) - 29 times

 - Lemmatized Chapitre 3 Introduction To Project Risk Management Processes:
    * risk (NOUN) - 63 times
    * project (NOUN) - 49 times
    * project (PROPN) - 47 times
    * management (PROPN) - 44 times
    * risk (PROPN) - 43 times

 - Lemmatized Chapitre 4 Plan Risk Management:
    * risk (NOUN) - 84 times
    * management (NOUN) - 66 times
    * project (NOUN) - 57 times
    * management (PROPN) - 43 times
    * project (PROPN) - 32 times

 - Lemmatized Chapitre 5 Ident

 extract important and noisy nouns based on their semantic similarity to domain-specific terms

In [13]:
!pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Using cached sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Using cached sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
Using cached transformers-4.45.2-py3-none-any.whl (9.9 MB)
Installing collected packages: transformers, sentence-transformers


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\admin\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\transformers\\models\\deprecated\\trajectory_transformer\\convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py'



In [14]:
import os
import re
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer, util
from IPython.core.display import HTML, display

# Load spaCy model for POS tagging
nlp = spacy.load("en_core_web_sm")

# Load pre-trained sentence transformer model for semantic similarity
model = SentenceTransformer('all-MiniLM-L6-v2')

def preprocess_text(text):
    """ Preprocess the text by normalizing, removing punctuation, and numbers. """
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove multiple spaces
    return text

def filter_invalid_words(words):
    """ Filter out invalid words such as single letters or badly lemmatized words. """
    return [word for word in words if len(word) > 1 and word.isalpha()]

def extract_frequent_nouns(pos_df, threshold=5):
    """ Extract nouns with frequency greater than the given threshold. """
    frequent_nouns_df = pos_df[(pos_df['POS Tag'] == 'NOUN') & (pos_df['Frequency'] > threshold)]
    return filter_invalid_words(frequent_nouns_df['Word'].dropna().tolist())

def extract_infrequent_nouns(pos_df, threshold=5):
    """ Extract nouns with frequency less than or equal to the given threshold. """
    infrequent_nouns_df = pos_df[(pos_df['POS Tag'] == 'NOUN') & (pos_df['Frequency'] <= threshold)]
    return filter_invalid_words(infrequent_nouns_df['Word'].dropna().tolist())

def compute_embeddings_and_filter(nouns, domain_specific_nouns, threshold=0.4):
    """ Compute sentence embeddings for nouns and filter based on semantic similarity. """
    if not nouns or not domain_specific_nouns:
        return [], nouns  # No important nouns if input lists are empty

    nouns_embeddings = model.encode(nouns, convert_to_tensor=True)
    domain_embeddings = model.encode(domain_specific_nouns, convert_to_tensor=True)

    similarities = util.pytorch_cos_sim(nouns_embeddings, domain_embeddings).numpy().max(axis=1)

    important_nouns = [(word, score) for word, score in zip(nouns, similarities) if score >= threshold]
    noisy_nouns = [(word, score) for word, score in zip(nouns, similarities) if score < threshold]

    # Ensure there are fewer noisy words than important ones
    if len(noisy_nouns) >= len(important_nouns):
        noisy_nouns = noisy_nouns[:len(important_nouns) - 1]

    return important_nouns, noisy_nouns

def adaptive_threshold(pos_df):
    """ Compute an adaptive threshold based on the word frequency distribution. """
    max_frequency = pos_df['Frequency'].max()
    if max_frequency > 100:
        return 0.4
    elif max_frequency > 50:
        return 0.3
    else:
        return 0.2

def display_side_by_side_table(chapter_name, noisy_df, important_df):
    """ Display noisy and important words side by side in a single HTML table for a given chapter. """
    chapter_title = f"<h2><b>{chapter_name}</b></h2>"
    table_html = (
        f"{chapter_title}"
        f"<table style='width:100%; border:1px solid black;'>"
        f"<tr>"
        f"<th style='text-align:left; border:1px solid black;'>Noisy Words (Sample)</th>"
        f"<th style='text-align:left; border:1px solid black;'>Important Words (Sample)</th>"
        f"</tr>"
        f"<tr>"
        f"<td style='width:50%; vertical-align:top; border:1px solid black;'>"
        f"{noisy_df.head(5).to_html(index=False)}</td>"
        f"<td style='width:50%; vertical-align:top; border:1px solid black;'>"
        f"{important_df.head(5).to_html(index=False)}</td>"
        f"</tr>"
        f"</table>"
    )
    display(HTML(table_html))

def process_chapters(pos_tables_dir, lemmatized_chapters_dir, noisy_words_dir, important_words_dir):
    """ Process each chapter individually, compute embeddings, and identify noisy and important nouns. """
    for pos_filename in os.listdir(pos_tables_dir):
        if pos_filename.endswith(".csv"):
            chapter_identifier = pos_filename.replace("POS_Table_", "").replace(".csv", "")
            lemmatized_path = os.path.join(lemmatized_chapters_dir, f"{chapter_identifier}.txt")
            
            if os.path.exists(lemmatized_path):
                with open(lemmatized_path, "r", encoding="utf-8") as f:
                    text = f.read()
                processed_text = preprocess_text(text)

                pos_df = pd.read_csv(os.path.join(pos_tables_dir, pos_filename))

                # Compute adaptive threshold
                adaptive_sim_threshold = adaptive_threshold(pos_df)

                # Extract domain-specific and infrequent nouns
                domain_specific_nouns = extract_frequent_nouns(pos_df)
                infrequent_nouns = extract_infrequent_nouns(pos_df)

                # Compute embeddings and filter important vs noisy nouns
                important_nouns, noisy_nouns = compute_embeddings_and_filter(
                    infrequent_nouns, domain_specific_nouns, threshold=adaptive_sim_threshold
                )

                # Save results to CSV
                noisy_nouns_df = pd.DataFrame(noisy_nouns, columns=["Word", "Similarity Score"])
                important_nouns_df = pd.DataFrame(important_nouns, columns=["Word", "Similarity Score"])

                noisy_nouns_df.to_csv(os.path.join(noisy_words_dir, f"Noisy_Words_{chapter_identifier}.csv"), index=False)
                important_nouns_df.to_csv(os.path.join(important_words_dir, f"Important_Words_{chapter_identifier}.csv"), index=False)

                # Display results side by side
                display_side_by_side_table(chapter_identifier, noisy_nouns_df, important_nouns_df)

if __name__ == "__main__":
    # Define directories
    pos_tables_dir = "output/pos_tables"
    lemmatized_chapters_dir = "output/lemmatized_chapters"
    noisy_words_dir = "output/noisy_words"
    important_words_dir = "output/important_words"

    # Create output directories if they don't exist
    os.makedirs(noisy_words_dir, exist_ok=True)
    os.makedirs(important_words_dir, exist_ok=True)

    # Process each chapter to identify noisy and important words
    process_chapters(pos_tables_dir, lemmatized_chapters_dir, noisy_words_dir, important_words_dir)


ModuleNotFoundError: No module named 'sentence_transformers'

extract and analyze noun phrases from lemmatized text using SpaCy

In [1]:
import os
import pandas as pd
import re
import spacy
import logging
from collections import Counter

# Configure logging
logging.basicConfig(level=logging.INFO)

# Define directories for the POS table and lemmatized chapter files
POS_TABLES_DIR = "output/pos_tables"
LEMMATIZED_CHAPTERS_DIR = "output/lemmatized_chapters"
CONCEPTS_OUTPUT_DIR = "output/concepts"

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Ensure the concepts output directory exists
os.makedirs(CONCEPTS_OUTPUT_DIR, exist_ok=True)

def normalize_and_clean_text(text):
    """ Normalize and clean text by converting to lowercase, removing punctuation and extra spaces. """
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_noun_phrases(doc):
    """ Extract multi-word noun phrases and single nouns from the document. """
    noun_phrases = []
    single_nouns = []

    for chunk in doc.noun_chunks:
        words = [token.text.lower() for token in chunk if token.pos_ == "NOUN"]
        if len(words) in [2, 3]:
            noun_phrases.append(' '.join(words))

    for token in doc:
        if token.pos_ == "NOUN" and token.dep_ != "compound":
            single_nouns.append(token.lemma_.lower())

    return noun_phrases, single_nouns

def adjust_single_noun_frequency(single_nouns, multi_word_counter):
    """ Adjust the frequency of single nouns based on occurrences in multi-word phrases. """
    single_noun_counter = Counter(single_nouns)
    for phrase, count in multi_word_counter.items():
        for word in phrase.split():
            single_noun_counter[word] = max(0, single_noun_counter[word] - count)
    return single_noun_counter

def calculate_concept_frequency(multi_word_nouns, single_nouns):
    """ Calculate the frequency of multi-word and single noun phrases. """
    multi_word_counter = Counter(multi_word_nouns)
    adjusted_single_noun_counter = adjust_single_noun_frequency(single_nouns, multi_word_counter)

    multi_word_df = pd.DataFrame(multi_word_counter.items(), columns=['Word', 'Frequency'])
    single_noun_df = pd.DataFrame(adjusted_single_noun_counter.items(), columns=['Word', 'Frequency'])

    combined_df = pd.concat([multi_word_df, single_noun_df], ignore_index=True)
    return combined_df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

def process_chapter_files(pos_tables_dir, lemmatized_chapters_dir, concepts_output_dir):
    """ Process each chapter file and extract concepts. """
    for pos_filename in os.listdir(pos_tables_dir):
        if pos_filename.endswith(".csv"):
            chapter_identifier = pos_filename.replace("POS_Table_", "").replace(".csv", "").lower()
            lemmatized_file_path = os.path.join(lemmatized_chapters_dir, f"{chapter_identifier}.txt")

            logging.info(f"Looking for lemmatized file: {lemmatized_file_path}")

            if os.path.exists(lemmatized_file_path):
                try:
                    with open(lemmatized_file_path, "r", encoding="utf-8") as f:
                        normalized_text = normalize_and_clean_text(f.read())
                except Exception as e:
                    logging.error(f"Error reading {lemmatized_file_path}: {e}")
                    continue
                
                if normalized_text:
                    doc = nlp(normalized_text)
                    multi_word_nouns, single_nouns = extract_noun_phrases(doc)
                    concept_freq_df = calculate_concept_frequency(multi_word_nouns, single_nouns)

                    logging.info(f"\n--- {chapter_identifier} ---")
                    if not concept_freq_df.empty:
                        logging.info("Most Pertinent Concepts:\n%s", concept_freq_df.head(20).to_string(index=False))
                    else:
                        logging.info("No concepts found.")

                    concept_freq_path = os.path.join(concepts_output_dir, f"Concepts_{chapter_identifier}.csv")
                    try:
                        concept_freq_df.to_csv(concept_freq_path, index=False)
                        logging.info(f"Concept frequencies saved to {concept_freq_path}")
                    except Exception as e:
                        logging.error(f"Error saving to {concept_freq_path}: {e}")
                else:
                    logging.info(f"No text found in {lemmatized_file_path}")
            else:
                logging.warning(f"Lemmatized file not found: {lemmatized_file_path}")

if __name__ == "__main__":
    process_chapter_files(POS_TABLES_DIR, LEMMATIZED_CHAPTERS_DIR, CONCEPTS_OUTPUT_DIR)


KeyboardInterrupt: 

In [62]:
import os

# Define the directory containing the lemmatized files
lemmatized_chapters_dir = 'output/lemmatized_chapters'

# Create a mapping of current filenames to the desired new filenames
rename_mapping = {
    'lemmatized_chapitre_1_introduction.txt': 'Chapitre_1_introduction.txt',
    'lemmatized_chapitre_2_principles_and_concepts.txt': 'Chapitre_2_principles_and_concepts.txt',
    'lemmatized_chapitre_3_introduction_to_project_risk_management_processes.txt': 'Chapitre_3_introduction_to_project_risk_management_processes.txt',
    'lemmatized_chapitre_4_plan_risk_management.txt': 'Chapitre_4_plan_risk_management.txt',
    'lemmatized_chapitre_5_identify_risks.txt': 'Chapitre_5_identify_risks.txt',
    'lemmatized_chapitre_6_perform_qualitative_risk_analysis.txt': 'Chapitre_6_perform_qualitative_risk_analysis.txt',
    'lemmatized_chapitre_7_perform_quantitative_risk_analysis.txt': 'Chapitre_7_perform_quantitative_risk_analysis.txt',
    'lemmatized_chapitre_8_plan_risk_responses.txt': 'Chapitre_8_plan_risk_responses.txt',
    'lemmatized_chapitre_9_monitor_and_control_risks.txt': 'Chapitre_9_monitor_and_control_risks.txt'
}

# Loop through the mapping and rename the files
for old_name, new_name in rename_mapping.items():
    old_path = os.path.join(lemmatized_chapters_dir, old_name)
    new_path = os.path.join(lemmatized_chapters_dir, new_name)
    
    # Check if the old file exists before renaming
    if os.path.isfile(old_path):
        os.rename(old_path, new_path)
        print(f'Renamed: {old_name} to {new_name}')
    else:
        print(f'File not found: {old_path}')

print('Renaming process completed.')


Renamed: lemmatized_chapitre_1_introduction.txt to Chapitre_1_introduction.txt
Renamed: lemmatized_chapitre_2_principles_and_concepts.txt to Chapitre_2_principles_and_concepts.txt
Renamed: lemmatized_chapitre_3_introduction_to_project_risk_management_processes.txt to Chapitre_3_introduction_to_project_risk_management_processes.txt
Renamed: lemmatized_chapitre_4_plan_risk_management.txt to Chapitre_4_plan_risk_management.txt
Renamed: lemmatized_chapitre_5_identify_risks.txt to Chapitre_5_identify_risks.txt
Renamed: lemmatized_chapitre_6_perform_qualitative_risk_analysis.txt to Chapitre_6_perform_qualitative_risk_analysis.txt
Renamed: lemmatized_chapitre_7_perform_quantitative_risk_analysis.txt to Chapitre_7_perform_quantitative_risk_analysis.txt
Renamed: lemmatized_chapitre_8_plan_risk_responses.txt to Chapitre_8_plan_risk_responses.txt
Renamed: lemmatized_chapitre_9_monitor_and_control_risks.txt to Chapitre_9_monitor_and_control_risks.txt
Renaming process completed.


In [65]:
import os
import pandas as pd
import re
import spacy
from collections import defaultdict
from spacy.matcher import PhraseMatcher
from IPython.display import display  # Ensure this is available in Jupyter

# Define directories for input and output
LEMMATIZED_CHAPTERS_DIR = "output/lemmatized_chapters"
CONCEPTS_DIR = "output/concepts"
SUMMARY_OUTPUT_DIR = "output/summary"

# Load spaCy model with necessary components
nlp = spacy.load('en_core_web_sm')

# Initialize PhraseMatcher for multi-word concept handling
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

# Ensure the output directory exists
os.makedirs(SUMMARY_OUTPUT_DIR, exist_ok=True)

# Function to normalize and clean text
def normalize_and_clean_text(text):
    """
    Normalize and clean the input text by converting to lowercase,
    removing punctuation, numbers, special characters, and extra spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation, numbers, and special characters (keeping only words and spaces)
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation with space
    text = re.sub(r'\d+', '', text)      # Remove numbers
    # Remove multiple spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to extract attributes based on defined patterns
def extract_attributes(doc, single_word_concepts, multi_word_concept_spans):
    """
    Extract attributes from the document based on the following patterns:
    - Adjective(s) + Concept
    - Concept + Noun Compound

    Parameters:
    - doc: spaCy parsed document
    - single_word_concepts: Set of pre-extracted single-word concepts
    - multi_word_concept_spans: List of Span objects representing multi-word concepts

    Returns:
    - attributes_dict: Dictionary mapping concepts to their attributes
    """
    attributes_dict = defaultdict(set)
    
    # Handle single-word concepts
    for token in doc:
        concept = token.lemma_.lower()
        if concept in single_word_concepts:
            # Adjective(s) + Concept
            adjectives = [child.text.lower() for child in token.lefts if child.pos_ == "ADJ"]
            for adj in adjectives:
                attributes_dict[concept].add(adj)
    
            # Concept + Noun Compound
            compounds = [child.text.lower() for child in token.lefts 
                         if child.dep_ == "compound" and child.pos_ == "NOUN"]
            for comp in compounds:
                attributes_dict[concept].add(comp)
    
    # Handle multi-word concepts
    for span in multi_word_concept_spans:
        # Get the root token of the span
        root = span.root
        concept_key = '_'.join(span.text.lower().split())  # Replace spaces with underscores
        
        # Adjective(s) + Concept
        adjectives = [child.text.lower() for child in root.lefts if child.pos_ == "ADJ"]
        for adj in adjectives:
            attributes_dict[concept_key].add(adj)
    
        # Concept + Noun Compound
        compounds = [child.text.lower() for child in root.lefts 
                     if child.dep_ == "compound" and child.pos_ == "NOUN"]
        for comp in compounds:
            attributes_dict[concept_key].add(comp)
    
    return attributes_dict

# Function to extract relationships based on defined patterns
def extract_relationships(doc, single_word_concepts, multi_word_concepts, multi_word_concept_spans):
    """
    Extract relationships from the document based on the following patterns:
    - Concept + Verb + Concept
    - Concept + Verb + Preposition + Concept

    Parameters:
    - doc: spaCy parsed document
    - single_word_concepts: Set of pre-extracted single-word concepts
    - multi_word_concepts: Set of pre-extracted multi-word concepts
    - multi_word_concept_spans: List of Span objects representing multi-word concepts

    Returns:
    - relationships_dict: Dictionary mapping subject concepts to a list of (relationship, related_concept)
    """
    relationships_dict = defaultdict(list)
    
    # Create a set of all multi-word concept strings with underscores
    multi_word_concept_strings = set('_'.join(span.text.lower().split()) for span in multi_word_concept_spans)
    
    for token in doc:
        # Focus only on verbs
        if token.pos_ == "VERB":
            # Find subjects that are concepts
            subjects = []
            for child in token.children:
                if child.dep_ in ("nsubj", "nsubjpass") and child.pos_ == "NOUN":
                    # Check if the subject is part of a multi-word concept
                    subject_span = None
                    for span in multi_word_concept_spans:
                        if child.i >= span.start and child.i < span.end:
                            subject_span = span
                            break
                    if subject_span:
                        subj = '_'.join(subject_span.text.lower().split())
                        subjects.append(subj)
                    elif child.lemma_.lower() in single_word_concepts:
                        subjects.append(child.lemma_.lower())
            
            if not subjects:
                continue  # No valid subjects
            
            # Pattern: Concept + Verb + Concept
            direct_objects = []
            for child in token.children:
                if child.dep_ in ("dobj", "attr") and child.pos_ == "NOUN":
                    # Check for multi-word concepts
                    object_span = None
                    for span in multi_word_concept_spans:
                        if child.i >= span.start and child.i < span.end:
                            object_span = span
                            break
                    if object_span:
                        obj = '_'.join(object_span.text.lower().split())
                        direct_objects.append(obj)
                    elif child.lemma_.lower() in single_word_concepts:
                        direct_objects.append(child.lemma_.lower())
            
            # Pattern: Concept + Verb + Preposition + Concept
            prepositions = [child for child in token.children if child.dep_ == "prep" and child.pos_ == "ADP"]
            for prep in prepositions:
                for grandchild in prep.children:
                    if grandchild.dep_ == "pobj" and grandchild.pos_ == "NOUN":
                        # Check for multi-word concepts
                        pobj_span = None
                        for span in multi_word_concept_spans:
                            if grandchild.i >= span.start and grandchild.i < span.end:
                                pobj_span = span
                                break
                        if pobj_span:
                            obj = '_'.join(pobj_span.text.lower().split())
                            direct_objects.append(obj)
                        elif grandchild.lemma_.lower() in single_word_concepts:
                            direct_objects.append(grandchild.lemma_.lower())
            
            # Add relationships to the dictionary
            for subj in subjects:
                for obj in direct_objects:
                    # Avoid self-relationships unless meaningful
                    if subj == obj:
                        continue
                    # Determine if the relationship is via preposition
                    relationship = f"{token.lemma_} {obj}"
                    relationships_dict[subj].append((relationship, obj))
    
    return relationships_dict

# Function to process each chapter file
def process_chapter_files(lemmatized_chapters_dir, concepts_dir, summary_output_dir, sample_size=5):
    """
    Process each chapter to extract attributes and relationships and save them into summary CSV files.
    Additionally, display a small sample of the summary table for each chapter.

    Parameters:
    - lemmatized_chapters_dir: Directory containing lemmatized chapter text files
    - concepts_dir: Directory containing pre-extracted concepts CSV files
    - summary_output_dir: Directory to save summary CSV files
    - sample_size: Number of sample rows to display per chapter
    """
    # Iterate through each concepts CSV file in the concepts directory
    for concepts_filename in os.listdir(concepts_dir):
        if concepts_filename.endswith(".csv"):
            concepts_path = os.path.join(concepts_dir, concepts_filename)
            
            # Extract chapter identifier from concepts filename
            # Expected format: Concepts_Chapitre_<number>_<title>.csv
            match = re.match(r'Concepts_lemmatized_(Chapitre_\d+_[\w_]+)\.csv', concepts_filename, re.IGNORECASE)
            if not match:
                print(f"Filename {concepts_filename} does not match expected pattern. Skipping.")
                continue
            chapter_identifier = match.group(1)  # e.g., Chapitre_9_Monitor_And_Control_Risks
            
            lemmatized_file_path = os.path.join(lemmatized_chapters_dir, f"{chapter_identifier}.txt")
            
            if os.path.exists(lemmatized_file_path):
                # Read and normalize the lemmatized chapter text
                try:
                    with open(lemmatized_file_path, "r", encoding="utf-8") as f:
                        lemmatized_text = f.read()
                    normalized_text = normalize_and_clean_text(lemmatized_text)
                except Exception as e:
                    print(f"Error reading {lemmatized_file_path}: {e}")
                    continue
                
                if normalized_text:
                    # Process the text with spaCy to get a parsed document
                    doc = nlp(normalized_text)
                    
                    # Load concepts for this chapter
                    try:
                        concepts_df = pd.read_csv(concepts_path)
                        # Separate single-word and multi-word concepts
                        single_word_concepts = set(concepts_df[~concepts_df['Word'].str.contains(' ')]['Word'].str.lower())
                        multi_word_concepts = set(concepts_df[concepts_df['Word'].str.contains(' ')]['Word'].str.lower())
                        # Create a dictionary for concept frequencies
                        concept_freq_dict = pd.Series(concepts_df.Frequency.values,
                                                     index=concepts_df.Word.str.lower()).to_dict()
                        # Prepare multi-word patterns for PhraseMatcher
                        multi_word_patterns = [nlp.make_doc(concept) for concept in multi_word_concepts]
                        matcher.add("MULTI_WORD_CONCEPT", multi_word_patterns)
                        matches = matcher(doc)
                        multi_word_concept_spans = []
                        for match_id, start, end in matches:
                            span = doc[start:end]
                            multi_word_concept_spans.append(span)
                    except Exception as e:
                        print(f"Error reading {concepts_path}: {e}")
                        continue
                    
                    # Extract Attributes
                    attributes_dict = extract_attributes(doc, single_word_concepts, multi_word_concept_spans)
                    
                    # Extract Relationships
                    relationships_dict = extract_relationships(doc, single_word_concepts, multi_word_concepts, multi_word_concept_spans)
                    
                    # Create Summary Table
                    summary_list = []
                    for concept in sorted(single_word_concepts.union(
                            {'_'.join(span.text.lower().split()) for span in multi_word_concept_spans})):
                        # Determine if the concept is multi-word
                        if '_' in concept:
                            display_concept = ' '.join(concept.split('_'))
                        else:
                            display_concept = concept
                        
                        # Get frequency (replace underscores with spaces for multi-word concepts)
                        freq = concept_freq_dict.get(display_concept, 0)
                        
                        # Get attributes
                        attrs = attributes_dict.get(concept, set())
                        attrs_str = ', '.join(sorted(attrs)) if attrs else ''
                        
                        # Get relationships
                        rels = relationships_dict.get(concept, [])
                        
                        # For each relationship, create a separate row
                        for rel, related_concept in rels:
                            # Determine if related_concept is multi-word
                            if '_' in related_concept:
                                display_related_concept = ' '.join(related_concept.split('_'))
                            else:
                                display_related_concept = related_concept
                            
                            summary_list.append({
                                'Concept': display_concept,
                                'Frequency': freq,
                                'Attributes': attrs_str,
                                'Relationship': rel,
                                'Related Concept': display_related_concept
                            })
                    
                    summary_df = pd.DataFrame(summary_list)
                    
                    # Sort the summary table based on frequency in descending order
                    summary_df.sort_values(by='Frequency', ascending=False, inplace=True)
                    
                    # Reset index after sorting
                    summary_df.reset_index(drop=True, inplace=True)
                    
                    # Save Summary Table to CSV
                    if not summary_df.empty:
                        summary_output_filename = f"Summary_{chapter_identifier}.csv"
                        summary_output_path = os.path.join(summary_output_dir, summary_output_filename)
                        try:
                            summary_df.to_csv(summary_output_path, index=False)
                            print(f"Saved summary for {chapter_identifier} to {summary_output_path}")
                        except Exception as e:
                            print(f"Error saving {summary_output_path}: {e}")
                    
                        # Display a small sample of the summary table based on top frequency
                        print(f"\nSample Summary for {chapter_identifier}:")
                        display_df = summary_df.head(sample_size)  # Display first 'sample_size' rows as a sample
                        display(display_df)
            else:
                print(f"Lemmatized file not found for {chapter_identifier}: {lemmatized_file_path}")
                continue

# Run the function to process all chapter files
# Remove the `if __name__ == "__main__":` block to allow execution in Jupyter Notebook
process_chapter_files(
    lemmatized_chapters_dir=LEMMATIZED_CHAPTERS_DIR,
    concepts_dir=CONCEPTS_DIR,
    summary_output_dir=SUMMARY_OUTPUT_DIR,
    sample_size=5  # Adjust the number of sample rows to display
)


Saved summary for chapitre_1_introduction to output/summary\Summary_chapitre_1_introduction.csv

Sample Summary for chapitre_1_introduction:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,management,24,"appropriate, change, characteristic, comprehen...",apply project,project
1,project,18,"ach, applicable, approach, assumption, attenti...",carry context_refl_ecte,context refl ecte
2,process,13,"chapter, clarifi, close, contain, culture, dea...",apply asset,asset
3,process,13,"chapter, clarifi, close, contain, culture, dea...",implement chapter,chapter
4,process,13,"chapter, clarifi, close, contain, culture, dea...",describe chapter_address,chapter address


Saved summary for chapitre_2_principles_and_concepts to output/summary\Summary_chapitre_2_principles_and_concepts.csv

Sample Summary for chapitre_2_principles_and_concepts:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,process,5,"assessment, assure, credibility, integral, ite...",operate conclusion,conclusion
1,project risk,4,"approach, aspect, cid, communication, effectiv...",represent project,project
2,project risk,4,"approach, aspect, cid, communication, effectiv...",affect project,project
3,project risk management,3,"aspect, cid, communication, consistent, effect...",include part_project_process,part project process
4,stakeholder,3,"advance, attitudes, consultation, exposure, fa...",differ group_stakeholder,group stakeholder


Saved summary for chapitre_3_introduction_to_project_risk_management_processes to output/summary\Summary_chapitre_3_introduction_to_project_risk_management_processes.csv

Sample Summary for chapitre_3_introduction_to_project_risk_management_processes:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,process,11,"available, element, essential, important, iter...",perform task,task
1,risk,9,"appropriate, careful, characteristic, clear, c...",use technique,technique
2,risk,9,"appropriate, careful, characteristic, clear, c...",consider effect,effect
3,risk,9,"appropriate, careful, characteristic, clear, c...",require risk_management_process,risk management process
4,action,7,"addition, additional, main, management, ning, ...",implement choose_strategy_action,choose strategy action


Saved summary for chapitre_4_plan_risk_management to output/summary\Summary_chapitre_4_plan_risk_management.csv

Sample Summary for chapitre_4_plan_risk_management:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,plan,15,"available, chance, communications, e, effectiv...",describe risk_management_process,risk management process
1,plan,15,"available, chance, communications, e, effectiv...",describe frequency,frequency
2,stakeholder,7,"acceptance, adapt, assess, attitude, authority...",infl number_factor,number factor
3,stakeholder,7,"acceptance, adapt, assess, attitude, authority...",refl manager,manager
4,order,4,"effective, evolve, project, scope, time",ensure consistency_agreement,consistency agreement


Saved summary for chapitre_5_identify_risks to output/summary\Summary_chapitre_5_identify_risks.csv

Sample Summary for chapitre_5_identify_risks:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,risk,11,"account, available, base, case, category, clar...",include project_risk_management,project risk management
1,risk,11,"account, available, base, case, category, clar...",include information,information
2,risk,11,"account, available, base, case, category, clar...",identify risks_process,risks process
3,technique,6,"assessment, combination, creativity, effective...",identify risk,risk
4,review,4,historical,use risk_breakdown_structure,risk breakdown structure


Saved summary for chapitre_6_perform_qualitative_risk_analysis to output/summary\Summary_chapitre_6_perform_qualitative_risk_analysis.csv

Sample Summary for chapitre_6_perform_qualitative_risk_analysis:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,risk,9,"advance, categorize, chain, characteristic, co...",ask decision_point_view,decision point view
1,risk,9,"advance, categorize, chain, characteristic, co...",assess priority,priority
2,risk,9,"advance, categorize, chain, characteristic, co...",assess priority_risk_impact,priority risk impact
3,risk,9,"advance, categorize, chain, characteristic, co...",require term_response,term response
4,use,5,"analysis, credibility, easy, nition, process, ...",perform risk_analysis_success,risk analysis success


Saved summary for chapitre_7_perform_quantitative_risk_analysis to output/summary\Summary_chapitre_7_perform_quantitative_risk_analysis.csv

Sample Summary for chapitre_7_perform_quantitative_risk_analysis:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,project,7,"accurate, achievement, analysis, appropriate, ...",proceed frequency_effort,frequency effort
1,risk analysis,6,"analysis, appropriate, new, objective, overall...",provide information,information
2,risk analysis,6,"analysis, appropriate, new, objective, overall...",perform risk_analysis_process,risk analysis process
3,risk analysis,6,"analysis, appropriate, new, objective, overall...",use method,method
4,bias,4,"combat, motivational, source",derive risk,risk


Saved summary for chapitre_8_plan_risk_responses to output/summary\Summary_chapitre_8_plan_risk_responses.csv

Sample Summary for chapitre_8_plan_risk_responses:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,process,8,"analysis, assignment, factor, good, plan, poss...",determine risk,risk
1,plan,8,"accordance, additional, analysis, analyze, app...",evaluate order,order
2,process,8,"analysis, assignment, factor, good, plan, poss...",describe section,section
3,process,8,"analysis, assignment, factor, good, plan, poss...",determine success,success
4,plan,8,"accordance, additional, analysis, analyze, app...",develop address,address


Saved summary for chapitre_9_monitor_and_control_risks to output/summary\Summary_chapitre_9_monitor_and_control_risks.csv

Sample Summary for chapitre_9_monitor_and_control_risks:


Unnamed: 0,Concept,Frequency,Attributes,Relationship,Related Concept
0,analysis,28,"additional, analysis, appropriate, carlo, caus...",earn value_analysis,value analysis
1,analysis,28,"additional, analysis, appropriate, carlo, caus...",create time,time
2,analysis,28,"additional, analysis, appropriate, carlo, caus...",use risk,risk
3,analysis,28,"additional, analysis, appropriate, carlo, caus...",earn use,use
4,analysis,28,"additional, analysis, appropriate, carlo, caus...",provide view,view


In [67]:
import os
import pandas as pd
import re
import spacy
from collections import defaultdict
from spacy.matcher import PhraseMatcher
from IPython.display import display  # For Jupyter Notebook display

# Define directories for input and output
LEMMATIZED_CHAPTERS_DIR = "output/lemmatized_chapters"
CONCEPTS_DIR = "output/concepts"
SUMMARY_OUTPUT_DIR = "output/summary"
NODES_OUTPUT_DIR = "output/nodes"
RELATIONSHIPS_OUTPUT_DIR = "output/relationships"

# Load spaCy model with necessary components
nlp = spacy.load('en_core_web_sm')

# Initialize PhraseMatcher for multi-word concept handling
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

# Ensure the output directories exist
os.makedirs(SUMMARY_OUTPUT_DIR, exist_ok=True)
os.makedirs(NODES_OUTPUT_DIR, exist_ok=True)
os.makedirs(RELATIONSHIPS_OUTPUT_DIR, exist_ok=True)

# Function to normalize and clean text
def normalize_and_clean_text(text):
    """
    Normalize and clean the input text by converting to lowercase,
    removing punctuation, numbers, special characters, and extra spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation, numbers, and special characters (keeping only words and spaces)
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation with space
    text = re.sub(r'\d+', '', text)      # Remove numbers
    # Remove multiple spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to extract attributes based on defined patterns
def extract_attributes(doc, single_word_concepts, multi_word_concept_spans):
    """
    Extract attributes from the document based on the following patterns:
    - Adjective(s) + Concept
    - Concept + Noun Compound

    Parameters:
    - doc: spaCy parsed document
    - single_word_concepts: Set of pre-extracted single-word concepts
    - multi_word_concept_spans: List of Span objects representing multi-word concepts

    Returns:
    - attributes_dict: Dictionary mapping concepts to their attributes
    """
    attributes_dict = defaultdict(set)
    
    # Handle single-word concepts
    for token in doc:
        concept = token.lemma_.lower()
        if concept in single_word_concepts:
            # Adjective(s) + Concept
            adjectives = [child.text.lower() for child in token.lefts if child.pos_ == "ADJ"]
            for adj in adjectives:
                attributes_dict[concept].add(adj)
    
            # Concept + Noun Compound
            compounds = [child.text.lower() for child in token.lefts 
                         if child.dep_ == "compound" and child.pos_ == "NOUN"]
            for comp in compounds:
                attributes_dict[concept].add(comp)
    
    # Handle multi-word concepts
    for span in multi_word_concept_spans:
        # Get the root token of the span
        root = span.root
        concept_key = '_'.join(span.text.lower().split())  # Replace spaces with underscores
        
        # Adjective(s) + Concept
        adjectives = [child.text.lower() for child in root.lefts if child.pos_ == "ADJ"]
        for adj in adjectives:
            attributes_dict[concept_key].add(adj)
    
        # Concept + Noun Compound
        compounds = [child.text.lower() for child in root.lefts 
                     if child.dep_ == "compound" and child.pos_ == "NOUN"]
        for comp in compounds:
            attributes_dict[concept_key].add(comp)
    
    return attributes_dict

# Function to extract relationships based on defined patterns
def extract_relationships(doc, single_word_concepts, multi_word_concepts, multi_word_concept_spans):
    """
    Extract relationships from the document based on the following patterns:
    - Concept + Verb + Concept
    - Concept + Verb + Preposition + Concept

    Parameters:
    - doc: spaCy parsed document
    - single_word_concepts: Set of pre-extracted single-word concepts
    - multi_word_concepts: Set of pre-extracted multi-word concepts
    - multi_word_concept_spans: List of Span objects representing multi-word concepts

    Returns:
    - relationships_dict: Dictionary mapping subject concepts to a list of (relationship, related_concept)
    """
    relationships_dict = defaultdict(list)
    
    # Create a set of all multi-word concept strings with underscores
    multi_word_concept_strings = set('_'.join(span.text.lower().split()) for span in multi_word_concept_spans)
    
    for token in doc:
        # Focus only on verbs
        if token.pos_ == "VERB":
            # Find subjects that are concepts
            subjects = []
            for child in token.children:
                if child.dep_ in ("nsubj", "nsubjpass") and child.pos_ == "NOUN":
                    # Check if the subject is part of a multi-word concept
                    subject_span = None
                    for span in multi_word_concept_spans:
                        if child.i >= span.start and child.i < span.end:
                            subject_span = span
                            break
                    if subject_span:
                        subj = '_'.join(subject_span.text.lower().split())
                        subjects.append(subj)
                    elif child.lemma_.lower() in single_word_concepts:
                        subjects.append(child.lemma_.lower())
            
            if not subjects:
                continue  # No valid subjects
            
            # Pattern: Concept + Verb + Concept
            direct_objects = []
            for child in token.children:
                if child.dep_ in ("dobj", "attr") and child.pos_ == "NOUN":
                    # Check for multi-word concepts
                    object_span = None
                    for span in multi_word_concept_spans:
                        if child.i >= span.start and child.i < span.end:
                            object_span = span
                            break
                    if object_span:
                        obj = '_'.join(object_span.text.lower().split())
                        direct_objects.append(obj)
                    elif child.lemma_.lower() in single_word_concepts:
                        direct_objects.append(child.lemma_.lower())
            
            # Pattern: Concept + Verb + Preposition + Concept
            prepositions = [child for child in token.children if child.dep_ == "prep" and child.pos_ == "ADP"]
            for prep in prepositions:
                for grandchild in prep.children:
                    if grandchild.dep_ == "pobj" and grandchild.pos_ == "NOUN":
                        # Check for multi-word concepts
                        pobj_span = None
                        for span in multi_word_concept_spans:
                            if grandchild.i >= span.start and grandchild.i < span.end:
                                pobj_span = span
                                break
                        if pobj_span:
                            obj = '_'.join(pobj_span.text.lower().split())
                            direct_objects.append(obj)
                        elif grandchild.lemma_.lower() in single_word_concepts:
                            direct_objects.append(grandchild.lemma_.lower())
            
            # Add relationships to the dictionary
            for subj in subjects:
                for obj in direct_objects:
                    # Avoid self-relationships unless meaningful
                    if subj == obj:
                        continue
                    # Determine if the relationship is via preposition
                    # For simplicity, we'll categorize relationships based on the verb
                    relationship = token.lemma_  # Use the verb lemma as the relationship type
                    relationships_dict[subj].append((relationship, obj))
    
    return relationships_dict

# Function to process each chapter file
def process_chapter_files(lemmatized_chapters_dir, concepts_dir, summary_output_dir, nodes_output_dir, relationships_output_dir, sample_size=5):
    """
    Process each chapter to extract attributes and relationships and save them into summary, nodes, and relationships CSV files.
    Additionally, display a small sample of the summary table for each chapter.

    Parameters:
    - lemmatized_chapters_dir: Directory containing lemmatized chapter text files
    - concepts_dir: Directory containing pre-extracted concepts CSV files
    - summary_output_dir: Directory to save summary CSV files
    - nodes_output_dir: Directory to save nodes CSV files
    - relationships_output_dir: Directory to save relationships CSV files
    - sample_size: Number of sample rows to display per chapter
    """
    # Initialize global aggregators
    global_attributes_dict = defaultdict(set)
    global_relationships_dict = defaultdict(list)
    global_concept_freq_dict = defaultdict(int)
    
    # Iterate through each concepts CSV file in the concepts directory
    for concepts_filename in os.listdir(concepts_dir):
        if concepts_filename.endswith(".csv"):
            concepts_path = os.path.join(concepts_dir, concepts_filename)
            
            # Extract chapter identifier from concepts filename
            # Expected format: Concepts_Chapitre_<number>_<title>.csv
            match = re.match(r'Concepts_lemmatized_(Chapitre_\d+_[\w_]+)\.csv', concepts_filename, re.IGNORECASE)
            if not match:
                print(f"Filename {concepts_filename} does not match expected pattern. Skipping.")
                continue
            chapter_identifier = match.group(1)  # e.g., Chapitre_9_Monitor_And_Control_Risks
            
            lemmatized_file_path = os.path.join(lemmatized_chapters_dir, f"{chapter_identifier}.txt")
            
            if os.path.exists(lemmatized_file_path):
                # Read and normalize the lemmatized chapter text
                try:
                    with open(lemmatized_file_path, "r", encoding="utf-8") as f:
                        lemmatized_text = f.read()
                    normalized_text = normalize_and_clean_text(lemmatized_text)
                except Exception as e:
                    print(f"Error reading {lemmatized_file_path}: {e}")
                    continue
                
                if normalized_text:
                    # Process the text with spaCy to get a parsed document
                    doc = nlp(normalized_text)
                    
                    # Load concepts for this chapter
                    try:
                        concepts_df = pd.read_csv(concepts_path)
                        # Ensure 'Word' and 'Frequency' columns exist
                        if not {'Word', 'Frequency'}.issubset(concepts_df.columns):
                            print(f"Concepts file {concepts_filename} missing required columns. Skipping.")
                            continue
                        # Separate single-word and multi-word concepts
                        single_word_concepts = set(concepts_df[~concepts_df['Word'].str.contains(' ')]['Word'].str.lower())
                        multi_word_concepts = set(concepts_df[concepts_df['Word'].str.contains(' ')]['Word'].str.lower())
                        # Update global concept frequency
                        for _, row in concepts_df.iterrows():
                            concept = row['Word'].lower()
                            freq = row['Frequency']
                            global_concept_freq_dict[concept] += freq
                        # Prepare multi-word patterns for PhraseMatcher
                        multi_word_patterns = [nlp.make_doc(concept) for concept in multi_word_concepts]
                        matcher.add("MULTI_WORD_CONCEPT", multi_word_patterns)
                        matches = matcher(doc)
                        multi_word_concept_spans = []
                        for match_id, start, end in matches:
                            span = doc[start:end]
                            multi_word_concept_spans.append(span)
                    except Exception as e:
                        print(f"Error reading {concepts_path}: {e}")
                        continue
                    
                    # Extract Attributes
                    attributes_dict = extract_attributes(doc, single_word_concepts, multi_word_concept_spans)
                    
                    # Aggregate global attributes
                    for concept, attrs in attributes_dict.items():
                        global_attributes_dict[concept].update(attrs)
                    
                    # Extract Relationships
                    relationships_dict = extract_relationships(doc, single_word_concepts, multi_word_concepts, multi_word_concept_spans)
                    
                    # Aggregate global relationships
                    for subj, rels in relationships_dict.items():
                        global_relationships_dict[subj].extend(rels)
                    
                    # Optionally, create and save per-chapter summaries
                    # [Omitted for brevity]
                    
                    # Display a small sample of the summary table based on top frequency
                    # [Omitted for brevity]
            else:
                print(f"Lemmatized file not found for {chapter_identifier}: {lemmatized_file_path}")
                continue
    
    # After processing all chapters, create combined nodes and relationships
    
    # Create Relationships DataFrame
    relationships_list = []
    for subj, rels in global_relationships_dict.items():
        for rel, obj in rels:
            relationships_list.append({
                'Source': subj,
                'Relationship': rel,
                'Target': obj
            })
    
    relationships_df = pd.DataFrame(relationships_list).drop_duplicates()
    
    # Identify all concepts that are part of relationships
    connected_concepts = set(relationships_df['Source']).union(set(relationships_df['Target']))
    
    # Create Nodes DataFrame
    nodes_list = []
    for concept in connected_concepts:
        display_concept = ' '.join(concept.split('_')) if '_' in concept else concept
        freq = global_concept_freq_dict.get(concept, 0)
        attrs = global_attributes_dict.get(concept, set())
        attrs_str = ', '.join(sorted(attrs)) if attrs else ''
        nodes_list.append({
            'Concept': display_concept,
            'Frequency': freq,
            'Attributes': attrs_str
        })
    
    nodes_df = pd.DataFrame(nodes_list).drop_duplicates(subset=['Concept'])
    
    # Save Combined Nodes CSV
    nodes_output_path = os.path.join(nodes_output_dir, "nodes.csv")
    try:
        nodes_df.to_csv(nodes_output_path, index=False)
        print(f"\nSaved combined nodes to {nodes_output_path}")
    except Exception as e:
        print(f"Error saving {nodes_output_path}: {e}")
    
    # Save Combined Relationships CSV
    relationships_output_path = os.path.join(relationships_output_dir, "relationships.csv")
    try:
        relationships_df.to_csv(relationships_output_path, index=False)
        print(f"Saved combined relationships to {relationships_output_path}")
    except Exception as e:
        print(f"Error saving {relationships_output_path}: {e}")
    
    # Optionally, display samples
    print("\nSample Nodes:")
    display(nodes_df.head(sample_size))
    
    print("\nSample Relationships:")
    display(relationships_df.head(sample_size))

# Run the function to process all chapter files
process_chapter_files(
    lemmatized_chapters_dir=LEMMATIZED_CHAPTERS_DIR,
    concepts_dir=CONCEPTS_DIR,
    summary_output_dir=SUMMARY_OUTPUT_DIR,
    nodes_output_dir=NODES_OUTPUT_DIR,
    relationships_output_dir=RELATIONSHIPS_OUTPUT_DIR,
    sample_size=5  # Adjust the number of sample rows to display
)





Saved combined nodes to output/nodes\nodes.csv
Saved combined relationships to output/relationships\relationships.csv

Sample Nodes:


Unnamed: 0,Concept,Frequency,Attributes
0,section,5,"analysis, d, describe, detail, detailed, main,..."
1,element,3,"analysis, analyst, cost, effect, follow, proje..."
2,lesson,10,"documenting, effectiveness, inclusion, opportu..."
3,example,27,"analysis, appropriate, breakdown, categorize, ..."
4,path,0,"critical, logical"



Sample Relationships:


Unnamed: 0,Source,Relationship,Target
0,guide,use,tool_technique_process
1,guide,use,project
2,guide,operate,signifi
3,area,provide,information_signifi
4,area,guide,edition
