In [54]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import csv

# Load the spaCy model for noun chunking and named entity recognition
nlp = spacy.load("en_core_web_sm")

# Define additional stopwords and irrelevant terms to filter out
additional_stopwords = {
    'guide', 'figure', 'standard', 'edition', 'chapter', 'table', 'example',
    'technique', 'section', 'd.', 'third', 'fourth', 'effective', 'date',
    'process', 'approach', 'criteria', 'success', 'strategy', 'objective', 
    'step', 'introduction', 'provides', 'project', 'management', 'pm', 'pmbo',
    'pmbok', 'pmi', 'phd', 'plan', 'methodology', 'tools', 'template', 
    'results', 'ensure', 'ensures', 'requires', 'actions', 'inputs', 'outputs', 'practice', 
    'techniques', 'body', 'document', 'resources', 'preparation', 'organization', 
    'involves', 'use', 'each', 'one', 'two', 'three', 'may', 'should', 
    'must', 'several', 'provide'
}

# Function to clean and preprocess text
def clean_text(text):
    # Remove punctuation and digits, and trim whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)       # Remove digits
    text = ' '.join(text.split())         # Remove extra whitespace
    text = text.strip().lower()            # Normalize the text to lower case

    # Remove leading "a " if it exists
    if text.startswith("a "):
        text = text[2:].strip()  # Remove the leading "a " and trim whitespace
    
    return text

# Function to filter out irrelevant or redundant terms
def is_relevant(term):
    # Check if the term is long enough and not in the additional stopwords
    return len(term.strip()) >= 3 and term.strip() not in additional_stopwords

# Extract noun chunks from the text using spaCy
def extract_noun_chunks(text):
    doc = nlp(text)
    return [clean_text(chunk.text) for chunk in doc.noun_chunks if is_relevant(clean_text(chunk.text))]

# Function to extract key terms using TF-IDF
def extract_key_terms(documents, top_n=20):
    vectorizer = TfidfVectorizer(max_df=0.9, stop_words="english", use_idf=True)
    X = vectorizer.fit_transform(documents)
    
    # Get terms with the highest TF-IDF scores
    terms = vectorizer.get_feature_names_out()
    tfidf_scores = X.sum(axis=0).A1
    sorted_terms = sorted(zip(terms, tfidf_scores), key=lambda x: x[1], reverse=True)
    
    # Return top N terms based on TF-IDF score
    return [term for term, score in sorted_terms[:top_n] if is_relevant(term)]

# Function to extract concepts from multiple chapters
def extract_concepts_from_chapters(chapters):
    all_text = ' '.join(chapters)  # Combine all chapters
    noun_chunks = extract_noun_chunks(all_text)  # Extract noun chunks
    top_tfidf_terms = extract_key_terms(chapters, top_n=50)  # Get top 50 TF-IDF terms
    combined_concepts = set(noun_chunks + top_tfidf_terms)
    
    # Filter out additional irrelevant terms
    return sorted(concept for concept in combined_concepts if is_relevant(concept))

# Path to your folder with cleaned chapter files
folder_path = 'all_chapters'

# Load all chapter files into a list of strings
chapters = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            chapters.append(file.read())

# Extract the final list of relevant concepts
concepts = extract_concepts_from_chapters(chapters)

# Save concepts to a CSV file with UTF-8 encoding and clean up the entries
with open('unique_concepts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Concepts'])  # Write header
    for concept in concepts:
        cleaned_concept = clean_text(concept)  # Clean the concept
        if cleaned_concept:  # Only write if the cleaned concept is not empty
            writer.writerow([cleaned_concept])

print(f"Extracted concepts: {concepts}")




In [59]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import csv

# Load the spaCy model for noun chunking and named entity recognition
nlp = spacy.load("en_core_web_sm")

# Define additional stopwords and irrelevant terms to filter out
additional_stopwords = {
    'guide', 'figure', 'standard', 'edition', 'chapter', 'table', 'example',
    'technique', 'section', 'd.', 'third', 'fourth', 'effective', 'date',
    'process', 'approach', 'criteria', 'success', 'strategy', 'objective', 
    'step', 'introduction', 'provides', 'project', 'management', 'pm', 'pmbo',
    'pmbok', 'pmi', 'phd', 'plan', 'methodology', 'tools', 'template', 
    'results', 'ensure', 'ensures', 'requires', 'actions', 'inputs', 'outputs', 
    'practice', 'techniques', 'body', 'document', 'resources', 'preparation', 
    'organization', 'involves', 'use', 'each', 'one', 'two', 'three', 'may', 
    'should', 'must', 'several', 'provide'
}

# Function to clean and preprocess text
def clean_text(text):
    # Remove punctuation and digits, and trim whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)       # Remove digits
    text = ' '.join(text.split())         # Remove extra whitespace
    text = text.strip().lower()            # Normalize the text to lower case

    # Remove leading "a " if it exists
    if text.startswith("a "):
        text = text[2:].strip()  # Remove the leading "a " and trim whitespace
    
    return text

# Function to filter out irrelevant or redundant terms
def is_relevant(term):
    term = term.strip()
    return len(term) >= 3 and term not in additional_stopwords

# Extract noun chunks from the text using spaCy
def extract_noun_chunks(text):
    doc = nlp(text)
    return [clean_text(chunk.text) for chunk in doc.noun_chunks if is_relevant(chunk.text)]

# Function to extract key terms using TF-IDF
def extract_key_terms(documents, top_n=20):
    vectorizer = TfidfVectorizer(max_df=0.9, stop_words="english", use_idf=True)
    X = vectorizer.fit_transform(documents)
    
    # Get terms with the highest TF-IDF scores
    terms = vectorizer.get_feature_names_out()
    tfidf_scores = X.sum(axis=0).A1
    sorted_terms = sorted(zip(terms, tfidf_scores), key=lambda x: x[1], reverse=True)
    
    # Return top N terms based on TF-IDF score
    return [term for term, score in sorted_terms[:top_n] if is_relevant(term)]

# Function to extract concepts from multiple chapters
def extract_concepts_from_chapters(chapters):
    all_text = ' '.join(chapters)  # Combine all chapters
    noun_chunks = extract_noun_chunks(all_text)  # Extract noun chunks
    top_tfidf_terms = extract_key_terms(chapters, top_n=50)  # Get top 50 TF-IDF terms
    combined_concepts = set(noun_chunks + top_tfidf_terms)
    
    # Filter out additional irrelevant terms and sort the results
    return sorted(concept for concept in combined_concepts if is_relevant(concept))

# Path to your folder with cleaned chapter files
folder_path = 'all_chapters'

# Load all chapter files into a list of strings
chapters = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            chapters.append(file.read())

# Extract the final list of relevant concepts
concepts = extract_concepts_from_chapters(chapters)

# Save concepts to a CSV file with UTF-8 encoding and clean up the entries
with open('unique_concepts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Concepts'])  # Write header
    for concept in concepts:
        cleaned_concept = clean_text(concept)  # Clean the concept
        if cleaned_concept:  # Only write if the cleaned concept is not empty
            writer.writerow([cleaned_concept])

print(f"Extracted concepts: {concepts}")






In [63]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import csv

# Load the spaCy model for noun chunking and named entity recognition
nlp = spacy.load("en_core_web_sm")

# Define additional stopwords and irrelevant terms to filter out
additional_stopwords = {
    'guide', 'figure', 'standard', 'edition', 'chapter', 'table', 'example',
    'technique', 'section', 'd.', 'third', 'fourth', 'effective', 'date',
    'process', 'approach', 'criteria', 'success', 'strategy', 'objective', 
    'step', 'introduction', 'provides', 'project', 'management', 'pm', 'pmbo',
    'pmbok', 'pmi', 'phd', 'plan', 'methodology', 'tools', 'template', 
    'results', 'ensure', 'ensures', 'requires', 'actions', 'inputs', 'outputs', 
    'practice', 'techniques', 'body', 'document', 'resources', 'preparation', 
    'organization', 'involves', 'use', 'each', 'one', 'two', 'three', 'may', 
    'should', 'must', 'several', 'provide', 'also', 'to', 'that', 'it', 'is', 
    'are', 'for', 'and', 'in', 'of', 'the', 'a', 'an', 'as', 'with', 'on', 
    'by', 'this', 'which', 'from', 'at', 'but', 'or', 'if', 'not'
}

# Function to clean and preprocess text
def clean_text(text):
    # Remove punctuation and digits, and trim whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)       # Remove digits
    text = ' '.join(text.split())         # Remove extra whitespace
    text = text.strip().lower()            # Normalize the text to lower case

    # Remove leading "a " if it exists
    if text.startswith("a "):
        text = text[2:].strip()  # Remove the leading "a " and trim whitespace
    
    return text

# Function to filter out irrelevant or redundant terms
def is_relevant(term):
    term = term.strip()
    return len(term) >= 4 and term not in additional_stopwords  # Set minimum length to 4

# Extract noun chunks from the text using spaCy
def extract_noun_chunks(text):
    doc = nlp(text)
    return [clean_text(chunk.text) for chunk in doc.noun_chunks if is_relevant(chunk.text)]

# Function to extract key terms using TF-IDF
def extract_key_terms(documents, top_n=20):
    vectorizer = TfidfVectorizer(max_df=0.9, stop_words="english", use_idf=True)
    X = vectorizer.fit_transform(documents)
    
    # Get terms with the highest TF-IDF scores
    terms = vectorizer.get_feature_names_out()
    tfidf_scores = X.sum(axis=0).A1
    sorted_terms = sorted(zip(terms, tfidf_scores), key=lambda x: x[1], reverse=True)
    
    # Return top N terms based on TF-IDF score
    return [term for term, score in sorted_terms[:top_n] if is_relevant(term)]

# Function to check if a concept is correct
def is_correct_concept(concept):
    # Implement your logic to check if the concept is correct
    # For example, you can check for specific criteria or patterns
    # Return True if the concept is correct, otherwise False
    if isinstance(concept, str) and len(concept.split()) > 1:  # Example condition
        return True
    return False

# Extract concepts from multiple chapters
def extract_concepts_from_chapters(chapters):
    all_text = ' '.join(chapters)  # Combine all chapters
    noun_chunks = extract_noun_chunks(all_text)  # Extract noun chunks
    top_tfidf_terms = extract_key_terms(chapters, top_n=50)  # Get top 50 TF-IDF terms
    
    # Combine and deduplicate concepts
    combined_concepts = set(noun_chunks + top_tfidf_terms)
    
    # Filter out additional irrelevant terms and sort the results
    return sorted(concept for concept in combined_concepts if is_relevant(concept))

# Path to your folder with cleaned chapter files
folder_path = 'all_chapters'

# Load all chapter files into a list of strings
chapters = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            chapters.append(file.read())

# Extract the final list of relevant concepts
concepts = extract_concepts_from_chapters(chapters)

# Filter concepts to remove incorrect ones
corrected_concepts = [concept for concept in concepts if is_correct_concept(concept)]

# Save concepts to a CSV file with UTF-8 encoding and clean up the entries
with open('unique_concepts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Concepts'])  # Write header
    for concept in corrected_concepts:
        cleaned_concept = clean_text(concept)  # Clean the concept
        if cleaned_concept:  # Only write if the cleaned concept is not empty
            writer.writerow([cleaned_concept])

print(f"Extracted and corrected concepts: {corrected_concepts}")






In [69]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import csv

# Load the spaCy model for noun chunking and named entity recognition
nlp = spacy.load("en_core_web_sm")

# Define additional stopwords and irrelevant terms to filter out
additional_stopwords = {
    'guide', 'figure', 'standard', 'edition', 'chapter', 'table', 'example',
    'technique', 'section', 'd.', 'third', 'fourth', 'effective', 'date',
    'process', 'approach', 'criteria', 'success', 'strategy', 'objective', 
    'step', 'introduction', 'provides', 'project', 'management', 'pm', 'pmbo',
    'pmbok', 'pmi', 'phd', 'plan', 'methodology', 'tools', 'template', 
    'results', 'ensure', 'ensures', 'requires', 'actions', 'inputs', 'outputs', 
    'practice', 'techniques', 'body', 'document', 'resources', 'preparation', 
    'organization', 'involves', 'use', 'may', 'should', 'must', 'several', 
    'provide', 'also', 'to', 'that', 'it', 'is', 'are', 'for', 'and', 'in', 
    'of', 'the', 'a', 'an', 'as', 'with', 'on', 'by', 'this', 'which', 
    'from', 'at', 'but', 'or', 'if', 'not', 'each', 'those', 'various', 
    'more', 'all', 'its', 'specific', 'opportunities'
}

# Function to clean and preprocess text
def clean_text(text):
    # Remove punctuation and digits, and trim whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)       # Remove digits
    text = ' '.join(text.split())         # Remove extra whitespace
    text = text.strip().lower()            # Normalize the text to lower case

    # Remove leading "a " if it exists
    if text.startswith("a "):
        text = text[2:].strip()  # Remove the leading "a " and trim whitespace
    
    return text

# Function to filter out irrelevant or redundant terms
def is_relevant(term):
    term = term.strip()
    return (len(term) >= 4 and 
            term not in additional_stopwords and 
            not starts_with_irrelevant(term) and 
            not is_common_pattern(term) and 
            not is_too_short(term))

# Function to check if the term starts with common irrelevant words or phrases
def starts_with_irrelevant(term):
    irrelevant_starters = ['this', 'those', 'each', 'various', 'more', 'all', 'specific', 'the']
    return any(term.lower().startswith(starter) for starter in irrelevant_starters)

# Function to identify common patterns in phrases to exclude
def is_common_pattern(term):
    # Regex pattern for common irrelevant structures
    common_patterns = [
        r'^(this|those|various|each)\s+\w+',  # Starts with "this," "those," "various," or "each"
        r'^(the\s+\w+)',                     # Starts with "the" followed by any word
        r'^\w+\s+\w+$',                      # Two-word phrases
        r'^\w{1,3}$',                        # Single words or very short phrases
        r'^\w+\s+(and|or|but|with)\s+\w+',  # Phrases like "risk and response"
        r'\b(?:not|applicable|known|etc)\b' # Words like 'not', 'known', 'applicable' as part of phrases
    ]
    return any(re.match(pattern, term) for pattern in common_patterns)

# Function to check if the term is too short or not descriptive enough
def is_too_short(term):
    # Exclude phrases with 1-2 words or less than 4 characters
    words = term.split()
    return len(words) < 3 or len(term) < 4

# Extract noun chunks from the text using spaCy
def extract_noun_chunks(text):
    doc = nlp(text)
    return [clean_text(chunk.text) for chunk in doc.noun_chunks if is_relevant(chunk.text)]

# Function to extract key terms using TF-IDF
def extract_key_terms(documents, top_n=20):
    vectorizer = TfidfVectorizer(max_df=0.9, stop_words="english", use_idf=True)
    X = vectorizer.fit_transform(documents)
    
    # Get terms with the highest TF-IDF scores
    terms = vectorizer.get_feature_names_out()
    tfidf_scores = X.sum(axis=0).A1
    sorted_terms = sorted(zip(terms, tfidf_scores), key=lambda x: x[1], reverse=True)
    
    # Return top N terms based on TF-IDF score
    return [term for term, score in sorted_terms[:top_n] if is_relevant(term)]

# Function to extract concepts from multiple chapters
def extract_concepts_from_chapters(chapters):
    all_text = ' '.join(chapters)  # Combine all chapters
    noun_chunks = extract_noun_chunks(all_text)  # Extract noun chunks
    top_tfidf_terms = extract_key_terms(chapters, top_n=50)  # Get top 50 TF-IDF terms
    
    # Combine and deduplicate concepts
    combined_concepts = set(noun_chunks + top_tfidf_terms)
    
    # Filter out additional irrelevant terms and sort the results
    return sorted(concept for concept in combined_concepts if is_relevant(concept))

# Path to your folder with cleaned chapter files
folder_path = 'all_chapters'

# Load all chapter files into a list of strings
chapters = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            chapters.append(file.read())

# Extract the final list of relevant concepts
concepts = extract_concepts_from_chapters(chapters)

# Save concepts to a CSV file with UTF-8 encoding and clean up the entries
with open('unique_concepts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Concepts'])  # Write header
    for concept in concepts:
        cleaned_concept = clean_text(concept)  # Clean the concept
        if cleaned_concept:  # Only write if the cleaned concept is not empty
            writer.writerow([cleaned_concept])

print(f"Extracted concepts: {concepts}")


Extracted concepts: ['acceptable risk exposure', 'accepted schedule risks', 'account probabilistic or projectwide effects', 'action planning project planning tools', 'add risk responses', 'additional dependent risks', 'additional response planning', 'additional risk identification', 'additional risk information', 'additional risk management actions', 'additional risk responses', 'additional secondary risks', 'address interviews addresses risks', 'addresses individual risks', 'afﬁ nity diagram', 'agreed deﬁ nitions', 'agreedupon deﬁ nitions', 'along four prioritized criteria d scenario analysis scenario analysis', 'already identiﬁ ed', 'an acceptable level', 'an additional overhead task', 'an agreedupon approach', 'an analytical technique', 'an application areaspeciﬁ c process', 'an appropriate method', 'an arrow points', 'an effective means', 'an emv calculation', 'an essential prerequisite', 'an established project management methodology', 'an ethical basis', 'an example cause and eff

In [1]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import csv

# Load the spaCy model for noun chunking and named entity recognition
nlp = spacy.load("en_core_web_sm")

# Define additional stopwords and irrelevant terms to filter out
additional_stopwords = {
    'guide', 'figure', 'standard', 'edition', 'chapter', 'table', 'example',
    'technique', 'section', 'd.', 'third', 'fourth', 'effective', 'date',
    'process', 'approach', 'criteria', 'success', 'strategy', 'objective', 
    'step', 'introduction', 'provides', 'project', 'management', 'pm', 'pmbo',
    'pmbok', 'pmi', 'phd', 'plan', 'methodology', 'tools', 'template', 
    'results', 'ensure', 'ensures', 'requires', 'actions', 'inputs', 'outputs', 
    'practice', 'techniques', 'body', 'document', 'resources', 'preparation', 
    'organization', 'involves', 'use', 'may', 'should', 'must', 'several', 
    'provide', 'also', 'to', 'that', 'it', 'is', 'are', 'for', 'and', 'in', 
    'of', 'the', 'a', 'an', 'as', 'with', 'on', 'by', 'this', 'which', 
    'from', 'at', 'but', 'or', 'if', 'not', 'each', 'those', 'various', 
    'more', 'all', 'its', 'specific', 'opportunities'
}

# Function to clean and preprocess text
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)       # Remove digits
    text = ' '.join(text.split())         # Remove extra whitespace
    text = text.strip().lower()            # Normalize the text to lower case

    # Remove leading "a " if it exists
    if text.startswith("a "):
        text = text[2:].strip()  # Remove the leading "a " and trim whitespace
    
    return text

# Function to filter out irrelevant or redundant terms
def is_relevant(term):
    term = term.strip()
    return (len(term) >= 4 and 
            term not in additional_stopwords and 
            not starts_with_irrelevant(term) and 
            not is_common_pattern(term) and 
            is_noun_phrase(term) and  # Ensure the term is a noun phrase
            not is_too_short(term))

# Function to check if the term starts with common irrelevant words or phrases
def starts_with_irrelevant(term):
    irrelevant_starters = ['this', 'those', 'each', 'various', 'more', 'all', 'specific', 'the']
    return any(term.lower().startswith(starter) for starter in irrelevant_starters)

# Function to identify common patterns in phrases to exclude
def is_common_pattern(term):
    common_patterns = [
        r'^(this|those|various|each)\s+\w+',  # Starts with "this," "those," "various," or "each"
        r'^(the\s+\w+)',                     # Starts with "the" followed by any word
        r'^\w+\s+\w+$',                      # Two-word phrases
        r'^\w{1,3}$',                        # Single words or very short phrases
        r'^\w+\s+(and|or|but|with)\s+\w+',  # Phrases like "risk and response"
        r'\b(?:not|applicable|known|etc)\b' # Words like 'not', 'known', 'applicable' as part of phrases
    ]
    return any(re.match(pattern, term) for pattern in common_patterns)

# Function to check if the term is too short or not descriptive enough
def is_too_short(term):
    # Exclude phrases with 1-2 words or less than 4 characters
    words = term.split()
    return len(words) < 3 or len(term) < 4

# Function to check if a term is a noun phrase
def is_noun_phrase(term):
    doc = nlp(term)
    # Count the number of nouns in the term
    noun_count = sum(1 for token in doc if token.pos_ == "NOUN")
    # Accept if the term has 1 to 3 nouns
    return 1 <= noun_count <= 3

# Extract noun chunks from the text using spaCy
def extract_noun_chunks(text):
    doc = nlp(text)
    return [clean_text(chunk.text) for chunk in doc.noun_chunks if is_relevant(chunk.text)]

# Function to extract key terms using TF-IDF
def extract_key_terms(documents, top_n=20):
    vectorizer = TfidfVectorizer(max_df=0.9, stop_words="english", use_idf=True)
    X = vectorizer.fit_transform(documents)
    
    # Get terms with the highest TF-IDF scores
    terms = vectorizer.get_feature_names_out()
    tfidf_scores = X.sum(axis=0).A1
    sorted_terms = sorted(zip(terms, tfidf_scores), key=lambda x: x[1], reverse=True)
    
    # Return top N terms based on TF-IDF score
    return [term for term, score in sorted_terms[:top_n] if is_relevant(term)]

# Function to extract concepts from multiple chapters
def extract_concepts_from_chapters(chapters):
    all_text = ' '.join(chapters)  # Combine all chapters
    noun_chunks = extract_noun_chunks(all_text)  # Extract noun chunks
    top_tfidf_terms = extract_key_terms(chapters, top_n=50)  # Get top 50 TF-IDF terms
    
    # Combine and deduplicate concepts
    combined_concepts = set(noun_chunks + top_tfidf_terms)
    
    # Filter out additional irrelevant terms and sort the results
    return sorted(concept for concept in combined_concepts if is_relevant(concept))

# Path to your folder with cleaned chapter files
folder_path = 'all_chapters'

# Load all chapter files into a list of strings
chapters = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            chapters.append(file.read())

# Extract the final list of relevant concepts
concepts = extract_concepts_from_chapters(chapters)

# Save concepts to a CSV file with UTF-8 encoding and clean up the entries
with open('unique_concepts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Concepts'])  # Write header
    for concept in concepts:
        cleaned_concept = clean_text(concept)  # Clean the concept
        if cleaned_concept:  # Only write if the cleaned concept is not empty
            writer.writerow([cleaned_concept])

print(f"Extracted concepts: {concepts}")




Extracted concepts: ['acceptable risk exposure', 'accepted schedule risks', 'account probabilistic or projectwide effects', 'add risk responses', 'additional dependent risks', 'additional response planning', 'additional risk identification', 'additional risk information', 'additional risk management actions', 'additional risk responses', 'additional secondary risks', 'addresses individual risks', 'afﬁ nity diagram', 'agreed deﬁ nitions', 'agreedupon deﬁ nitions', 'an acceptable level', 'an additional overhead task', 'an agreedupon approach', 'an analytical technique', 'an appropriate method', 'an arrow points', 'an effective means', 'an emv calculation', 'an essential prerequisite', 'an established project management methodology', 'an ethical basis', 'an example force', 'an expert scheduler', 'an exposure draft', 'an extension practice standard', 'an honest manner', 'an important component', 'an important customer', 'an important focus', 'an inappropriate amount', 'an increasingly glob

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import csv
from collections import Counter

# Load spaCy's model
nlp = spacy.load("en_core_web_sm")

# Define additional stopwords and non-informative terms specific to your domain
additional_stopwords = {
    'figure', 'chapter', 'standard', 'edition', 'table', 'section', 'example', 'technique', 'process',
    'criteria', 'approach', 'success', 'tools', 'introduction', 'plan', 'items', 'manuals',
    'references', 'implementation', 'use', 'based', 'methods', 'analysis involvement'
}

# Important single-word domain concepts that should be allowed as a single term
domain_whitelist = {"risk", "project", "management", "impact", "strategy", "control", "owner", "stakeholders", "schedule"}

# Stopword list includes spaCy's built-in stopwords + additional ones
stopwords = spacy.lang.en.stop_words.STOP_WORDS.union(additional_stopwords)

# Additional noise phrases from artifacts
noise_phrases = ['d d', 'eg', 'yn', 'yesnodont knownot', 'find risks', 'andor', 'ﬁ']

# Cleaning function for initial text preprocessing
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove digits
    text = re.sub(r'\s+', ' ', text)     # Normalize multiple spaces
    text = text.strip().lower()          # Normalize the text to lowercase
    return text

# Function to identify and filter noise or irrelevant terms
def is_noise_or_stopword(term):
    # Exclude if term is in noise phrases or stopwords and not in the whitelist
    if term in noise_phrases or term in stopwords:
        return True
    return term not in domain_whitelist and len(term) <= 3  # Exclude very short terms not in whitelist

# Function to ensure term contains only nouns (NOUN or PROPN) and is longer than just single stopword or noise
def is_valid_noun_phrase(doc):
    # Make sure every token is a noun, and length > 1 unless it's whitelisted
    return all(token.pos_ in {"NOUN", "PROPN"} for token in doc) and \
           (len(doc) > 1 or (len(doc) == 1 and doc.text in domain_whitelist))

# Cleaning and token validation in one step
def process_token(term):
    cleaned_term = clean_text(term)
    doc = nlp(cleaned_term)
    if is_valid_noun_phrase(doc) and not is_noise_or_stopword(cleaned_term):
        return cleaned_term
    return None

# Function to extract noun chunks and filter them for proper noun-only phrases and compound nouns
def extract_noun_chunks(text):
    doc = nlp(text)
    # Return only valid noun chunks
    noun_chunks = []
    for chunk in doc.noun_chunks:
        processed_chunk = process_token(chunk.text)
        if processed_chunk:
            noun_chunks.append(processed_chunk)
    return noun_chunks

# Extract key terms using TF-IDF with noun-only filtering
def extract_key_terms(documents, top_n=20):
    vectorizer = TfidfVectorizer(max_df=0.85, stop_words="english", use_idf=True)
    X = vectorizer.fit_transform(documents)
    terms = vectorizer.get_feature_names_out()
    tfidf_scores = X.sum(axis=0).A1
    sorted_terms = sorted(zip(terms, tfidf_scores), key=lambda x: x[1], reverse=True)

    return [term for term, score in sorted_terms[:top_n] if process_token(term)]

# Word frequency extractor with filtering logic
def extract_frequent_terms(text, min_count=3):
    words = [clean_text(word) for word in text.split() if word and len(word) > 3]
    word_freq = Counter(words)
    return [word for word, count in word_freq.items() if count >= min_count and process_token(word)]

# Function to extract final list of concepts from the given chapters by combining tf-idf, chunks, and frequencies
def extract_concepts_from_chapters(chapters):
    all_text = ' '.join(chapters)  # Combine all chapter texts

    # Extract noun phrases, high-frequency terms by count, and from TF-IDF rankings
    noun_chunks = extract_noun_chunks(all_text)
    top_tfidf_terms = extract_key_terms(chapters, top_n=50)
    frequent_terms = extract_frequent_terms(all_text)
    
    # Combine and deduplicate
    combined_concepts = set(noun_chunks + top_tfidf_terms + frequent_terms)
    
    return sorted(set(combined_concepts))

# Path to your folder with cleaned chapter files
folder_path = 'all_chapters'

# Load all the chapter text into a list of strings
chapters = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding="utf-8") as file:
            chapters.append(clean_text(file.read()))

# Extract and process final set of concepts
concepts = extract_concepts_from_chapters(chapters)

# Save the final concepts to a CSV file
with open('final_cleaned_concepts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Concepts'])  # Header row
    for concept in concepts:
        if concept:  # If the concept is non-empty
            writer.writerow([concept])

# Display the recalculated concepts
print(f"Extracted unique concepts: {concepts}")

In [58]:
import csv
import ollama

# Load concepts from the CSV file
def load_concepts_from_csv(file_path):
    concepts = []
    try:
        with open(file_path, 'r', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # Skip header
            for row in reader:
                if row:  # Ensure the row is not empty
                    concepts.append(row[0].strip())  # Assuming the concept is in the first column and stripping whitespace
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred while loading concepts: {e}")
    return concepts

def call_llama_model(concepts):
    # Prepare the prompt for the Llama model
    prompt = (
        "Please provide a filtered list of relevant concepts related to project risk management from the following terms:\n"
        f"{', '.join(concepts)}\n\n"
        "Output only the relevant terms in a numbered list format, with no additional commentary."
    )
    
    # Call the Llama model
    try:
        response = ollama.generate(
            model="llama3.1:latest",
            prompt=prompt
        )
        
        # Check the structure of the response
        if isinstance(response, dict) and 'response' in response:
            return response['response'].strip().split('\n')  # Split by new lines for a list format
        else:
            print("Unexpected response format:", response)
            return []  # Return an empty list in case of unexpected format
    except Exception as e:
        print(f"An error occurred while calling the Llama model: {e}")
        return []  # Return an empty list in case of error

# Function to correct formatting issues in concepts
def correct_formatting(concepts):
    return [concept.replace("speciﬁ c", "specific") for concept in concepts]  # Add more replacements as needed

# Main process
if __name__ == "__main__":
    try:
        concepts = load_concepts_from_csv('unique_concepts.csv')  # Load concepts from CSV
        corrected_concepts = correct_formatting(concepts)  # Correct formatting issues
        relevant_concepts = call_llama_model(corrected_concepts)  # Get relevant concepts from Llama

        # Save relevant concepts to a new CSV file
        with open('filtered_concepts.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Concepts'])  # Write header
            for concept in relevant_concepts:
                writer.writerow([concept])  # Write relevant concepts

        print("Filtered concepts saved to 'filtered_concepts.csv'")

    except Exception as e:
        print(f"An error occurred: {e}")


Filtered concepts saved to 'filtered_concepts.csv'
