## Import required modules

In [2]:
# Prerequisite, run once in a shell (or as `%pip install --upgrade python-docx` in a notebook cell): pip install --upgrade python-docx

In [4]:
import os 
import pandas as pd
from PyPDF2 import PdfReader
import openpyxl
import re
from docx import Document

In [6]:
# Make sure the input folder is present; an existing folder is left untouched.
os.makedirs(name='docs', exist_ok=True)

# Prompt the user to drop the files to scan into that folder.
print("Now load all your files to the docs folder...")

Now load all your files to the docs folder...


In [7]:
def _split_sentences(text):
    """Split text after '.', '!' or '?' followed by whitespace; drop empty pieces."""
    pieces = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [piece for piece in pieces if piece]


def extract_sentences(file_path):
    """Extract sentence-level content blocks from an .xlsx, .pdf or .docx file.

    Parameters
    ----------
    file_path : str or path-like
        File to read. The type is chosen from the (case-insensitive) file
        extension; any other extension yields an empty list.

    Returns
    -------
    list of dict
        One dict per sentence with the keys 'source_type' ('worksheet',
        'page' or 'paragraph'), 'source_name', 'content' and 'location'
        ('Row N, Col <name>' for worksheets, None otherwise).

    Notes
    -----
    Any exception raised while reading is caught and reported with print();
    the blocks collected up to that point are still returned.
    """
    content_blocks = []

    def add_blocks(text, source_type, source_name, location=None):
        # Shared by all three formats so the block schema stays identical.
        for sentence in _split_sentences(text):
            content_blocks.append({
                'source_type': source_type,
                'source_name': source_name,
                'content': sentence,
                'location': location
            })

    try:
        lower_path = str(file_path).lower()

        if lower_path.endswith('.xlsx'):
            # ExcelFile as a context manager closes the workbook on exit.
            with pd.ExcelFile(file_path, engine='openpyxl') as xls:
                for sheet_name in xls.sheet_names:
                    sheet_df = xls.parse(sheet_name)
                    for row_idx, row in sheet_df.iterrows():
                        for col_name, value in zip(sheet_df.columns, row):
                            # Empty cells come back as NaN/None; without this
                            # guard str(value) would emit a bogus 'nan' sentence.
                            if pd.isna(value):
                                continue
                            # row_idx is the DataFrame index label, so the number is
                            # relative to the parsed data rather than the Excel row
                            # number - TODO confirm the intended numbering.
                            add_blocks(
                                str(value),
                                'worksheet',
                                sheet_name,
                                f"Row {row_idx+1}, Col {col_name}",
                            )
        elif lower_path.endswith('.pdf'):
            # The file handle is closed when the with-block exits.
            with open(file_path, 'rb') as f:
                pdf = PdfReader(f)
                for page_num, page in enumerate(pdf.pages, 1):
                    page_text = page.extract_text()
                    if page_text:
                        add_blocks(page_text, 'page', f"Page {page_num}")
        elif lower_path.endswith('.docx'):
            # Only body paragraphs are read (doc.paragraphs).
            doc = Document(file_path)
            for para_num, paragraph in enumerate(doc.paragraphs, 1):
                if paragraph.text:
                    add_blocks(paragraph.text, 'paragraph', f"Paragraph {para_num}")

    except Exception as e:
        # Best effort: report the problem and keep whatever was collected.
        print(f"Error processing {file_path}: {str(e)}")

    return content_blocks

In [8]:
def find_keyword_matches(content_blocks, keywords):
    """Return one match record per (sentence, keyword) pair.

    A keyword matches when its lower-cased form occurs anywhere inside the
    lower-cased sentence (plain substring test). Each record carries the
    keys 'File Path', 'Source Type', 'Source Name', 'Location', 'Keyword'
    and 'Exact Sentence'; 'File Path' falls back to '' when the block has
    no 'file_path' entry.
    """
    hits = []

    for entry in content_blocks:
        haystack = entry['content'].lower()
        hits.extend(
            {
                'File Path': entry.get('file_path', ''),
                'Source Type': entry['source_type'],
                'Source Name': entry['source_name'],
                'Location': entry['location'],
                'Keyword': term,
                'Exact Sentence': entry['content'],
            }
            for term in keywords
            if term.lower() in haystack
        )

    return hits

In [9]:
def generate_report(folder_path, keywords, extensions=('.xlsx', '.pdf')):
    """Scan a folder tree and tabulate every sentence containing a keyword.

    Parameters
    ----------
    folder_path : str
        Root folder; it is walked recursively with os.walk.
    keywords : iterable of str
        Keywords passed on to find_keyword_matches.
    extensions : str or iterable of str, optional
        File-name suffixes to include, compared case-insensitively.
        Defaults to ('.xlsx', '.pdf'), the original hard-coded filter.
        Pass e.g. ('.xlsx', '.pdf', '.docx') to include Word documents.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns 'File Path', 'Source Type',
        'Source Name', 'Location', 'Keyword' and 'Exact Sentence'. The
        columns are present even when nothing matched.
    """
    columns = ['File Path', 'Source Type', 'Source Name',
               'Location', 'Keyword', 'Exact Sentence']

    # A bare string would otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    report = []

    for root, _, files in os.walk(folder_path):
        for file in files:
            if not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            content_blocks = extract_sentences(file_path)
            # Tag every block with its origin so matches can be traced back.
            for block in content_blocks:
                block['file_path'] = file_path
            report.extend(find_keyword_matches(content_blocks, keywords))

    # Fixing the schema keeps downstream column lookups working on an empty report.
    return pd.DataFrame(report, columns=columns)

In [10]:
# Configuration
folder_path = 'docs'  # Update this path
keywords = [  # Add more keywords as needed
    'Gender',
    'DEI',
    'Diversity',
    'Inclusion',
]

# Build the keyword report for the files found under folder_path and display it
df = generate_report(folder_path=folder_path, keywords=keywords)
df

Unnamed: 0,File Path,Source Type,Source Name,Location,Keyword,Exact Sentence
0,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 Full Workplan,"Row 2, Col Unnamed: 1",Inclusion,Inclusion Justificaton
1,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 Full Workplan,"Row 15, Col Unnamed: 1",Inclusion,Overview: The inclusion of routine maintenance...
2,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,COMPONENT 1-3,"Row 2, Col Unnamed: 1",Inclusion,Inclusion Justificaton
3,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,COMPONENT 1-3,"Row 11, Col Unnamed: 1",Inclusion,Overview: The inclusion of routine maintenance...
4,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 - Component 1 Updated,"Row 2, Col Unnamed: 1",Inclusion,Inclusion Justificaton
5,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 - Component 1 Updated,"Row 10, Col Unnamed: 1",Inclusion,Overview: The inclusion of routine maintenance...
6,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Activities,"Row 2, Col Unnamed: 2",Inclusion,Inclusion Justificaton
7,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Activities,"Row 15, Col Unnamed: 2",Inclusion,Overview: The inclusion of routine maintenance...
8,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 - Component 4 Q1,"Row 2, Col Unnamed: 2",Inclusion,Inclusion Justificaton
9,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,COMPONENT 4,"Row 2, Col Unnamed: 1",Inclusion,Inclusion Justificaton


In [11]:
# Persist the full match table next to the input folder, without the index column
df.to_csv(
    path_or_buf=f"{folder_path}_keyword_summary.csv",
    index=False,
)

In [12]:
# Show the report's column labels before they are renamed in the next cell
print(df.columns)

Index(['File Path', 'Source Type', 'Source Name', 'Location', 'Keyword',
       'Exact Sentence'],
      dtype='object')


In [13]:
# Shorten two column labels; rebinding df avoids mutating the frame in place
df = df.rename(
    columns={
        'File Path': 'Path',
        'Exact Sentence': 'Content',
    }
)
df.head()

Unnamed: 0,Path,Source Type,Source Name,Location,Keyword,Content
0,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 Full Workplan,"Row 2, Col Unnamed: 1",Inclusion,Inclusion Justificaton
1,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 Full Workplan,"Row 15, Col Unnamed: 1",Inclusion,Overview: The inclusion of routine maintenance...
2,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,COMPONENT 1-3,"Row 2, Col Unnamed: 1",Inclusion,Inclusion Justificaton
3,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,COMPONENT 1-3,"Row 11, Col Unnamed: 1",Inclusion,Overview: The inclusion of routine maintenance...
4,docs\Palladium\1. PK_Year5_CA_Workplan.xlsx,worksheet,Yr 5 - Component 1 Updated,"Row 2, Col Unnamed: 1",Inclusion,Inclusion Justificaton


In [14]:
# Derive 'Partner' from each path: drop the first path component and the file
# extension. The pattern accepts both '\' and '/' separators and any extension
# (in any letter case), so paths produced on non-Windows systems or files such
# as 'REPORT.PDF' no longer yield NaN.
df_copy = df.assign(
    Partner=df['Path'].str.extract(r'[\\/](.*)\.[^.\\/]+$', expand=False)
)[['Partner', 'Keyword', 'Content']]

df_copy

Unnamed: 0,Partner,Keyword,Content
0,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Inclusion Justificaton
1,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Overview: The inclusion of routine maintenance...
2,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Inclusion Justificaton
3,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Overview: The inclusion of routine maintenance...
4,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Inclusion Justificaton
5,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Overview: The inclusion of routine maintenance...
6,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Inclusion Justificaton
7,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Overview: The inclusion of routine maintenance...
8,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Inclusion Justificaton
9,Palladium\1. PK_Year5_CA_Workplan,Inclusion,Inclusion Justificaton


In [15]:
# Validation tests for the derived 'Partner' column
# 1) every row received a value from the extraction step
assert df_copy['Partner'].notna().all(), "❌ Missing values found in 'Partner' column"
# 2) every value contains at least one character that is not a path separator
assert df_copy['Partner'].str.contains(r'[^\\/]+', regex=True).all(), "🤣 'Partner' column contains invalid values"

print("✅ Validation passed: All extracted filenames are correct!")

✅ Validation passed: All extracted filenames are correct!


In [16]:
# Export the condensed table to Excel; the DataFrame index is written as the 'No.' column
df_copy.to_excel(
    excel_writer=f"{folder_path} Keywords Check.xlsx",
    index_label='No.',
    index=True,
)