In [3]:

# Run this once to install python-docx into the kernel's environment
# %pip install python-docx

In [22]:
## Lego superpowers engaged
from docx import Document
import pandas as pd
import os
import re

In [36]:
## Function to identify a table with just links
def is_links_table(table):
    """Return True when ``table`` is a single-cell table holding link material.

    A table qualifies when it has exactly one row containing exactly one cell
    and that cell's text contains an ``http://`` / ``https://`` prefix or the
    literal, case-sensitive phrase ``additional information``.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text``.

    Returns:
        bool: True for a 1x1 links/info table, False for anything else.
    """
    rows = table.rows
    if len(rows) != 1:
        return False

    only_row_cells = rows[0].cells
    if len(only_row_cells) != 1:
        return False

    content = only_row_cells[0].text
    # Either marker is enough to classify the lone cell as "links only".
    for marker in (r'https?://', r'additional information'):
        if re.search(marker, content):
            return True
    return False

## Extract all of the tables into dataFrames - return a list of dataFrames
def extract_tables_from_docx(docx_path):
    """Read the data tables of a Word document into pandas DataFrames.

    Tables that ``is_links_table`` recognises (1x1 tables holding URLs or an
    "additional information" note) are skipped, and so are tables that have
    no rows at all. For every remaining table the first row supplies the
    column labels and the rows after it become the data.

    Args:
        docx_path: Path to the ``.docx`` file; passed straight to ``Document``.

    Returns:
        list[pandas.DataFrame]: One DataFrame per retained table, in the order
        ``doc.tables`` yields them.
    """
    doc = Document(docx_path)
    tables_list = []
    for table in doc.tables:
        if is_links_table(table):
            continue  # Link-only tables carry no tabular data

        data = [[cell.text for cell in row.cells] for row in table.rows]
        if not data:
            # A table without rows has no header row; indexing data[0] would
            # raise IndexError and abort the whole document.
            continue

        # First row is the header, everything after it is the body.
        # NOTE(review): header text is used verbatim, so blank or repeated
        # header cells yield duplicate column labels in the DataFrame.
        header, *body = data
        tables_list.append(pd.DataFrame(body, columns=header))
    return tables_list

def process_folder(folder_path):
    """Extract the tables from every Word document directly inside a folder.

    Each ``.docx`` file (extension matched case-insensitively) is handed to
    ``extract_tables_from_docx``; every resulting DataFrame gets a
    ``'Source File'`` column holding the file name it came from. Files are
    visited in sorted name order so the position of a table in the returned
    list is reproducible between runs.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[pandas.DataFrame]: All extracted tables, grouped by file in
        sorted file-name order.
    """
    all_tables = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        # Word leaves "~$<name>.docx" owner/lock files next to open documents.
        # They end in .docx but are not valid packages, so opening them fails.
        if filename.startswith("~$"):
            continue
        if not filename.lower().endswith(".docx"):
            continue

        docx_path = os.path.join(folder_path, filename)
        tables = extract_tables_from_docx(docx_path)
        # Tag each table with its origin so rows stay traceable after merging
        for table in tables:
            table['Source File'] = filename
        all_tables.extend(tables)
    return all_tables


In [37]:
## Define the folder where all documents live
folder_path = r'unusnldwp2sampledata'

## Define where our first document lives. Built with os.path.join so the
## separator is correct on every OS (a literal backslash only works on Windows).
docx_path = os.path.join(folder_path, 'LIBERIA.docx')

In [55]:
# ## Test extraction with just one file
# tables = extract_tables_from_docx(docx_path)
# tables[0].head()

In [40]:
# Extract the tables from every .docx file in folder_path; each resulting
# DataFrame carries a 'Source File' column naming the document it came from.
all_tables = process_folder(folder_path)

In [56]:
# Display the second extracted table (list index 1) to eyeball the result
all_tables[1]

Unnamed: 0,Country,Explanatory notes,Example: LIBERIA,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Source File
0,Main event,Modulating event that produces one or more haz...,"Heavy Rainfall, SLR/Coastal Flooding in Montse...",,,,,LIBERIA.docx
1,Hazard,Single hazardous event causing loss and damage,Floods resulting from heavy Rainfall in margib...,,,,,LIBERIA.docx
2,Description & context,Describe the event and its context,"In recent years, climate change related coasta...",,,,,LIBERIA.docx
3,,,,,Needs,Sources of support,Gaps,LIBERIA.docx
4,1. Anticipatory Arrangement (Pre-Phase 1: befo...,Anticipatory arrangements are those that trigg...,Flood mapping has been done to identify hotspo...,,,,,LIBERIA.docx
5,2. Date event declared,,1998 & September 2019,,,,,LIBERIA.docx
6,1. Anticipatory Arrangement (Phase 1),"After event declared/announced, before event h...","Early warning systems (EPA,MoT, NDMA, MME-LHS)...",,,,,LIBERIA.docx
7,1. Anticipatory Arrangement (Phase 2),,\nAdvance cash transfers\nProcurement and prov...,,,,,LIBERIA.docx
8,1. Anticipatory Arrangement (Phase 3),,Pre-positioning of relief materials\nAssemblan...,,,,,LIBERIA.docx
9,,,,,,,,LIBERIA.docx


Unnamed: 0,Country,Explanatory notes,Example: LIBERIA,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Main event,Modulating event that produces one or more haz...,"Heavy Rainfall, SLR/Coastal Flooding in Montse...",,,,
1,Hazard,Single hazardous event causing loss and damage,Floods resulting from heavy Rainfall in margib...,,,,
2,Description & context,Describe the event and its context,"In recent years, climate change related coasta...",,,,
3,,,,,Needs,Sources of support,Gaps
4,1. Anticipatory Arrangement (Pre-Phase 1: befo...,Anticipatory arrangements are those that trigg...,Flood mapping has been done to identify hotspo...,,,,
5,2. Date event declared,,1998 & September 2019,,,,
6,1. Anticipatory Arrangement (Phase 1),"After event declared/announced, before event h...","Early warning systems (EPA,MoT, NDMA, MME-LHS)...",,,,
7,1. Anticipatory Arrangement (Phase 2),,\nAdvance cash transfers\nProcurement and prov...,,,,
8,1. Anticipatory Arrangement (Phase 3),,Pre-positioning of relief materials\nAssemblan...,,,,
9,,,,,,,
