In [3]:
# Only need to run this one time to get python-docx installed in the Env
# !pip install python-docx

In [1]:
from docx import Document
import pandas as pd
import os

In [2]:
## Extract all of the tables into dataFrames - return a list of dataFrames
def extract_tables_from_docx(docx_path):
    doc = Document(docx_path)
    tables_list = []
    for table in doc.tables:
        data = [[cell.text for cell in row.cells] for row in table.rows]
        df_table = pd.DataFrame(data[1:], columns=data[0])  # Use the first row as column headers
        tables_list.append(df_table)
    return tables_list

## Extract all tables from all of the docx files in a folder
def process_folder(folder_path):
    all_tables = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):
            docx_path = os.path.join(folder_path, filename)
            tables = extract_tables_from_docx(docx_path)
            all_tables.extend(tables) # This assumes you want to collect all tables from all documents
    return all_tables


In [5]:
## Define where our first document lives
docx_path = r'unusnldwp2sampledata\LIBERIA.docx'

## Define the folder where all documents live
folder_path = r'unusnldwp2sampledata'

In [6]:
tables = extract_tables_from_docx(docx_path)
tables

[                                              Country  \
 0                                          Main event   
 1                                              Hazard   
 2                               Description & context   
 3                                                       
 4   1. Anticipatory Arrangement (Pre-Phase 1: befo...   
 5                              2. Date event declared   
 6               1. Anticipatory Arrangement (Phase 1)   
 7               1. Anticipatory Arrangement (Phase 2)   
 8               1. Anticipatory Arrangement (Phase 3)   
 9                                                       
 10                           2. Start of actual event   
 11                   2. Duration of the primary event   
 12  2. Overall duration of event and its related e...   
 13                   3. Response (Phase 1) - Duration   
 14                   3. Response (Phase 1) - Triggers   
 15                    3. Response (Phase 1) - Actions   
 16           

In [7]:
all_tables = process_folder(folder_path)

# Now you have a list of DataFrames, each representing a table from your collection of .docx files

In [8]:
all_tables

[                                                  Pay  \
 0                                 Événement principal   
 1                                     Aléa climatique   
 2                              Description & contexte   
 3                                                       
 4   1. Dispositions anticipées (Phase préliminaire...   
 5               2. Date de déclaration de l'événement   
 6                1. Dispositions anticipées (Phase 1)   
 7                 1.Dispositions anticipées (Phase 2)   
 8                1. Dispositions anticipées (Phase 3)   
 9                                                       
 10                            2. Début de l'événement   
 11                  2. Durée de l’événement principal   
 12  2. Durée général de l'événement et ses effets ...   
 13                       3. Réponse (Phase 1) – Durée   
 14                3. Réponse (Phase 1) - Déclencheurs   
 15                     3. Réponse (Phase 1) - Actions   
 16           

In [9]:
all_tables[4].describe()

Unnamed: 0,Country,Explanatory notes,"Example: Blantyre, Malawi",Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
count,31.0,31.0,31.0,31.0,31.0,31.0,31.0
unique,30.0,24.0,23.0,1.0,2.0,2.0,2.0
top,,,,,,,
freq,2.0,8.0,8.0,31.0,30.0,30.0,30.0


In [26]:
tables[0]

Unnamed: 0,Country,Explanatory notes,Example: LIBERIA,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Main event,Modulating event that produces one or more haz...,"Heavy Rainfall, SLR/Coastal Flooding in Montse...",,,,
1,Hazard,Single hazardous event causing loss and damage,Floods resulting from heavy Rainfall in margib...,,,,
2,Description & context,Describe the event and its context,"In recent years, climate change related coasta...",,,,
3,,,,,Needs,Sources of support,Gaps
4,1. Anticipatory Arrangement (Pre-Phase 1: befo...,Anticipatory arrangements are those that trigg...,Flood mapping has been done to identify hotspo...,,,,
5,2. Date event declared,,1998 & September 2019,,,,
6,1. Anticipatory Arrangement (Phase 1),"After event declared/announced, before event h...","Early warning systems (EPA,MoT, NDMA, MME-LHS)...",,,,
7,1. Anticipatory Arrangement (Phase 2),,\nAdvance cash transfers\nProcurement and prov...,,,,
8,1. Anticipatory Arrangement (Phase 3),,Pre-positioning of relief materials\nAssemblan...,,,,
9,,,,,,,


In [13]:
## Combine the tables together

if tables:
    combined_df = pd.concat(tables, ignore_index=True)
    print(combined_df)
else:
    print("No tables found.")

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [9]:
# Extract just the first table into a dataFrame so we can see what this looks like
def extract_first_table_from_docx(docx_path):
    doc = Document(docx_path)
    for table in doc.tables:
        data = [[cell.text for cell in row.cells] for row in table.rows]
        df_table = pd.DataFrame(data[1:], columns=data[0])  # Use the first row as column headers
        return df_table  # Return the first table found
    return None  # Return None if no tables are found

In [10]:
singletable = extract_first_table_from_docx(docx_path)
singletable.head()

Unnamed: 0,Country,Explanatory notes,Example: LIBERIA,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Main event,Modulating event that produces one or more haz...,"Heavy Rainfall, SLR/Coastal Flooding in Montse...",,,,
1,Hazard,Single hazardous event causing loss and damage,Floods resulting from heavy Rainfall in margib...,,,,
2,Description & context,Describe the event and its context,"In recent years, climate change related coasta...",,,,
3,,,,,Needs,Sources of support,Gaps
4,1. Anticipatory Arrangement (Pre-Phase 1: befo...,Anticipatory arrangements are those that trigg...,Flood mapping has been done to identify hotspo...,,,,


In [None]:
docx_path = 'your_doc_file.docx'
tables = extract_tables_from_docx(docx_path)

if tables:
    combined_df = pd.concat(tables, ignore_index=True)
    print(combined_df)
else:
    print("No tables found.")