In [None]:
import json
import re
from docx import Document
from docx.shared import Pt
import pandas as pd

In [None]:
def extract_from_word(docx_path, min_title_pt=16):
    """
    Extract apartment records from a Word document.

    A paragraph starts a new apartment when BOTH conditions hold:
      * its stripped text starts with "Splendom" (case-insensitive), and
      * at least one of its runs has an explicitly set font size of
        ``min_title_pt`` points or more.

    The first non-empty paragraph after a title is stored as the address
    ("geocoded_address"). Every following paragraph, up to the next title,
    is appended to the description, leaving out underlined runs.
    Paragraphs that appear before the first title are ignored.

    Args:
        docx_path: Passed unchanged to ``Document(...)``.
        min_title_pt: Minimum font size (in points, inclusive) a run must
            have for its paragraph to qualify as a title. Defaults to 16.

    Returns:
        A list of apartment dicts in document order. Only "name",
        "geocoded_address" and "description" are filled in here; the other
        keys are placeholders for later processing steps.
    """
    doc = Document(docx_path)

    # Apartment names must start with "Splendom" (case-insensitive).
    title_pattern = re.compile(r'(?i)^splendom.*')

    apartments = []
    current_apt = {}
    current_desc_parts = []

    # True from the moment a title is found until the first non-empty
    # paragraph after it has been consumed as the address.
    waiting_for_address = False

    for para in doc.paragraphs:
        clean_text = para.text.strip()

        # Largest explicitly set run size in the paragraph (0 when no run sets one).
        # NOTE(review): sizes inherited from a paragraph/character style presumably
        # come back as None here and are therefore not counted - confirm.
        max_size = max(
            (run.font.size.pt for run in para.runs if run.font.size),
            default=0,
        )

        if max_size >= min_title_pt and title_pattern.match(clean_text):
            # A new title closes the apartment that was being collected.
            if current_apt:
                current_apt["description"] = "".join(current_desc_parts).strip()
                apartments.append(current_apt)

            # Apartment structure
            current_apt = {
                "name": clean_text,
                "description": "",
                "lat": None,
                "lon": None,
                "geocoded_address": "",
                "Country": "Pending Excel Merge",
                "City": "Pending Excel Merge",
                "url": "",
                "supplier link": "",
                "email": "",
                "phone": ""
            }
            current_desc_parts = []
            waiting_for_address = True

        elif not current_apt:
            # Text before the first title belongs to no apartment.
            continue

        elif waiting_for_address:
            # Blank paragraphs between the title and the address are skipped.
            if clean_text:
                current_apt["geocoded_address"] = clean_text
                waiting_for_address = False

        else:
            # Description paragraph: keep only the runs that are NOT underlined.
            current_desc_parts.extend(
                run.text for run in para.runs if not run.font.underline
            )
            current_desc_parts.append("\n")

    # Close the last apartment, which has no following title to trigger the save.
    if current_apt:
        current_apt["description"] = "".join(current_desc_parts).strip()
        apartments.append(current_apt)

    return apartments

In [24]:
def _clean_cell(value):
    """Return an Excel cell as a stripped string; missing cells (NaN/None/NaT) become ""."""
    if pd.isna(value):
        return ""
    # Numeric cells can arrive as floats (e.g. 5551234.0 when the column also
    # has blanks); render whole numbers without the spurious ".0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def extract_from_excel(excel_path, apartments_data):
    """
    Reads an Excel file and merges its data into the apartment records.

    Rows are matched to apartments by name, ignoring case and surrounding
    whitespace. If several Excel rows share a name, the last one wins.
    Rows with a blank name are skipped, and blank cells are stored as ""
    so the result stays JSON-serialisable.

    Expected Excel columns:
    - Apartment Name
    - Splendom Link (maps to 'url')
    - Supplier Link (maps to 'supplier link')
    - Email
    - Phone

    Args:
        excel_path: Passed unchanged to ``pd.read_excel``.
        apartments_data: List of apartment dicts, each with a 'name' key.
            The dicts are updated in place.

    Returns:
        The same ``apartments_data`` list, with 'url', 'supplier link',
        'email' and 'phone' filled in for every apartment found in the sheet.

    Raises:
        KeyError: If a row is processed and an expected column is missing.
    """
    df = pd.read_excel(excel_path)

    # Lookup table: normalised apartment name -> cleaned contact fields.
    excel_data = {}
    for _, row in df.iterrows():
        apt_name = _clean_cell(row['Apartment Name']).lower()
        if not apt_name:
            # A row without a name cannot be matched to anything.
            continue
        excel_data[apt_name] = {
            'url': _clean_cell(row['Splendom Link']),
            'supplier link': _clean_cell(row['Supplier Link']),
            'email': _clean_cell(row['Email']),
            'phone': _clean_cell(row['Phone'])
        }

    # Merge Excel data into the apartment records.
    for apt in apartments_data:
        excel_row = excel_data.get(apt['name'].strip().lower())
        if excel_row is None:
            # Name the apartment so the missing row can actually be tracked down.
            print(f"No matching apartment in Excel for: {apt['name']!r}")
            continue
        apt.update(excel_row)

    return apartments_data

In [25]:
def save_to_json(word_path, excel_path, json_output_path):
    """
    Run the full pipeline: parse the Word document, enrich the records with
    the Excel sheet, and write the result to a JSON file.

    Args:
        word_path: Handed to ``extract_from_word``.
        excel_path: Handed to ``extract_from_excel``.
        json_output_path: File to write; opened in 'w' mode as UTF-8.

    Returns:
        None. The only result is the file at ``json_output_path``.
    """
    # Word extraction feeds directly into the Excel merge.
    apartments = extract_from_excel(excel_path, extract_from_word(word_path))

    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    with open(json_output_path, mode='w', encoding='utf-8') as out_file:
        json.dump(apartments, out_file, indent=4, ensure_ascii=False)

In [26]:
# Build the corpus from the sample Word/Excel inputs and write it to disk.
save_to_json(
    word_path="dummy_word.docx",
    excel_path="dummy_excel.xlsx",
    json_output_path="dummy_corpus.json",
)