# 청약 applyhome

# 1. Data processing

In [1]:
pwd

'/Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data_processing'

In [2]:
!pip install pdfplumber



In [3]:
import os
import pdfplumber
import pandas as pd
import re
import random

# 현재 작업 디렉토리 확인
current_dir = "/Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan"

# 상대 경로 설정
api_data_path = os.path.join(current_dir, "data/api_data")

# PDF에서 표를 추출하여 데이터프레임으로 변환하는 함수 (특정 키워드 필터링)
def extract_filtered_tables_from_pdf(pdf_path, keyword="공급금액"):
    try:
        tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                page_tables = page.extract_tables()
                for table in page_tables:
                    df = pd.DataFrame(table[1:], columns=table[0])  # 첫 번째 행을 컬럼으로 설정
                    if any(keyword in str(cell) for row in table for cell in row):
                        tables.append(df)
        return tables
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return []

# PDF 파일 읽기 함수
def read_pdf_files(directory_path):
    pdf_files = []
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

# 텍스트 추출 함수
def extract_text_from_pdf(pdf_path):
    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
        return text.strip()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None

# 메타데이터 추출 함수
def extract_metadata(text, max_supply_price):
    metadata = {
        "supply_name": None,
        "region_name": None,
        "application_schedule": None,
        "special_supply_conditions": [],
        "enter_day": None,
        "max_supply_price": max_supply_price
    }

    # 공급명 추출
    supply_name_match = re.search(r"입주자모집공고주요정보\s*(.+)", text)
    if supply_name_match:
        metadata["supply_name"] = supply_name_match.group(1).strip()

    # 지역명 추출
    region_name_match = re.search(r"공급위치\s*(.+)", text)
    if region_name_match:
        metadata["region_name"] = region_name_match.group(1).strip()

    # 청약일정 추출
    schedule_match = re.search(r"당첨자 발표일\s*(\d{4}-\d{2}-\d{2})", text)
    if schedule_match:
        metadata["application_schedule"] = schedule_match.group(1).strip()

    
    # 특별공급조건 추출
    special_conditions_keywords = ["다자녀", "신혼부", "생애최", "노부모", "신생아", "청년"]
    for keyword in special_conditions_keywords:
        if keyword in text:
            metadata["special_supply_conditions"].append(keyword)

    # 입주예정월 추출
    enter_day_match = re.search(r"입주예정월 :\s*(\d{4}\.\d{2})", text)
    if enter_day_match:
        metadata["enter_day"] = enter_day_match.group(1).strip()

    return metadata

# PDF 파일 가져오기
api_pdfs = read_pdf_files(api_data_path)

# API PDF 메타데이터 추출
api_metadata = []
if api_pdfs:
    for pdf_path in api_pdfs:
        print(f"Reading API PDF: {pdf_path}")
        text = extract_text_from_pdf(pdf_path)

        max_supply_price = None
        filtered_tables = extract_filtered_tables_from_pdf(pdf_path)
        for table in filtered_tables:
            string_values = table.stack().apply(lambda x: x if isinstance(x, str) else None).dropna()
            split_values = string_values.str.split(expand=True).stack()
            numeric_values = split_values.apply(lambda x: int(x.replace(",", "")) if x.replace(",", "").isdigit() else None).dropna()
            if not numeric_values.empty:
                max_supply_price = numeric_values.min()
                break

        if text:
            metadata = extract_metadata(text, max_supply_price)
            api_metadata.append(metadata)

# # 메타데이터 출력
# for i, metadata in enumerate(api_metadata):
#     print(f"\nMetadata {i+1}:")
#     for key, value in metadata.items():
#         print(f"{key}: {value}")


Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/9.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/8.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/6.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/7.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/5.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/4.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/1.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/3.pdf
Reading API PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/api_data/2.pdf

Metadata 1:
supply_name: 서대문 센트럴 아이파크(임의공급 8차)
region_name: 서울특별시 서대문구 홍은동 11-111번지 일원
application_schedule: 2025-01-10
special_supply_conditions: []
enter_day: 2025.06
ma

In [5]:
import os
import pdfplumber
import re

# 현재 작업 디렉토리 확인
current_dir = "/Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan"

# 상대 경로 설정
crawl_data_path = os.path.join(current_dir, "data/crawl_data")

# PDF 파일 읽기 함수
def read_pdf_files(directory_path):
    pdf_files = []
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

# 텍스트 추출 함수
def extract_text_from_pdf(pdf_path):
    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
        return text.strip()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None

# 메타데이터 추출 함수
def extract_metadata(text):
    metadata = {
        "supply_name": None,
        "region_name": None,
        "supply_type": None,
        "area": None,
        "application_schedule": None,
        "special_supply_conditions": []
    }

    # 공급명 추출
    supply_name_match = re.search(r"(.+?)\.pdf 바로보기", text)
    if supply_name_match:
        metadata["supply_name"] = supply_name_match.group(1).strip()

    # 지역명 추출
    region_name_match = re.search(r"모집지역\s*:\s*(.+?)$|소재지\s*:\s*(.+?)$", text, re.MULTILINE)
    if region_name_match:
        region_name = region_name_match.group(1) or region_name_match.group(2)
        if "확인" not in region_name:
            metadata["region_name"] = region_name.strip()

    # 공급유형 추출
    supply_type_match = re.search(r"유형\s*:\s*(.+?)\s", text)
    if supply_type_match:
        metadata["supply_type"] = supply_type_match.group(1).strip()

    # 면적 추출
    area_match = re.search(r"전용면적\(㎡\)\s*:\s*(\d+\.\d+)", text)
    if area_match:
        metadata["area"] = area_match.group(1).strip()

    # 청약일정 추출
    schedule_match = re.search(r"접수기간\s*:\s*(\d{4}\.\d{2}\.\d{2})\s*~\s*(\d{4}\.\d{2}\.\d{2})", text)
    if schedule_match:
        metadata["application_schedule"] = schedule_match.group(2).strip()

    # 특별공급조건 추출
    special_conditions_keywords = ["다자녀", "신혼", "생애", "노부모", "신생아", "청년"]
    for keyword in special_conditions_keywords:
        if keyword in text:
            metadata["special_supply_conditions"].append(keyword)

    return metadata

# PDF 파일 가져오기
crawl_pdfs = read_pdf_files(crawl_data_path)

# Crawl PDF 메타데이터 추출
crawl_metadata = []
if crawl_pdfs:
    for pdf_path in crawl_pdfs:
        print(f"Reading Crawl PDF: {pdf_path}")
        text = extract_text_from_pdf(pdf_path)
        if text:
            metadata = extract_metadata(text)
            crawl_metadata.append(metadata)

# # 메타데이터 출력
# for i, metadata in enumerate(crawl_metadata):
#     print(f"\nMetadata {i+1}:")
#     for key, value in metadata.items():
#         print(f"{key}: {value}")


Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/9.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/8.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/16.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/17.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/29.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/15.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/14.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/28.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/10.pdf
Reading Crawl PDF: /Users/hyottz/Desktop/24f-houseplan/24f_daiv_houseplan/data/crawl_data/38.pdf
Reading Crawl PDF: /Users/hyottz

# 2. keyword parsing

In [7]:
import pandas as pd
import numpy as np

# api_metadata를 DataFrame으로 변환
api_data = pd.DataFrame(api_metadata)
crawl_data = pd.DataFrame(crawl_metadata)

In [8]:
# 결합
combined_data = pd.concat([api_data, crawl_data], ignore_index=True)

# 결과 확인
print(combined_data.head())

             supply_name                   region_name application_schedule  \
0  서대문 센트럴 아이파크(임의공급 8차)    서울특별시 서대문구 홍은동 11-111번지 일원           2025-01-10   
1        판교밸리자이 3단지 오피스텔         경기도 성남시 수정구 대왕판교로 961           2025-01-10   
2      천안 두정역 양우내안애 퍼스트로  충청남도 천안시 서북구 두정동 393-11번지 일원           2025-01-13   
3   상월곡역 장위아트포레(도시형생활주택)          서울특별시 성북구 석관동 338-18           2025-01-13   
4    부산에코델타시티 대방 엘리움 리버뷰  부산광역시 강서구 에코델타시티 공동주택용지 31BL           2025-01-16   

       special_supply_conditions enter_day  max_supply_price supply_type area  
0                             []   2025.06               NaN         NaN  NaN  
1                             []      None           95600.0         NaN  NaN  
2  [다자녀, 신혼부, 생애최, 노부모, 신생아, 청년]   2027.12           39000.0         NaN  NaN  
3                             []      None           22500.0         NaN  NaN  
4  [다자녀, 신혼부, 생애최, 노부모, 신생아, 청년]   2027.10          108847.0         NaN  NaN  


In [9]:
# 입력 조건 - Front-End와 연결예정

user_input = {
    #"supply_name": "공급 A",
    "region_name": "인천",
    #"application_schedule": "2025-01-01",
    "special_supply_conditions": ["청년"],
    #"enter_day": "2025.01",
    #"max_supply_price": 500_000_000,
    #"supply_type": "임대주택",
    #"area": 80
}

In [10]:
# 데이터 통합
combined_data = pd.concat([api_data, crawl_data], ignore_index=True)

# 데이터 전처리 함수
def preprocess_data(data):
    # 날짜 및 숫자 변환
    data["enter_day"] = pd.to_datetime(data["enter_day"], format="%Y.%m", errors="coerce")
    data["max_supply_price"] = pd.to_numeric(data["max_supply_price"], errors="coerce")
    data["special_supply_conditions"] = data["special_supply_conditions"].apply(
        lambda x: ", ".join(x) if isinstance(x, list) else ""
    )  # 리스트를 문자열로 변환
    data["area"] = pd.to_numeric(data["area"], errors="coerce")  # 면적 숫자 변환
    return data

# 필터링 함수
def filter_data(data, user_input):
    filtered_data = data.copy()

    # 각 조건에 대해 필터링 (user_input에 값이 존재하는 경우만)
    if "supply_name" in user_input:
        filtered_data = filtered_data[
            filtered_data["supply_name"].str.contains(user_input["supply_name"], na=False)
        ]
    if "region_name" in user_input:
        filtered_data = filtered_data[
            filtered_data["region_name"].str.contains(user_input["region_name"], na=False)
        ]
    if "application_schedule" in user_input:
        filtered_data = filtered_data[
            filtered_data["application_schedule"] >= user_input["application_schedule"]
        ]
    if "special_supply_conditions" in user_input:
        filtered_data = filtered_data[
            filtered_data["special_supply_conditions"].str.contains(
                "|".join(user_input["special_supply_conditions"]), na=False
            )
        ]
    if "enter_day" in user_input:
        user_enter_date = pd.to_datetime(user_input["enter_day"], format="%Y.%m", errors="coerce")
        filtered_data = filtered_data[filtered_data["enter_day"] >= user_enter_date]
    if "max_supply_price" in user_input:
        filtered_data = filtered_data[
            filtered_data["max_supply_price"] <= user_input["max_supply_price"]
        ]
    if "supply_type" in user_input:
        filtered_data = filtered_data[
            filtered_data["supply_type"].str.contains(user_input["supply_type"], na=False)
        ]
    if "area" in user_input:
        filtered_data = filtered_data[
            filtered_data["area"] >= user_input["area"]
        ]

    return filtered_data

# 추천 알고리즘
def recommend_supply(data, user_input):
    # 데이터 전처리
    data = preprocess_data(data)

    # 필터링
    filtered_data = filter_data(data, user_input)

    # 정렬 (max_supply_price → enter_day → special_supply_conditions 순서)
    filtered_data = filtered_data.sort_values(
        by=["max_supply_price", "enter_day", "special_supply_conditions"],
        ascending=[True, True, False]
    )

    return filtered_data

# 추천 결과
recommended_supplies = recommend_supply(combined_data, user_input)

# 결과 출력 (nan이면 미출력)
if not recommended_supplies.empty:
    print("\n=== 추천 청약 목록 ===")
    for i, row in recommended_supplies.iterrows():
        print(f"공급명: {row['supply_name']}")
        print(f"지역: {row['region_name']}")
        print(f"청약일정: {row['application_schedule']}")
        
        # 특별공급조건 출력 (값이 있는 경우만)
        if pd.notna(row["special_supply_conditions"]) and row["special_supply_conditions"]:
            print(f"특별공급조건: {row['special_supply_conditions']}")
        
        # 입주 예정월 출력 (값이 있는 경우만)
        if pd.notna(row["enter_day"]):
            print(f"입주 예정월: {row['enter_day'].strftime('%Y.%m')}")
        
        # 최대 공급금액 출력 (값이 있는 경우만)
        if pd.notna(row["max_supply_price"]):
            print(f"최대 공급금액: {row['max_supply_price']}")
        
        # 공급유형 출력 (값이 있는 경우만)
        if pd.notna(row["supply_type"]):
            print(f"공급유형: {row['supply_type']}")
        
        # 면적 출력 (값이 있는 경우만)
        if pd.notna(row["area"]):
            print(f"면적: {row['area']}㎡")
        
        print("-" * 30)
else:
    print("조건에 맞는 청약 정보를 찾을 수 없습니다.")




=== 추천 청약 목록 ===
공급명: 인천강화서희스타힐스 1단지(조합원 취소세대)
지역: 인천광역시 강화군 선원면 창리456번지 일원
청약일정: 2025-01-17
특별공급조건: 다자녀, 신혼부, 생애최, 노부모, 신생아, 청년
입주 예정월: 2025.02
최대 공급금액: 37574.0
------------------------------
