# **중급 프로젝트 EDA**

*PDF, HWP 형식의 RPF(기업 및 정부 제안 요청서)*

# 사전 준비

## 필요 라이브러리 임포트

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import re
from collections import Counter
from glob import glob

import pdfplumber
import olefile
from kiwipiepy import Kiwi

# 데이터 EDA

## 데이터 불러오기

In [3]:
data_dir = "../data/raw/files"
files_dir = os.path.join(data_dir, "files")

files_path = glob(os.path.join(files_dir, "*"))
csv_path = os.path.join(data_dir, "data_list.csv")

## 데이터 구조 확인

HWP/PDF 파일 개수 확인

In [5]:
hwp_cnt = 0
pdf_cnt = 0

for file in files_path:
    if file.endswith(".hwp"):
        hwp_cnt += 1
    elif file.endswith(".pdf"):
        pdf_cnt += 1

print(f"hwp file count: {hwp_cnt}")
print(f"pdf file count: {pdf_cnt}")

hwp file count: 96
pdf file count: 4


HWP/PDF 파일 텍스트 추출 함수

In [6]:
def extract_text(file_path):
    text = ""
    if file_path.endswith(".hwp"):
        try:
            ole = olefile.OleFileIO(file_path)
            for stream_name in ole.listdir():
                if "BodyText" in stream_name:
                    data = ole.openstream(stream_name).read()
                    text += data.decode('utf-16', errors='ignore')
            return text
        except Exception as e:
            print(f"HWP Extraction Error {file_path}: {e}")
            return ""
    elif file_path.endswith(".pdf"):
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text += page.extract_text() + "\n"
            return text
        except Exception as e:
            print(f"PDF Extraction Error {file_path}: {e}")
            return ""

무작위 샘플 파일 확인 함수

In [7]:
def sampler(files_path):
    sampled_files = random.sample(files_path, 10)
    for file in sampled_files:
        text = extract_text(file)
        print(f"File: {file}\nExtracted Text:\n{text[:500]}\n{'-'*80}")

In [8]:
sampler(files_path)

File: ../data\files\한국보건산업진흥원_의료기기산업 종합정보시스템(정보관리기관) 기능.hwp
Extracted Text:
垬桏圔︜㩌泙喖隒풒떎褫몭孕ꈐꦩ⩿ᒑꐔ髄୐کཱ免䅨ᐤḊ蜶⻮ᐡ銌ቃ擜ḋ퉒孄楋얼ྲྀ脊鯞晱杶묶ﳬ绻뻳泟⭬ᔀਆ䁀쾡갸ꨚﺂ̋챗᐀ﵪ鞃췗먘疐䳝ﾭᆺ呇贑鴆ᙩꥴ訝鷵䴦Ỷࠕ搅탬ἤ烫嵔걕⿹粩磬秧ꉼ闇⯫狀䌷嗴纈훮䱱窟햁끎퇿㩱徰慩禌翝㮶縏⢟㵱젯태捤䔕銎尠필蓁蓇骐녉Ⴘ讓䧖娆ದ鹱຦㖥ꁧ欻摠䃖ꇺ횁ｩ屄놦캞晑숧臐塈䋶줃僜࢜᜷陋錖얇ࣥ佽컼궡⓮呇瘷碵㬣윪㧠숟붉ῥ❂姞跦迷䇱䱠綕낍闦跠浰ự꬟뼱㗓矦욶컼Ⓙ從⽾辮栗﵏釼능᠆徃땜晶犙枖홎瞴뙻껉Ⅽꌿ玧暅鷫皰幤㣡뚛탭갭⦧쏠꒷簻뙧쟈몷窓ᆂ啥埧瓜蕕缉撑딙뛏뜃肚췯⏖⼢于츽쓪꒲뢡ᨖ䊚쩪陀楌㻸ㄭ霤荸些盈㯿뉵㔧꘩㈐霪ꃨㆤ튓濭铝Ꮮᜑ鶥ꔦᑷ홞᪤劒㭂≰視왥Ҧ譗맍쳲馝뽳膵䙿町阯멁ꚝ斍⦭ꖠ덤硭㤼桬殻؋팆鋪ꞁ뒿ᳳ睄౩坳ᛵ뫳ᐫ쭗ﰭ憛点믾끄矼튘糈㚍춖꾛현䡲ꗳ㡯♛餝✭輡լॽ끵譳ꝟ錋南嬼縏砝㼴卫ꨛ骀閥蝳簇숪躿㞽⏼䉰綕纔到㇆섪䠛蹔彗币磺ፁ⽏孽筠쌐⻃賾ã获⠧㭽暈酞殝꘼䱮㯪봼統㮟闳秳忔弲䋚퟿딟쯅ퟦ돇娪啽촕膿붐姒⪼〸⫓덧혪쾣멿屒哑緇뒵榢⏒Еሣ㋏侤쑅䡥팁源ᢖὒ쟏୭⥚₲䯧鍤⋹ᪿ煑㋕휅瀾㦓∡㈮毦ី昙닕ꗞ䞹咮깉緯뼳Ἆ쏡꫎雉㏲Ꞟ堭Ṿ䙧댔ෛྜ⋛ᒜ霑
--------------------------------------------------------------------------------
File: ../data\files\(사）한국대학스포츠협의회_KUSF 체육특기자 경기기록 관리시스템 개발.hwp
Extracted Text:
垬桯䜔缔뮳⹋苶ꂦ몞酖覨ꚹ뗝䐪詭芈前㒣脵픈멾㔏ꂭ྘⟢⣁⨪⢆ꢍ䪹≍⨏∥╁੕ἥ씅鯫붽鈿馾췽뽻珹헯鱀یԀ㠴‍簓䊦굋즢㐒紷ꇛ䀫擖鞾﭂难ᒽꆊ襎鑲䫵㉘⹾藝寸痀⏵呂銙禌⧶ප〯热弬熨韩W捏丄쀆㌢뢋㕋슑詚䦰嬬嗤峒衂ｓ쪛ꀉ棽팂偯頧蜱밽벒儴ࠅ䬥頉傖⌝揌⩨ڠ삗ﺘ亴笿ṇ㈍䐫朲힡⸞酯龬⭤ᾙ랊幫㱮紟퍔烱邋퀢᭚᯷ᓾ嬉鱀넥ꀄᥱ芗䒳কⲔꙚ堚蛇᭰帒蘭㝮롍幪맜꨿쉗Ԫꦏቓ퇭獴麩嘢镨剩䚥푥跅୷繵庮鵁鯲뫻炜胳荿┻㨴땡驦遐쉮₄戻ᡜﻛ礉㐃囀₈샤싘솑ꄇ꺾഑彩終［點ꟁ켷影悃똚놣읛箇苇㾹八폦왣覜옫召岅잂ꭤἳ季⌽᣻냪䩜簹㙋爹꼸熴

In [15]:
file_stats = []

for file in files_path:
    ext = os.path.splitext(file)[1].lower()
    text = extract_text(file)
    n_chars = len(text)
    n_words = len(text.split())
    n_lines = text.count('\n') + 1
    file_stats.append({
        "file": os.path.basename(file),
        "ext": ext,
        "chars": n_chars,
        "words": n_words,
        "lines": n_lines
    })

df_stats = pd.DataFrame(file_stats)
print(df_stats.describe())

               chars       words        lines
count     100.000000    100.0000   100.000000
mean    91553.410000   1270.2100   181.230000
std     34231.791062   6064.4946   993.445172
min     41912.000000     17.0000     1.000000
25%     70954.250000     34.0000     1.750000
50%     83903.500000     46.5000     2.000000
75%    100732.750000     55.0000     3.000000
max    282680.000000  45670.0000  8017.000000


## HWP, PDF 파일 형태 분포 확인

In [19]:
project_root = os.path.abspath(os.path.join(".."))
data_dir = os.path.join(project_root, "data", "raw", "files")
file_list = os.listdir(data_dir)
exts = [os.path.splitext(file)[1].lower() for file in file_list]
ext_counts = Counter(exts)
print(ext_counts)

Counter({'.hwp': 96, '.pdf': 4})
