In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "https://www.jeju.go.kr/culture/dialect/dictionary.htm?pageSize=20&page="  # Jeju dialect dictionary listing; the page number is appended to this URL


dialect_data = []  # accumulates the scraped [dialect, meaning] rows
def extract_data_from_page(soup, out=None):
    """Collect [dialect, meaning] rows from one parsed dictionary listing page.

    The result table is located with the CSS selector
    ``div.table-responsive > table > tbody``. For every ``tr`` in it, the first
    ``td`` is read as the dialect word and the ``td.dotdotdot.title`` cell as
    its meaning; both are whitespace-stripped. Rows missing either cell are
    skipped.

    Args:
        soup: Parsed page object exposing ``select_one`` (a BeautifulSoup
            document in this notebook).
        out: Optional list that receives the ``[dialect, meaning]`` rows.
            When omitted, rows go to the module-level ``dialect_data`` list.

    Returns:
        None. The target list is extended in place.
    """
    import warnings  # local import: only used on the unexpected-layout path

    target = dialect_data if out is None else out

    tbody = soup.select_one('div.table-responsive > table > tbody')
    if tbody is None:
        # select_one yields None when nothing matches (error page, changed
        # layout, ...). Report it instead of crashing on tbody.find_all.
        warnings.warn("dictionary table not found on page; no rows extracted")
        return

    for row in tbody.find_all('tr'):
        dialect_td = row.select_one('td:nth-child(1)')     # first cell: dialect word
        meaning_td = row.select_one('td.dotdotdot.title')  # cell holding the meaning
        if dialect_td is None or meaning_td is None:
            continue
        target.append([dialect_td.text.strip(), meaning_td.text.strip()])


## 358 pages in total, 20 rows on each page
# One Session is reused for every page so connections are pooled instead of
# being re-opened 358 times.
with requests.Session() as session:
    for page_number in range(1, 359):
        # The timeout stops a single stalled request from hanging the whole crawl.
        response = session.get(f"{base_url}{page_number}", timeout=30)
        # Fail loudly on an HTTP error instead of parsing an error page as data.
        response.raise_for_status()
        response.encoding = 'utf-8'  # Ensuring correct encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        extract_data_from_page(soup)


## Build the DataFrame from the collected rows
df = pd.DataFrame(dialect_data, columns=['방언', '방언뜻풀이'])
df.head()


Unnamed: 0,방언,방언뜻풀이
0,돗-거미,황금거미
1,할망,할머니
2,하르방,할아버지
3,싱,정말. 의심할 때 행여나 될까 하고 추측하는 뜻을 나타내는 말
4,마,마-고라=마-라. 아니원


In [29]:
# Summarise the scraped frame: entry count, column names, non-null counts and dtypes.
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7159 entries, 0 to 7158
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   방언      7159 non-null   object
 1   방언뜻풀이   7159 non-null   object
dtypes: object(2)
memory usage: 112.0+ KB


In [30]:
# Save the scraped rows as CSV; index=False leaves out the row index and
# utf-8-sig prepends a UTF-8 BOM.
# NOTE(review): the later "simple preprocessing" cell writes to this same
# 'jeju_dict.csv' path, so this export is replaced when that cell runs.
df.to_csv('jeju_dict.csv', index=False, encoding='utf-8-sig')

#### 간단한 전처리 

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


## Jeju dialect dictionary listing; the page number is appended to this URL
base_url = "https://www.jeju.go.kr/culture/dialect/dictionary.htm?pageSize=20&page="


## Accumulates the scraped [dialect, meaning] rows
dialect_data = []

def preprocess_text(text):
    """Return ``text`` with every run of digits and every run of whitespace deleted.

    Digits are removed first, then whitespace; nothing is inserted in their
    place, so neighbouring words end up joined together.
    """
    for pattern in (r'\d+', r'\s+'):
        text = re.sub(pattern, '', text)
    return text

def extract_data_from_page(soup, out=None):
    """Collect cleaned [dialect, meaning] rows from one parsed listing page.

    The result table is located with the CSS selector
    ``div.table-responsive > table > tbody``. For every ``tr`` in it, the first
    ``td`` is read as the dialect word and the ``td.dotdotdot.title`` cell as
    its meaning. Each text is whitespace-stripped and then passed through
    ``preprocess_text``. Rows missing either cell are skipped.

    Args:
        soup: Parsed page object exposing ``select_one`` (a BeautifulSoup
            document in this notebook).
        out: Optional list that receives the ``[dialect, meaning]`` rows.
            When omitted, rows go to the module-level ``dialect_data`` list.

    Returns:
        None. The target list is extended in place.
    """
    import warnings  # local import: only used on the unexpected-layout path

    target = dialect_data if out is None else out

    tbody = soup.select_one('div.table-responsive > table > tbody')
    if tbody is None:
        # select_one yields None when nothing matches (error page, changed
        # layout, ...). Report it instead of crashing on tbody.find_all.
        warnings.warn("dictionary table not found on page; no rows extracted")
        return

    for row in tbody.find_all('tr'):
        dialect_td = row.select_one('td:nth-child(1)')     # first cell: dialect word
        meaning_td = row.select_one('td.dotdotdot.title')  # cell holding the meaning
        if dialect_td is None or meaning_td is None:
            continue
        dialect = preprocess_text(dialect_td.text.strip())
        meaning = preprocess_text(meaning_td.text.strip())
        target.append([dialect, meaning])

# Walk every listing page: 358 pages with 20 rows each
# One Session is reused for every page so connections are pooled instead of
# being re-opened 358 times.
with requests.Session() as session:
    for page_number in range(1, 359):
        # The timeout stops a single stalled request from hanging the whole crawl.
        response = session.get(f"{base_url}{page_number}", timeout=30)
        # Fail loudly on an HTTP error instead of parsing an error page as data.
        response.raise_for_status()
        response.encoding = 'utf-8'  # decode the body as UTF-8
        soup = BeautifulSoup(response.text, 'html.parser')
        extract_data_from_page(soup)


df = pd.DataFrame(dialect_data, columns=['방언', '방언뜻풀이'])  # build the DataFrame
df.head()

Unnamed: 0,방언,방언뜻풀이
0,돗-거미,황금거미
1,할망,할머니
2,하르방,할아버지
3,싱,정말.의심할때행여나될까하고추측하는뜻을나타내는말
4,마,마-고라=마-라.아니원


In [5]:
# Save the digit/whitespace-stripped rows; index=False leaves out the row
# index and utf-8-sig prepends a UTF-8 BOM.
# NOTE(review): same 'jeju_dict.csv' path as the raw export in the first
# scraping section, so that earlier file is overwritten here.
df.to_csv('jeju_dict.csv', index=False, encoding='utf-8-sig')

### 전처리 

약간의 전처리 추가 

In [None]:
# !pip install OldHangeul
# !pip install jamo
# !pip install hangul-utils

In [58]:
import pandas as pd
from OldHangeul import OLD_TEXTS  
from jamo import h2j, j2hcj
from hangul_utils import join_jamos

# Load the dictionary CSV produced by the scraping cells above.
# NOTE(review): read_csv turns empty fields into NaN by default; the .apply
# calls below would then receive floats rather than strings - confirm the file
# has no empty cells.
data = pd.read_csv('jeju_dict.csv', encoding='utf-8-sig')


def concatenate_characters(text):
    """Pass ``text`` through ``OLD_TEXTS`` and join the pieces it yields into one string.

    ``OLD_TEXTS`` comes from the OldHangeul package and is used here so that
    archaic Hangul is displayed; its exact output is not visible in this
    notebook - see the package for details.
    """
    return ''.join(OLD_TEXTS(text))

def replace_old_hangul(text):
    """Strip punctuation and normalise conjoining jamo before re-joining them.

    Steps, in order:
      1. Delete every character that is neither a word character nor whitespace.
      2. Rewrite an arae-a that directly follows a character in the jamo range
         used by the pattern below as the compatibility vowel.
      3. Replace the listed leading (choseong) and trailing (jongseong)
         conjoining consonants with their compatibility-jamo counterparts.
      4. Return the result of ``join_jamos`` (hangul_utils) on that text -
         presumably this recomposes the jamo into syllable blocks; confirm
         against the library.
    """
    # Leading consonants (choseong) -> compatibility jamo
    initial_pairs = (
        ('ᄀ', 'ㄱ'), ('ᄁ', 'ㄲ'), ('ᄂ', 'ㄴ'), ('ᄃ', 'ㄷ'), ('ᄄ', 'ㄸ'),
        ('ᄅ', 'ㄹ'), ('ᄆ', 'ㅁ'), ('ᄇ', 'ㅂ'), ('ᄈ', 'ㅃ'), ('ᄉ', 'ㅅ'),
        ('ᄊ', 'ㅆ'), ('ᄋ', 'ㅇ'), ('ᄌ', 'ㅈ'), ('ᄍ', 'ㅉ'), ('ᄎ', 'ㅊ'),
        ('ᄏ', 'ㅋ'), ('ᄐ', 'ㅌ'), ('ᄑ', 'ㅍ'), ('ᄒ', 'ㅎ'),
    )
    # Trailing consonants (jongseong) -> compatibility jamo
    final_pairs = (
        ('ᆨ', 'ㄱ'), ('ᆩ', 'ㄲ'), ('ᆫ', 'ㄴ'), ('ᆮ', 'ㄷ'), ('ᆯ', 'ㄹ'),
        ('ᆷ', 'ㅁ'), ('ᆸ', 'ㅂ'), ('ᆹ', 'ㅄ'), ('ᆺ', 'ㅅ'), ('ᆻ', 'ㅆ'),
        ('ᆼ', 'ㅇ'), ('ᆽ', 'ㅈ'), ('ᆾ', 'ㅊ'), ('ᆿ', 'ㅋ'), ('ᇀ', 'ㅌ'),
        ('ᇁ', 'ㅍ'), ('ᇂ', 'ㅎ'),
    )

    # Remove special characters
    cleaned = re.sub(r'[^\w\s]', '', text)
    # Arae-a after a jamo becomes the compatibility vowel
    cleaned = re.sub(r'([ᄀ-ᇿ])ᆞ', r'\1ㅏ', cleaned)

    # Same order as a hand-written chain: all initials first, then all finals
    for conjoining, compatibility in initial_pairs + final_pairs:
        cleaned = cleaned.replace(conjoining, compatibility)

    return join_jamos(cleaned)

# Run both clean-up passes over each text column: concatenate_characters
# first, then replace_old_hangul on its result.
for column in ('방언', '방언뜻풀이'):
    data[column] = data[column].apply(concatenate_characters).apply(replace_old_hangul)


data 

Unnamed: 0,방언,방언뜻풀이
0,돗거미,황금거미
1,할망,할머니
2,하르방,할아버지
3,차싱,정말_의심할_때_행여나_될까_하고_추측하는_뜻을_나타내는_말
4,차마,차마고라차마가라_아니원
...,...,...
7154,가맹이,아지랑이
7155,가랑비,가랑비
7156,건들마,여름_장마철_남쪽에서_강약을_달리하면서_잇달아_오는_바람
7157,강쳉이,갑자기_이는_폭풍


In [60]:
# Save the normalised dictionary; index=False leaves out the row index and
# utf-8-sig prepends a UTF-8 BOM.
data.to_csv('jeju_preprocessed.csv', index=False, encoding='utf-8-sig')