In [None]:
import re

def preprocess_text(text):
    # HTML 태그 제거
    text = re.sub(r'<[^>]+>', '', text)
    # 특수 문자 제거
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # 대소문자 통일
    text = text.lower()
    return text


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)


In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    return ' '.join(lemmatized_words)


In [None]:
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None


In [None]:
from transformers import pipeline

# 사전 학습된 BERT 모델을 사용한 텍스트 복구 예제
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def recover_text(text):
    # 예시: [MASK] 위치에 적절한 단어를 예측
    recovered_text = fill_mask(text)
    return recovered_text


In [None]:
def clean_and_recover_wikipedia_text(text):
    # 전처리
    text = preprocess_text(text)
    # 불용어 제거
    text = remove_stopwords(text)
    # 어간 추출 및 표제어 추출
    text = stem_and_lemmatize(text)
    # 길이 필터링
    text = filter_by_length(text)
    if text is None:
        return None
    # 텍스트 복구
    recovered_text = recover_text(text)
    return recovered_text


In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import pipeline
from spellchecker import SpellChecker

# NLTK 리소스 다운로드
nltk.download('stopwords')
nltk.download('punkt')

# 전처리 함수
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    return text

# 불용어 제거 함수
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# 어간 추출 및 표제어 추출 함수
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    return ' '.join(lemmatized_words)

# 길이 필터링 함수
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None

# 철자 검사 및 수정 함수
spell = SpellChecker()

def correct_spelling(text):
    words = word_tokenize(text)
    corrected_words = [spell.correction(word) for word in words]
    return ' '.join(corrected_words)

# 텍스트 복구 함수 (BERT 모델 사용)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def recover_text(text):
    masked_text = text.replace('[MASK]', '<mask>')
    recovered = fill_mask(masked_text)
    return recovered[0]['sequence'] if recovered else text

# 파일에서 데이터 읽기
file_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia.txt/processed_wikipedia.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# 데이터 정화 및 복구
cleaned_lines = []
for line in lines:
    cleaned_line = preprocess_text(line)
    cleaned_line = remove_stopwords(cleaned_line)
    cleaned_line = correct_spelling(cleaned_line)
    cleaned_line = stem_and_lemmatize(cleaned_line)
    cleaned_line = filter_by_length(cleaned_line)
    if cleaned_line:
        cleaned_line = recover_text(cleaned_line)
        cleaned_lines.append(cleaned_line)

# 결과를 파일에 저장
output_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia_cleaned.txt'
with open(output_path, 'w', encoding='utf-8') as file:
    for cleaned_line in cleaned_lines:
        file.write(cleaned_line + '\n')

# 상위 10개의 데이터 출력
print("Top 10 cleaned data samples:")
for i in range(min(10, len(cleaned_lines))):
    print(f"{i+1}: {cleaned_lines[i]}")

print(f"\nData cleaned and saved to {output_path}")
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import pipeline
from spellchecker import SpellChecker

# NLTK 리소스 다운로드
nltk.download('stopwords')
nltk.download('punkt')

# 전처리 함수
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    return text

# 불용어 제거 함수
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# 어간 추출 및 표제어 추출 함수
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    return ' '.join(stemmed_words)

# 길이 필터링 함수
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None

# 철자 검사 및 수정 함수
spell = SpellChecker()

def correct_spelling(text):
    words = word_tokenize(text)
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

# 텍스트 복구 함수 (BERT 모델 사용)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def recover_text(text):
    masked_text = text.replace('[MASK]', '<mask>')
    recovered = fill_mask(masked_text)
    return recovered[0]['sequence'] if recovered else text

# 파일에서 데이터 읽기
file_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia.txt/processed_wikipedia.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# 데이터 정화 및 복구
cleaned_lines = []
for line in lines:
    cleaned_line = preprocess_text(line)
    cleaned_line = remove_stopwords(cleaned_line)
    cleaned_line = correct_spelling(cleaned_line)
    cleaned_line = stem_and_lemmatize(cleaned_line)
    cleaned_line = filter_by_length(cleaned_line)
    if cleaned_line:  # None 체크
        cleaned_line = recover_text(cleaned_line)
        if cleaned_line:  # None 체크
            cleaned_lines.append(cleaned_line)

# 결과를 파일에 저장
output_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia_cleaned.txt'
with open(output_path, 'w', encoding='utf-8') as file:
    for cleaned_line in cleaned_lines:
        file.write(cleaned_line + '\n')

# 상위 10개의 데이터 출력
print("Top 10 cleaned data samples:")
for i in range(min(10, len(cleaned_lines))):
    print(f"{i+1}: {cleaned_lines[i]}")

print(f"\nData cleaned and saved to {output_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eternal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eternal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: sequence item 1: expected str instance, NoneType found

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import pipeline
from spellchecker import SpellChecker
from tqdm import tqdm

# NLTK 리소스 다운로드
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 전처리 함수
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    return text

# 불용어 제거 함수
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# 어간 추출 및 표제어 추출 함수
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# 길이 필터링 함수
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None

# 철자 검사 및 수정 함수
spell = SpellChecker()

def correct_spelling(text):
    words = word_tokenize(text)
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

# 텍스트 복구 함수 (BERT 모델 사용)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def recover_text(text):
    if '[MASK]' in text:
        masked_text = text.replace('[MASK]', '<mask>')
        recovered = fill_mask(masked_text)
        return recovered[0]['sequence'] if recovered else text
    else:
        return text  # [MASK] 토큰이 없으면 원래 텍스트 반환

# 파일에서 데이터 읽기
file_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia.txt/processed_wikipedia.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# 데이터 정화 및 복구
cleaned_lines = []
for line in tqdm(lines, desc="Processing lines"):
    cleaned_line = preprocess_text(line)
    cleaned_line = remove_stopwords(cleaned_line)
    cleaned_line = correct_spelling(cleaned_line)
    cleaned_line = stem_and_lemmatize(cleaned_line)
    cleaned_line = filter_by_length(cleaned_line)
    if cleaned_line:  # None 체크
        cleaned_line = recover_text(cleaned_line)
        if cleaned_line:  # None 체크
            cleaned_lines.append(cleaned_line)

# 결과를 파일에 저장
output_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia_cleaned.txt'
with open(output_path, 'w', encoding='utf-8') as file:
    for cleaned_line in cleaned_lines:
        file.write(cleaned_line + '\n')

# 상위 10개의 데이터 출력
print("Top 10 cleaned data samples:")
for i in range(min(10, len(cleaned_lines))):
    print(f"{i+1}: {cleaned_lines[i]}")

print(f"\nData cleaned and saved to {output_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eternal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eternal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eternal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

In [None]:
import re

# 파일에서 데이터 읽기
file_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia.txt/processed_wikipedia.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# 단어 수 세기
def count_words(text):
    # 특수 문자 제거
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # 단어 단위로 분할
    words = text.split()
    # 단어 수 반환
    return len(words)

word_count = count_words(text)
print(f"Total number of words in the file: {word_count}")


In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from spellchecker import SpellChecker
from tqdm import tqdm

# NLTK 리소스 다운로드
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 전처리 함수
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    return text

# 불용어 제거 함수
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# 어간 추출 및 표제어 추출 함수
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# 길이 필터링 함수
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None

# 철자 검사 및 수정 함수
spell = SpellChecker()

def correct_spelling(text):
    words = word_tokenize(text)
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

# 단어 수 세기 함수
def count_words(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    return len(words)

# 데이터 정화 및 복구 함수
def clean_line(line):
    cleaned_line = preprocess_text(line)
    cleaned_line = remove_stopwords(cleaned_line)
    cleaned_line = correct_spelling(cleaned_line)
    cleaned_line = stem_and_lemmatize(cleaned_line)
    cleaned_line = filter_by_length(cleaned_line)
    return cleaned_line

# 파일에서 데이터 읽기
file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'

# 결과를 저장할 파일 열기
output_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia_cleaned.txt'
with open(output_path, 'w', encoding='utf-8') as output_file:

    # 청크 단위로 데이터를 읽고 처리
    chunk_size = 1000
    total_lines = 0
    processed_word_count = 0

    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            lines = file.readlines(chunk_size)
            if not lines:
                break

            cleaned_lines = []
            for line in tqdm(lines, desc="Processing lines"):
                cleaned_line = clean_line(line)
                if cleaned_line:  # None 체크
                    cleaned_lines.append(cleaned_line)

            # 처리된 데이터의 단어 수 계산
            processed_text = ' '.join(cleaned_lines)
            processed_word_count += count_words(processed_text)

            # 결과를 파일에 저장
            for cleaned_line in cleaned_lines:
                output_file.write(cleaned_line + '\n')

            total_lines += len(lines)

print(f"Total lines processed: {total_lines}")
print(f"Processed word count: {processed_word_count}")

# 상위 10개의 데이터 출력
with open(output_path, 'r', encoding='utf-8') as file:
    print("Top 10 cleaned data samples:")
    for i in range(10):
        print(f"{i+1}: {file.readline().strip()}")

print(f"\nData cleaned and saved to {output_path}")




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eternal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eternal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eternal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing lines: 100%|███████████████████████████| 8/8 [00:07<00:00,  1.12it/s]
Processing lines: 100%|███████████████████████████| 8/8 [00:09<00:00,  1.22s/it]
Processing lines: 100%|███████████████████████████| 7/7 [00:07<00:00,  1.00s/it]
Processing lines: 100%|███████████████████████████| 8/8 [00:08<00:00,  1.05s/it]
Processing lines: 100%|███████████████████████████| 9/9 [00:07<00:00,  1.23it/s]
Processing lines: 100%|█████████████████████████| 10/10 [00:07<00:00,  1.35it/s]
Processing lines: 100%|███████████████████████████| 9/9 [00:07<00:00,  1.14it/s]
Processing lines: 100%|█████████████████████████

KeyboardInterrupt: 

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from spellchecker import SpellChecker
from tqdm import tqdm

# NLTK 리소스 다운로드
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 전처리 함수
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    return text

# 불용어 제거 함수
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# 어간 추출 및 표제어 추출 함수
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# 길이 필터링 함수
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None

# 철자 검사 및 수정 함수
spell = SpellChecker()

def correct_spelling(text):
    words = word_tokenize(text)
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

# 단어 수 세기 함수
def count_words(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    return len(words)

# 데이터 정화 및 복구 함수
def clean_line(line):
    cleaned_line = preprocess_text(line)
    cleaned_line = remove_stopwords(cleaned_line)
    cleaned_line = correct_spelling(cleaned_line)
    cleaned_line = stem_and_lemmatize(cleaned_line)
    cleaned_line = filter_by_length(cleaned_line)
    return cleaned_line

# 파일에서 데이터 읽기
file_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia.txt/processed_wikipedia.txt'

# 파일의 총 줄 수 계산
with open(file_path, 'r', encoding='utf-8') as file:
    total_lines = sum(1 for line in file)

# 결과를 저장할 파일 열기
output_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia_cleaned.txt'
with open(output_path, 'w', encoding='utf-8') as output_file:

    # 청크 단위로 데이터를 읽고 처리
    chunk_size = 1000
    total_processed_lines = 0
    processed_word_count = 0

    with open(file_path, 'r', encoding='utf-8') as file:
        with tqdm(total=total_lines, desc="Total Progress") as pbar:
            while True:
                lines = file.readlines(chunk_size)
                if not lines:
                    break

                cleaned_lines = []
                for line in tqdm(lines, desc="Processing lines", leave=False):
                    cleaned_line = clean_line(line)
                    if cleaned_line:  # None 체크
                        cleaned_lines.append(cleaned_line)

                # 처리된 데이터의 단어 수 계산
                processed_text = ' '.join(cleaned_lines)
                processed_word_count += count_words(processed_text)

                # 결과를 파일에 저장
                for cleaned_line in cleaned_lines:
                    output_file.write(cleaned_line + '\n')

                total_processed_lines += len(lines)
                pbar.update(len(lines))

print(f"Total lines processed: {total_processed_lines}")
print(f"Processed word count: {processed_word_count}")

# 상위 10개의 데이터 출력
with open(output_path, 'r', encoding='utf-8') as file:
    print("Top 10 cleaned data samples:")
    for i in range(10):
        print(f"{i+1}: {file.readline().strip()}")

print(f"\nData cleaned and saved to {output_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eternal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eternal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eternal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Total Progress:   0%|                             | 0/105502474 [00:00<?, ?it/s]
Processing lines:   0%|                                   | 0/8 [00:00<?, ?it/s][A
Processing lines:  12%|███▍                       | 1/8 [00:01<00:12,  1.82s/it][A
Processing lines:  25%|██████▊                    | 2/8 [00:02<00:06,  1.13s/it][A
Processing lines:  38%|██████████▏                | 3/8 [00:03<00:04,  1.06it/s][A
Processing lines:  50%|█████████████▌             | 4/8 [00:04<00:04,  1.01s/it][A
Processing lines:  62%|████████████████▉          | 5/8 [00:04<00:02,  1.18it/s][A
Processing lines:  75%|███████

In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from tqdm import tqdm
import nltk

# NLTK stopwords 다운로드
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 정규 표현식을 사용한 텍스트 전처리 함수
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # HTML 태그 제거
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # 알파벳과 숫자, 공백을 제외한 모든 문자 제거
    text = re.sub(r'\s+', ' ', text)  # 다중 공백을 단일 공백으로 변경
    text = text.lower()  # 소문자 변환
    return text

# 불용어 제거 함수
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# 철자 검사 및 수정 함수
spell = SpellChecker()
def correct_spelling(text):
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

# 어간 추출 및 표제어 추출 함수
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# 길이 필터링 함수
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None

# 데이터 정화 및 복구 함수
def clean_line(line):
    cleaned_line = preprocess_text(line)
    cleaned_line = correct_spelling(cleaned_line)
    cleaned_line = remove_stopwords(cleaned_line)
    cleaned_line = stem_and_lemmatize(cleaned_line)
    cleaned_line = filter_by_length(cleaned_line)
    return cleaned_line

def process_chunk(chunk):
    cleaned_lines = [clean_line(line) for line in chunk]
    return [line for line in cleaned_lines if line is not None]

# 파일에서 데이터 읽기
file_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia.txt/processed_wikipedia.txt'

# 파일의 총 줄 수 계산
with open(file_path, 'r', encoding='utf-8') as file:
    total_lines = sum(1 for line in file)

# 결과를 저장할 파일 열기
output_path = '/media/eternal/5TB HDD/sogang-nlp-rag/processed_wikipedia_cleaned.txt'

chunk_size = 10000  # 청크 크기를 줄여서 메모리 사용 최적화

def chunk_reader(file, chunk_size):
    while True:
        lines = file.readlines(chunk_size)
        if not lines:
            break
        yield lines

with open(output_path, 'w', encoding='utf-8') as output_file:
    with open(file_path, 'r', encoding='utf-8') as file:
        with tqdm(total=total_lines, desc="Total Progress") as pbar:
            for chunk in chunk_reader(file, chunk_size):
                cleaned_lines = process_chunk(chunk)
                for cleaned_line in cleaned_lines:
                    output_file.write(cleaned_line + '\n')
                pbar.update(len(cleaned_lines))

print(f"Data cleaned and saved to {output_path}")




In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from tqdm import tqdm
import nltk
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

# NLTK stopwords 다운로드
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 정규 표현식을 사용한 텍스트 전처리 함수
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # HTML 태그 제거
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # 알파벳과 숫자, 공백을 제외한 모든 문자 제거
    text = re.sub(r'\s+', ' ', text)  # 다중 공백을 단일 공백으로 변경
    text = text.lower()  # 소문자 변환
    return text

# 불용어 제거 함수
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# 철자 검사 및 수정 함수
spell = SpellChecker()
def correct_spelling(text):
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

# 어간 추출 및 표제어 추출 함수
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# 길이 필터링 함수
def filter_by_length(text, min_length=10):
    return text if len(text.split()) >= min_length else None

# 데이터 정화 및 복구 함수
def clean_line(line):
    cleaned_line = preprocess_text(line)
    cleaned_line = correct_spelling(cleaned_line)
    cleaned_line = remove_stopwords(cleaned_line)
    cleaned_line = stem_and_lemmatize(cleaned_line)
    cleaned_line = filter_by_length(cleaned_line)
    return cleaned_line

def process_chunk(chunk, pbar):
    cleaned_lines = [clean_line(line) for line in chunk]
    cleaned_lines = [line for line in cleaned_lines if line is not None]
    pbar.update(len(cleaned_lines))
    return cleaned_lines

# 파일에서 데이터 읽기
file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'

# 파일의 총 줄 수 계산
with open(file_path, 'r', encoding='utf-8') as file:
    total_lines = sum(1 for line in file)

# 결과를 저장할 파일 열기
output_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia_cleaned.txt'

chunk_size = 10000  # 청크 크기를 줄여서 메모리 사용 최적화
num_workers = cpu_count()

def chunk_reader(file, chunk_size):
    while True:
        lines = file.readlines(chunk_size)
        if not lines:
            break
        yield lines

def process_and_write_chunks(chunks, output_file):
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for chunk in chunks:
            pbar = tqdm(total=len(chunk), desc="Chunk Progress")
            futures.append(executor.submit(process_chunk, chunk, pbar))
        for future in as_completed(futures):
            cleaned_lines = future.result()
            for cleaned_line in cleaned_lines:
                output_file.write(cleaned_line + '\n')

if __name__ == '__main__':
    with open(output_path, 'w', encoding='utf-8') as output_file:
        with open(file_path, 'r', encoding='utf-8') as file:
            chunks = list(chunk_reader(file, chunk_size))
            with tqdm(total=total_lines, desc="Total Progress") as pbar:
                process_and_write_chunks(chunks, output_file)
                pbar.update(total_lines)

    print(f"Data cleaned and saved to {output_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eternal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Total Progress:   0%|                             | 0/105502474 [00:00<?, ?it/s]
Chunk Progress:   0%|                                    | 0/79 [00:00<?, ?it/s][A

Chunk Progress:   0%|                                    | 0/70 [00:00<?, ?it/s][A[A


Chunk Progress:   0%|                                    | 0/83 [00:00<?, ?it/s][A[A[A



Chunk Progress:   0%|                                    | 0/63 [00:00<?, ?it/s][A[A[A[A




Chunk Progress:   0%|                                    | 0/61 [00:00<?, ?it/s][A[A[A[A[A





Chunk Progress:   0%|                                    | 0/60 [00:00<?, ?it/s][A[A[A[A[A[A






Chunk Progress:   0%|                                    | 0/71 [00:00<?, ?it/s][A[A[A[A[A[A[A







Chunk Progress:   0%|                                    | 0/53 [00:00<?, ?it/s][A[A[A[A[A[