In [None]:
#커널폭파범

import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import logging

# 로깅 설정
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 모델과 토크나이저 초기화 (예: BERT 모델)
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to('cuda')

def extract_questions(input_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    questions = [item['question'] for item in data]
    return questions

def preprocess_text(text):
    return text.lower().strip()

def encode_sentences(sentences):
    try:
        inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True).to('cuda')
        with torch.no_grad():
            outputs = model(**inputs)
        torch.cuda.empty_cache()  # GPU 메모리 해제
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    except Exception as e:
        logging.error(f"Error in encode_sentences: {e}")
        return np.zeros((len(sentences), 768))  # 에러 발생 시 기본값 반환

def find_similar_sentences(questions, sentences, threshold=0.7):
    question_encodings = encode_sentences(questions)
    sentence_encodings = encode_sentences(sentences)
    
    similar_sentences = {}
    for question, question_encoding in zip(questions, question_encodings):
        similarities = cosine_similarity([question_encoding], sentence_encodings)[0]
        similar_indices = np.where(similarities > threshold)[0]
        for idx in similar_indices:
            similar_sentences[sentences[idx]] = question
    
    return similar_sentences

def process_chunk(chunk, questions, threshold):
    try:
        similar_sentences = find_similar_sentences(questions, chunk, threshold)
        processed_chunk = []
        for sentence in chunk:
            if sentence in similar_sentences:
                preprocessed_sentence = preprocess_text(sentence)
                processed_chunk.append(f"{sentence} -> {preprocessed_sentence}")
            else:
                processed_chunk.append(sentence)
        return processed_chunk
    except Exception as e:
        logging.error(f"Error processing chunk: {e}")
        return chunk

def process_data_sequentially(input_file, text_file, output_file, chunk_size=100, threshold=0.7):
    questions = extract_questions(input_file)
    
    with open(text_file, 'r', encoding='utf-8') as f:
        original_text = f.read()
    
    sentences = original_text.split('. ')
    processed_text = []
    
    for i in range(0, len(sentences), chunk_size):
        chunk = sentences[i:i + chunk_size]
        processed_chunk = process_chunk(chunk, questions, threshold)
        processed_text.extend(processed_chunk)
        logging.info(f"Processed chunk {i // chunk_size + 1}/{len(sentences) // chunk_size + 1}")
    
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('. '.join(processed_text))
# 파일 경로 설정
input_file = '/home/eternal/qa_train.json'
text_file = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'
output_file = '/home/eternal/processed_large_text_file.txt'

# 데이터 처리 실행
process_data_sequentially(input_file, text_file, output_file)


In [5]:
input_file = '/home/eternal/qa_train.json'
import json
with open(input_file) as f:
        data = json.load(f)
data

[{'answer': '1966',
  'question': 'In what year the the venue that Marcia White is president of open?'},
 {'answer': 'Portugal',
  'question': 'What country is home to the sports club loaning Bruno Paulista to Vasco da Gama?'},
 {'answer': 'Jerry Clower',
  'question': '"Southern Air" featured Ray Stevens, Minnie Pearl and what other Southern comedian?'},
 {'answer': 'Neshnabé',
  'question': 'The Treaty of Prairie du Chien may refer to any of several treaties made between the United States and representatives from the native american people who called themselves what?'},
 {'answer': 'writer',
  'question': 'What profession do Thomas Merton and Michael Moorcock have in common?'},
 {'answer': 'Carlo Rovelli',
  'question': 'Who was the first person to describe Relational Quantum mechanics and has worked in Italy, the United States, and France?'},
 {'answer': 'Branson, Missouri',
  'question': 'Where is the Silver Dollar city theme park with the Grand exposition  steel roller coaster loc

In [1]:
import ijson

def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(value)
    return questions

qa_file_path = '/home/eternal/qa_train.json'
questions = extract_questions(qa_file_path)
print(questions[:10])  # 추출된 질문 일부 출력


['In what year the the venue that Marcia White is president of open?', 'What country is home to the sports club loaning Bruno Paulista to Vasco da Gama?', '"Southern Air" featured Ray Stevens, Minnie Pearl and what other Southern comedian?', 'The Treaty of Prairie du Chien may refer to any of several treaties made between the United States and representatives from the native american people who called themselves what?', 'What profession do Thomas Merton and Michael Moorcock have in common?', 'Who was the first person to describe Relational Quantum mechanics and has worked in Italy, the United States, and France?', 'Where is the Silver Dollar city theme park with the Grand exposition  steel roller coaster located?', 'Finna Get Loose is a song by Puff Daddy and what American rapper who is also a record and film producer?', 'What town is Hore Abbey located near?', 'In what year was the actress who was starred in "The Telling" with Holly Madison born?']
Extracted 105501507 relevant sentenc

In [2]:
import ijson

def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(value)
    return questions

qa_file_path = '/home/eternal/qa_train.json'
questions = extract_questions(qa_file_path)
print(questions[:10])  # 추출된 질문 일부 출력


def extract_relevant_sentences(questions, wiki_file_path, output_file_path):
    relevant_sentences = []
    with open(wiki_file_path, 'r', encoding='utf-8') as wiki_file:
        for line in wiki_file:
            for question in questions:
                if any(word in line for word in question.split()):
                    relevant_sentences.append(line)
                    break

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for sentence in relevant_sentences:
            output_file.write(sentence)
    
    return relevant_sentences

wiki_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'
output_file_path='/home/eternal/processed_wikipedia.txt/relevant_sentences.txt'
relevant_sentences = extract_relevant_sentences(questions, wiki_file_path, output_file_path)
print(f"Extracted {len(relevant_sentences)} relevant sentences.")


Extracted 105501507 relevant sentences.


In [9]:
import ijson
import re
from difflib import SequenceMatcher
import nltk
from nltk.corpus import words
from spellchecker import SpellChecker
import sys
import time
from multiprocessing import Pool, cpu_count

# Ensure nltk word list is downloaded
nltk.download('words')

# 파일 경로 설정
qa_file_path = '/home/eternal/this/qa_train.json'
wiki_file_path = '/home/eternal/this/processed_wikipedia.txt/processed_wikipedia.txt'
output_file_path = '/home/eternal/this/processed_wikipedia_filtered.txt'

'''
# 기존 파일 삭제하고 새 파일로 대체
import os
print("Replacing original file with the filtered file...")
os.remove(wiki_file_path)
os.rename(output_file_path, wiki_file_path)
print("Replacement complete.")
'''
def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(value)
    return questions

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text

def correct_spelling(text):
    spell = SpellChecker()
    corrected_text = []
    for word in text.split():
        corrected_word = spell.correction(word)
        if corrected_word is None:
            corrected_word = word
        corrected_text.append(corrected_word)
    return ' '.join(corrected_text)

def is_relevant_sentence(question, sentence, threshold=0.6):
    question = correct_spelling(preprocess_text(question))
    sentence = correct_spelling(preprocess_text(sentence))
    similarity = SequenceMatcher(None, question, sentence).ratio()
    return similarity > threshold

def process_chunk(args):
    questions, chunk, threshold = args
    relevant_sentences = set()
    for line in chunk:
        for question in questions:
            if is_relevant_sentence(question, line, threshold):
                relevant_sentences.add(line.strip())
                break
    return relevant_sentences

def extract_and_replace_relevant_sentences(questions, wiki_file_path, output_file_path, chunk_size=1000000, num_workers=4):
    relevant_sentences = set()
    total_lines = sum(1 for _ in open(wiki_file_path, 'r', encoding='utf-8'))

    start_time = time.time()
    with open(wiki_file_path, 'r', encoding='utf-8') as wiki_file:
        pool = Pool(num_workers)
        chunk = []
        for i, line in enumerate(wiki_file):
            chunk.append(line)
            if (i + 1) % chunk_size == 0:
                try:
                    elapsed_time = time.time() - start_time
                    lines_processed = i + 1
                    remaining_lines = total_lines - lines_processed
                    estimated_total_time = (elapsed_time / lines_processed) * total_lines
                    estimated_remaining_time = estimated_total_time - elapsed_time
                    print(f"Processing line {lines_processed}/{total_lines}")
                    print(f"Elapsed time: {elapsed_time:.2f} seconds")
                    print(f"Estimated remaining time: {estimated_remaining_time / 3600:.2f} hours")
                    sys.stdout.flush()
                    # Use starmap to pass multiple arguments to process_chunk
                    results = pool.starmap(process_chunk, [(questions, chunk, 0.6)])
                    for result in results:
                        relevant_sentences.update(result)
                    chunk = []
                except Exception as e:
                    print(f"Error processing chunk at line {i+1}: {e}")
                    sys.stdout.flush()
        
        # Process remaining lines
        if chunk:
            try:
                results = pool.starmap(process_chunk, [(questions, chunk, 0.6)])
                for result in results:
                    relevant_sentences.update(result)
            except Exception as e:
                print(f"Error processing final chunk: {e}")
                sys.stdout.flush()

    pool.close()
    pool.join()

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for sentence in relevant_sentences:
            output_file.write(sentence + '\n')

    return relevant_sentences


# 질문 추출
print("Extracting questions...")
questions = extract_questions(qa_file_path)
print(f"Extracted {len(questions)} questions.")
print(questions[:10])  # 추출된 질문 일부 출력

# 관련 문장 추출 및 저장
print("Extracting relevant sentences...")
relevant_sentences = extract_and_replace_relevant_sentences(questions, wiki_file_path, output_file_path)
print(f"Extracted {len(relevant_sentences)} relevant sentences.")




[nltk_data] Downloading package words to /home/eternal/nltk_data...
[nltk_data]   Package words is already up-to-date!


Extracting questions...
Extracted 88066 questions.
['In what year the the venue that Marcia White is president of open?', 'What country is home to the sports club loaning Bruno Paulista to Vasco da Gama?', '"Southern Air" featured Ray Stevens, Minnie Pearl and what other Southern comedian?', 'The Treaty of Prairie du Chien may refer to any of several treaties made between the United States and representatives from the native american people who called themselves what?', 'What profession do Thomas Merton and Michael Moorcock have in common?', 'Who was the first person to describe Relational Quantum mechanics and has worked in Italy, the United States, and France?', 'Where is the Silver Dollar city theme park with the Grand exposition  steel roller coaster located?', 'Finna Get Loose is a song by Puff Daddy and what American rapper who is also a record and film producer?', 'What town is Hore Abbey located near?', 'In what year was the actress who was starred in "The Telling" with Holly M

Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:


KeyboardInterrupt: 

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.

In [16]:
#!pip install sentence-transformers
#!pip install spellchecker
#!pip install language-tool-python

import json
import re
from sentence_transformers import SentenceTransformer, util
from spellchecker import SpellChecker
import language_tool_python
from tqdm import tqdm

# SentenceTransformer 모델 로드
model = SentenceTransformer('all-MiniLM-L6-v2')

# 언어 도구와 맞춤법 검사기 설정
spell = SpellChecker()
tool = language_tool_python.LanguageTool('en-US')

# 텍스트 정제 함수
def clean_text(text):
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)  # 여러 개의 공백을 단일 공백으로 대체
    text = re.sub(r'[^\w\s.,!?]', '', text)  # 특수 문자 제거
    return text

# 오탈자 수정 함수
def correct_spelling(text):
    corrected_text = []
    for word in text.split():
        corrected_word = spell.correction(word)
        if corrected_word is None:
            corrected_text.append(word)
        else:
            corrected_text.append(corrected_word)
    return ' '.join(corrected_text)

# 문법 검사 및 수정 함수
def correct_grammar(text):
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text

# 전체 텍스트 정제 함수
def preprocess_text(text):
    text = clean_text(text)
    text = correct_spelling(text)
    text = correct_grammar(text)
    return text

# 데이터 로드 및 유사도 기반 전처리 (데이터의 일부만 처리)
def load_and_clean_data(file_path, top_k=5, limit=None):
    with open(file_path, "r") as file:
        data = json.load(file)
    
    if limit:
        data = data[:limit]
    
    clean_data = []
    for item in tqdm(data, desc="Processing data"):
        question = clean_text(item["question"])
        question_embedding = embed_texts([question])
        
        answers = [clean_text(answer) for answer in item.get("answers", [])]
        if not answers:
            continue
        
        answer_embeddings = embed_texts(answers)
        similarities = calculate_similarity(question_embedding, answer_embeddings)
        
        top_indices = similarities.argsort(descending=True).tolist()[0][:top_k]
        for idx in top_indices:
            clean_item = {
                "question": question,
                "answer": preprocess_text(answers[idx])
            }
            clean_data.append(clean_item)
    
    return clean_data

# 데이터의 일부만 로드
#train_data = load_and_clean_data("/home/eternal/this/qa_train.json", limit=1000)
#test_data = load_and_clean_data("/home/eternal/this/qa_test.json", limit=1000)

#with open("/home/eternal/this/processed_wikipedia.txt", "r") as file:
#    knowledge_base = [preprocess_text(line) for line in tqdm(file, desc="Processing knowledge base", total=1000)]


train_data = load_and_clean_data("/home/eternal/this/qa_train.json", limit=1000)
test_data = load_and_clean_data("/home/eternal/this/qa_test.json", limit=1000)

with open(knowledge_base_file, "r") as file:
    knowledge_base = [preprocess_text(line) for line in tqdm(file, desc="Processing knowledge base", total=total_lines)]


Processing data: 100%|█████████████████████| 1000/1000 [00:01<00:00, 576.47it/s]
Processing data: 100%|█████████████████████| 1000/1000 [00:01<00:00, 588.78it/s]
Processing knowledge base:  23%|██▉          | 227/1000 [02:50<06:08,  2.10it/s]

KeyboardInterrupt: 

In [None]:
knowledge_base

In [None]:
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')

def correct_grammar(text):
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text

def postprocess_text(text):
    text = clean_text(text)
    text = correct_grammar(text)
    return text


In [None]:
# 테스트 질문에 대한 답변 생성 및 포스트 프로세싱 적용
results = []

for question in test_questions:
    inputs = tokenizer(question, return_tensors="pt")
    generated_answers = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    decoded_answers = tokenizer.batch_decode(generated_answers, skip_special_tokens=True)
    postprocessed_answer = postprocess_text(decoded_answers[0])
    results.append({"question": question, "answer": postprocessed_answer})

# 결과 저장
with open("predictions.json", "w") as file:
    json.dump(results, file, indent=4)


In [8]:
#작동안되는 무한루프

import re
import time
import multiprocessing
import ijson
import os
from datetime import datetime

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w\s]', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    return sentence

def process_lines(lines, questions):
    processed_lines = []
    for line in lines:
        original_line = line.strip()
        if any(word in original_line for question in questions for word in question.split()):
            processed_line = preprocess_sentence(original_line)
            processed_lines.append(processed_line + '\n')
        else:
            processed_lines.append(original_line + '\n')
    return processed_lines

def worker(wiki_file_path, questions, start, end, output_queue):
    with open(wiki_file_path, 'r', encoding='utf-8') as wiki_file:
        wiki_file.seek(start)
        lines = wiki_file.read(end - start).splitlines()
        processed_lines = process_lines(lines, questions)
        output_queue.put(processed_lines)

def extract_and_preprocess_sentences(questions, wiki_file_path, output_file_path, num_workers=4):
    start_time = datetime.now()
    print(f"Processing started at: {start_time}")

    file_size = os.path.getsize(wiki_file_path)
    chunk_size = file_size // num_workers

    output_queue = multiprocessing.Queue()
    processes = []
    for i in range(num_workers):
        start = i * chunk_size
        end = file_size if i == num_workers - 1 else (i + 1) * chunk_size
        p = multiprocessing.Process(target=worker, args=(wiki_file_path, questions, start, end, output_queue))
        processes.append(p)
        p.start()

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for _ in range(num_workers):
            processed_lines = output_queue.get()
            output_file.writelines(processed_lines)

    for p in processes:
        p.join()

    end_time = datetime.now()
    elapsed_time = (end_time - start_time).total_seconds()
    print(f"Processing completed at: {end_time}")
    print(f"Total processing time: {elapsed_time:.2f} seconds")

def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(value)
    return questions

qa_file_path = '/home/eternal/qa_train.json'
questions = extract_questions(qa_file_path)

wiki_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'
output_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia_cleaned.txt'
extract_and_preprocess_sentences(questions, wiki_file_path, output_file_path)


Processing started at: 2024-06-09 16:06:49.719791


KeyboardInterrupt: 

In [9]:
#시간 많이 걸림

import re
import time
import multiprocessing
import ijson
import os
from datetime import datetime

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w\s]', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    return sentence

def process_lines(lines, questions):
    processed_lines = []
    for line in lines:
        original_line = line.strip()
        if any(word in original_line for question in questions for word in question.split()):
            processed_line = preprocess_sentence(original_line)
            processed_lines.append(processed_line + '\n')
        else:
            processed_lines.append(original_line + '\n')
    return processed_lines

def worker(wiki_file_path, questions, start, end, output_queue):
    with open(wiki_file_path, 'r', encoding='utf-8') as wiki_file:
        wiki_file.seek(start)
        lines = wiki_file.read(end - start).splitlines()
        processed_lines = process_lines(lines, questions)
        output_queue.put(processed_lines)
    output_queue.put("DONE")

def extract_and_preprocess_sentences(questions, wiki_file_path, output_file_path, num_workers=4):
    start_time = datetime.now()
    print(f"Processing started at: {start_time}")

    file_size = os.path.getsize(wiki_file_path)
    chunk_size = file_size // num_workers

    output_queue = multiprocessing.Queue()
    processes = []
    for i in range(num_workers):
        start = i * chunk_size
        end = file_size if i == num_workers - 1 else (i + 1) * chunk_size
        p = multiprocessing.Process(target=worker, args=(wiki_file_path, questions, start, end, output_queue))
        processes.append(p)
        p.start()

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        completed_processes = 0
        while completed_processes < num_workers:
            processed_lines = output_queue.get()
            if processed_lines == "DONE":
                completed_processes += 1
            else:
                output_file.writelines(processed_lines)

    for p in processes:
        p.join()

    end_time = datetime.now()
    elapsed_time = (end_time - start_time).total_seconds()
    print(f"Processing completed at: {end_time}")
    print(f"Total processing time: {elapsed_time:.2f} seconds")

def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(value)
    return questions



qa_file_path = '/home/eternal/qa_train.json'
questions = extract_questions(qa_file_path)

wiki_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'
output_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia_cleaned.txt'
extract_and_preprocess_sentences(questions, wiki_file_path, output_file_path)

Processing started at: 2024-06-09 16:34:42.950179


KeyboardInterrupt: 

In [1]:
#컴퓨터 데미지 입힌 코드

import ijson
import re
import os
import time
import multiprocessing
from rank_bm25 import BM25Okapi
from datetime import datetime

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    return text

def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(preprocess_text(value))
    return questions

def bm25_worker(wiki_lines, questions, worker_id, output_queue):
    try:
        tokenized_corpus = [preprocess_text(line).split() for line in wiki_lines]
        bm25 = BM25Okapi(tokenized_corpus)
        relevant_sentences = []

        for question in questions:
            tokenized_question = question.split()
            top_n = bm25.get_top_n(tokenized_question, wiki_lines, n=5)  # 상위 5개 문장 선택 (조정 가능)
            relevant_sentences.extend(top_n)

        output_queue.put((worker_id, relevant_sentences))
        output_queue.put(("DONE", worker_id))
    except Exception as e:
        output_queue.put((worker_id, f"ERROR: {e}"))

def extract_and_preprocess_sentences(questions, wiki_file_path, output_file_path, num_workers=4):
    start_time = datetime.now()
    print(f"Processing started at: {start_time}")

    with open(wiki_file_path, 'r', encoding='utf-8') as wiki_file:
        wiki_lines = wiki_file.readlines()

    chunk_size = len(wiki_lines) // num_workers
    output_queue = multiprocessing.Queue()
    processes = []

    for i in range(num_workers):
        start = i * chunk_size
        end = len(wiki_lines) if i == num_workers - 1 else (i + 1) * chunk_size
        p = multiprocessing.Process(target=bm25_worker, args=(wiki_lines[start:end], questions, i, output_queue))
        processes.append(p)
        p.start()

    processed_count = 0
    total_lines = len(wiki_lines)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        completed_processes = 0
        while completed_processes < num_workers:
            result = output_queue.get()
            if result[0] == "DONE":
                worker_id = result[1]
                completed_processes += 1
                print(f"Worker {worker_id} completed. Total completed processes: {completed_processes}/{num_workers}")
            elif result[0] == "ERROR":
                worker_id = result[1]
                error_message = result[1]
                print(f"Worker {worker_id} encountered an error: {error_message}")
            else:
                worker_id, processed_lines = result
                output_file.writelines(processed_lines)
                processed_count += len(processed_lines)
                print(f"Worker {worker_id} processed {len(processed_lines)} lines. Total processed: {processed_count}/{total_lines}")

    for p in processes:
        p.join()

    end_time = datetime.now()
    elapsed_time = (end_time - start_time).total_seconds()
    print(f"Processing completed at: {end_time}")
    print(f"Total processing time: {elapsed_time:.2f} seconds")

qa_file_path = '/home/eternal/qa_train.json'
questions = extract_questions(qa_file_path)
print(questions[:10])  # 추출된 질문 일부 출력

wiki_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'
output_file_path = '/home/eternal/processed_wikipedia.txt/relevant_sentences1.txt'
extract_and_preprocess_sentences(questions, wiki_file_path, output_file_path)
print(f"Extracted relevant sentences saved to {output_file_path}.")


['in what year the the venue that marcia white is president of open', 'what country is home to the sports club loaning bruno paulista to vasco da gama', 'southern air featured ray stevens minnie pearl and what other southern comedian', 'the treaty of prairie du chien may refer to any of several treaties made between the united states and representatives from the native american people who called themselves what', 'what profession do thomas merton and michael moorcock have in common', 'who was the first person to describe relational quantum mechanics and has worked in italy the united states and france', 'where is the silver dollar city theme park with the grand exposition  steel roller coaster located', 'finna get loose is a song by puff daddy and what american rapper who is also a record and film producer', 'what town is hore abbey located near', 'in what year was the actress who was starred in the telling with holly madison born']
Processing started at: 2024-06-09 16:49:07.102569


KeyboardInterrupt: 

In [1]:
# 데미지2

import ijson
import re
import os
import time
import multiprocessing
from rank_bm25 import BM25Okapi
from datetime import datetime
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(preprocess_text(value))
    return questions

def get_bert_embeddings(sentences, model, tokenizer):
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

def calculate_similarity(embedding1, embedding2):
    return torch.cosine_similarity(embedding1, embedding2)


In [2]:
qa_file_path = '/home/eternal/qa_train.json'
questions = extract_questions(qa_file_path)
print(questions[:10])  # 추출된 질문 일부 출력


['in what year the the venue that marcia white is president of open', 'what country is home to the sports club loaning bruno paulista to vasco da gama', 'southern air featured ray stevens minnie pearl and what other southern comedian', 'the treaty of prairie du chien may refer to any of several treaties made between the united states and representatives from the native american people who called themselves what', 'what profession do thomas merton and michael moorcock have in common', 'who was the first person to describe relational quantum mechanics and has worked in italy the united states and france', 'where is the silver dollar city theme park with the grand exposition  steel roller coaster located', 'finna get loose is a song by puff daddy and what american rapper who is also a record and film producer', 'what town is hore abbey located near', 'in what year was the actress who was starred in the telling with holly madison born']


In [3]:
def bm25_bert_worker(wiki_lines, questions, worker_id, output_queue, progress_array, model_name, tokenizer_name):
    try:
        print(f"Worker {worker_id} started.")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        model = BertModel.from_pretrained(model_name)
        tokenized_corpus = [preprocess_text(line).split() for line in wiki_lines]
        bm25 = BM25Okapi(tokenized_corpus)
        relevant_sentences = []

        total_questions = len(questions)
        for idx, question in enumerate(questions):
            if idx % 100 == 0:
                print(f"Worker {worker_id}: Processing question {idx}/{total_questions}")
            
            tokenized_question = question.split()
            bm25_scores = bm25.get_scores(tokenized_question)
            top_n_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:5]
            top_n_sentences = [wiki_lines[i] for i in top_n_indices]

            question_embedding = get_bert_embeddings([question], model, tokenizer)
            sentence_embeddings = get_bert_embeddings(top_n_sentences, model, tokenizer)
            similarities = [calculate_similarity(question_embedding, sentence_embedding.unsqueeze(0)) for sentence_embedding in sentence_embeddings]

            best_sentence_idx = similarities.index(max(similarities))
            relevant_sentences.append(top_n_sentences[best_sentence_idx])

            if (idx + 1) % 10 == 0:  # 10개의 질문마다 진행 상황을 업데이트
                progress_array[worker_id] = idx + 1

        output_queue.put((worker_id, relevant_sentences))
        output_queue.put(("DONE", worker_id))
    except Exception as e:
        output_queue.put((worker_id, f"ERROR: {e}"))
        print(f"Worker {worker_id} encountered an error: {e}")

def extract_and_preprocess_sentences_single_process(questions, wiki_file_path, output_file_path):
    start_time = datetime.now()
    print(f"Processing started at: {start_time}")

    model_name = 'bert-base-uncased'
    tokenizer_name = 'bert-base-uncased'

    with open(wiki_file_path, 'r', encoding='utf-8') as wiki_file:
        wiki_lines = wiki_file.readlines()

    output_queue = multiprocessing.Queue()
    progress_array = multiprocessing.Array('i', 1)

    bm25_bert_worker(wiki_lines, questions, 0, output_queue, progress_array, model_name, tokenizer_name)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        while not output_queue.empty():
            result = output_queue.get()
            if result[0] == "DONE":
                print(f"Worker completed.")
            elif result[0] == "ERROR":
                worker_id, error_message = result
                print(f"Worker encountered an error: {error_message}")
            else:
                worker_id, processed_lines = result
                output_file.writelines(processed_lines)
                print(f"Processed {len(processed_lines)} lines.")

    end_time = datetime.now()
    elapsed_time = (end_time - start_time).total_seconds()
    print(f"Processing completed at: {end_time}")
    print(f"Total processing time: {elapsed_time:.2f} seconds")





In [None]:
wiki_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'
output_file_path = '/home/eternal/processed_wikipedia.txt/relevant_sentences.txt'
extract_and_preprocess_sentences_single_process(questions, wiki_file_path, output_file_path)
print(f"Extracted relevant sentences saved to {output_file_path}.")


Processing started at: 2024-06-09 19:10:30.271591
Worker 0 started.


In [None]:
#동일하게 데미지

import ijson
import re
from difflib import SequenceMatcher
import nltk
from nltk.corpus import words
from spellchecker import SpellChecker
import sys
import time
from multiprocessing import Pool

# Ensure nltk word list is downloaded
nltk.download('words')

In [14]:
def extract_questions(file_path):
    questions = []
    with open(file_path, 'r', encoding='utf-8') as file:
        parser = ijson.parse(file)
        for prefix, event, value in parser:
            if prefix.endswith('.question') and event == 'string':
                questions.append(value)
    return questions

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text

def correct_spelling(text):
    spell = SpellChecker()
    corrected_text = []
    for word in text.split():
        corrected_word = spell.correction(word)
        if corrected_word is None:
            corrected_word = word
        corrected_text.append(corrected_word)
    return ' '.join(corrected_text)

def is_relevant_sentence(question, sentence, threshold=0.6):
    question = correct_spelling(preprocess_text(question))
    sentence = correct_spelling(preprocess_text(sentence))
    similarity = SequenceMatcher(None, question, sentence).ratio()
    return similarity > threshold

def process_chunk(questions, chunk, threshold=0.6):
    relevant_sentences = set()
    for line in chunk:
        for question in questions:
            if is_relevant_sentence(question, line, threshold):
                relevant_sentences.add(line.strip())
                break
    return relevant_sentences

def extract_and_replace_relevant_sentences(questions, wiki_file_path, output_file_path, chunk_size=1000, num_workers=12):
    relevant_sentences = set()
    total_lines = sum(1 for _ in open(wiki_file_path, 'r', encoding='utf-8'))

    start_time = time.time()
    with open(wiki_file_path, 'r', encoding='utf-8') as wiki_file:
        pool = Pool(num_workers)
        chunk = []
        for i, line in enumerate(wiki_file):
            chunk.append(line)
            if (i + 1) % chunk_size == 0:
                elapsed_time = time.time() - start_time
                lines_processed = i + 1
                remaining_lines = total_lines - lines_processed
                estimated_total_time = (elapsed_time / lines_processed) * total_lines
                estimated_remaining_time = estimated_total_time - elapsed_time
                print(f"Processing line {lines_processed}/{total_lines}")
                print(f"Elapsed time: {elapsed_time:.2f} seconds")
                print(f"Estimated remaining time: {estimated_remaining_time / 3600:.2f} hours")
                sys.stdout.flush()
                # Use starmap to pass multiple arguments to process_chunk
                results = pool.starmap(process_chunk, [(questions, chunk, 0.6)])
                for result in results:
                    relevant_sentences.update(result)
                chunk = []
        
        # Process remaining lines
        if chunk:
            results = pool.starmap(process_chunk, [(questions, chunk, 0.6)])
            for result in results:
                relevant_sentences.update(result)

    pool.close()
    pool.join()

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for sentence in relevant_sentences:
            output_file.write(sentence + '\n')

    return relevant_sentences


In [15]:
# 파일 경로 설정
qa_file_path = '/home/eternal/qa_train.json'
wiki_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia.txt'
output_file_path = '/home/eternal/processed_wikipedia.txt/processed_wikipedia_filtered.txt'

# 질문 추출
print("Extracting questions...")
questions = extract_questions(qa_file_path)
print(f"Extracted {len(questions)} questions.")
print(questions[:10])  # 추출된 질문 일부 출력

# 관련 문장 추출 및 저장
print("Extracting relevant sentences...")
relevant_sentences = extract_and_replace_relevant_sentences(questions, wiki_file_path, output_file_path)
print(f"Extracted {len(relevant_sentences)} relevant sentences.")


Extracting questions...
Extracted 88066 questions.
['In what year the the venue that Marcia White is president of open?', 'What country is home to the sports club loaning Bruno Paulista to Vasco da Gama?', '"Southern Air" featured Ray Stevens, Minnie Pearl and what other Southern comedian?', 'The Treaty of Prairie du Chien may refer to any of several treaties made between the United States and representatives from the native american people who called themselves what?', 'What profession do Thomas Merton and Michael Moorcock have in common?', 'Who was the first person to describe Relational Quantum mechanics and has worked in Italy, the United States, and France?', 'Where is the Silver Dollar city theme park with the Grand exposition  steel roller coaster located?', 'Finna Get Loose is a song by Puff Daddy and what American rapper who is also a record and film producer?', 'What town is Hore Abbey located near?', 'In what year was the actress who was starred in "The Telling" with Holly M

Process ForkPoolWorker-106:
Process ForkPoolWorker-107:
Process ForkPoolWorker-102:
Process ForkPoolWorker-97:
Process ForkPoolWorker-98:
Process ForkPoolWorker-108:


KeyboardInterrupt: 

Process ForkPoolWorker-105:
Process ForkPoolWorker-100:
Process ForkPoolWorker-104:
Process ForkPoolWorker-101:
Process ForkPoolWorker-103:
Process ForkPoolWorker-99:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Tra