In [3]:
import os
import pypdf  # ✅ PyPDF 대신 pypdf 사용!

def load_pdf_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = pypdf.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

# 논문 PDF 파일 경로 지정
pdf_path = "/Users/anchanghun/Downloads/Document_ach/paper/9_AnomalyDetection/TSSAN_Time-Space_Separable_Attention_Network_for_Intrusion_Detection.pdf"
text = load_pdf_text(pdf_path)

# 출력 확인
print(f"📄 논문에서 추출한 텍스트 (일부):\n{text[:500]}")

📄 논문에서 추출한 텍스트 (일부):
Received 14 June 2024, accepted 13 July 2024, date of publication 16 July 2024, date of current version 24 July 2024.
Digital Object Identifier 10.1 109/ACCESS.2024.3429420
TSSAN: Time-Space Separable Attention Network
for Intrusion Detection
RUI XU
 , QI ZHANG, AND YUNJIE ZHANG
School of Computer Science and Technology, Soochow University, Suzhou, Jiangsu 215008, China
Corresponding author: Rui Xu (rxu1026@stu.suda.edu.cn)
ABSTRACT With the continuous evolution of novel network attacks, traditi


In [4]:
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

# 무료 임베딩 모델 (Hugging Face)
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# 텍스트를 문장 단위로 나누기
sentences = text.split("\n")
embeddings = embedding_model.encode(sentences)

# FAISS 벡터 저장소 생성
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# 저장된 벡터 개수 확인
print(f"🔍 저장된 문장 개수: {index.ntotal}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔍 저장된 문장 개수: 1265


In [5]:
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

# 무료 임베딩 모델 (Hugging Face)
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# 텍스트를 문장 단위로 나누기
sentences = text.split("\n")
embeddings = embedding_model.encode(sentences)

# FAISS 벡터 저장소 생성
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# 저장된 벡터 개수 확인
print(f"🔍 저장된 문장 개수: {index.ntotal}")

🔍 저장된 문장 개수: 1265


In [6]:
def search_relevant_text(query, top_k=3):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    
    results = [sentences[idx] for idx in indices[0]]
    return "\n".join(results)

# 사용자 질문 입력
query = "이 논문의 주요 기여는 무엇인가?"
retrieved_text = search_relevant_text(query)

print(f"🔍 검색된 문서 내용:\n{retrieved_text}")

🔍 검색된 문서 내용:
To assess the performance of TSSAN, we conducted
K LN
TSSAN (supervised). Nevertheless, in terms of training


In [None]:
from transformers import pipeline

# ✅ 무료 공개 LLM 모델 사용 (Gated Repo 필요 없음)
qa_pipeline = pipeline("text-generation", model="tiiuae/falcon-7b-instruct")

def generate_answer(query, context):
    prompt = f"질문: {query}\n\n관련 정보: {context}\n\n답변:"
    response = qa_pipeline(prompt, max_length=200, do_sample=True)
    return response[0]['generated_text']

# 테스트 실행
query = "이 논문의 주요 기여는 무엇인가?"
context = "논문의 핵심 내용을 포함한 검색 결과"
answer = generate_answer(query, context)

print(f"📝 질문: {query}\n💡 답변: {answer}")

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def summarize_text(text):
    prompt = f"다음 논문 내용을 요약해줘:\n{text[:2000]}"  # 길이 제한 (2000자)
    response = qa_pipeline(prompt, max_length=300, do_sample=True)
    return response[0]['generated_text']

summary = summarize_text(text)

print(f"📄 논문 요약:\n{summary}")