In [None]:
%%capture
# Install the third-party packages this notebook uses; %%capture silences pip's output.
# PDF text extraction.
%pip install pdfplumber
# NOTE(review): pytesseract is not imported in the cells visible here -- confirm it is still needed.
%pip install pytesseract
# LangChain core plus its community, Hugging Face and Google Generative AI integrations.
%pip install langchain
%pip install langchain-community
%pip install langchain-huggingface
%pip install langchain_google_genai
# Tensor backend for the embedding model and the FAISS vector index.
%pip install torch
%pip install faiss-gpu
# Web UI for the chatbot demo.
%pip install gradio

In [None]:
from langchain.docstore.document import Document
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

import pdfplumber



In [None]:
import torch

# Report which CUDA devices PyTorch can see on this runtime.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
# Asking for the name of device 0 raises when no CUDA device exists,
# so only query it on a GPU runtime.
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; running on CPU")

True
1
Tesla T4


In [None]:
# Load the plain-text knowledge source. The encoding is passed explicitly so
# decoding does not depend on the runtime's default locale.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
documents = TextLoader("/content/_.txt", encoding="utf-8").load()

In [None]:
class PDFPlumberLoader:
    """Load a PDF with pdfplumber as one ``Document`` per page that has text."""

    def __init__(self, file_path: str):
        """Store the path of the PDF; the file is only opened by ``load``."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Extract the text of every non-blank page.

        Returns:
            One ``Document`` per page whose extracted text is not blank, in
            page order. Each document's metadata holds ``source`` (the file
            path), ``page`` (1-based page number) and ``total_pages``.
        """
        documents = []

        with pdfplumber.open(self.file_path) as pdf:
            # The page count is the same for every page, so compute it once.
            total_pages = len(pdf.pages)

            for page_number, page in enumerate(pdf.pages, start=1):
                # Some pdfplumber versions return None for a page with no
                # extractable text; treat that as empty so the page is
                # skipped instead of crashing on .strip().
                text = page.extract_text() or ""
                if not text.strip():
                    continue

                documents.append(
                    Document(
                        page_content=text,
                        metadata={
                            "source": self.file_path,
                            "page": page_number,
                            "total_pages": total_pages,
                        },
                    )
                )

        return documents

# Path of the PDF to index (hard-coded Colab location).
pdf_filepath = '/content/1712809877635_0.pdf'
# Read the PDF into one Document per page that contains text.
loader = PDFPlumberLoader(pdf_filepath)
pages = loader.load()

In [None]:
# Single corpus: the text-file documents first, then the PDF pages.
combined_documents = [*documents, *pages]

In [None]:
# Chunking parameters; sizes are counted in characters because the length function is len.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Break every loaded document into overlapping chunks ready for embedding.
texts = text_splitter.split_documents(combined_documents)

In [None]:
import torch

# Hugging Face model id of the embedding model.
model_name = "BAAI/bge-m3"
# Use the GPU when one is present and fall back to CPU otherwise, so the cell
# also runs on a CPU-only runtime instead of failing on a hard-coded "cuda".
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
# Ask the encoder to return normalized embedding vectors.
encode_kwargs = {"normalize_embeddings": True}
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# Embed every chunk and build the FAISS index used for retrieval.
vectorstore = FAISS.from_documents(texts, hf_embeddings)

In [None]:
# Number of chunks fetched from the index for each query.
TOP_K = 4
retriever = vectorstore.as_retriever(search_kwargs=dict(k=TOP_K))

In [None]:
from langchain.prompts import PromptTemplate

# Instructions for the assistant; {question} and {context} are filled in when
# the prompt is formatted.
template = """
You are an intelligent school assistant designed to provide students with accurate, concise, and contextually relevant answers about their academic needs. Pay special attention to nuances in questions, including abbreviations, informal language, and the tone of the student's question.

**Guidelines:**
1. **Interpreting Abbreviated Department Names**: If a department or major name is given as an abbreviation or shortened form (e.g., '소융과' for 소프트웨어융합과, '스아과' for 스마트IT과, '스건과' as '스포츠건강관리과'), recognize and interpret it correctly to provide accurate information. Abbreviations may vary, so rely on context and known abbreviations to clarify meaning.

2. **Answer Clarity and Brevity**: Provide answers that are precise, concise, and directly address the question. Avoid unnecessary detail unless it adds value. Ensure that each answer is easily understandable and focused on the student’s main concern.

3. **Translation Accuracy**: When translation is needed, respond in the language the student used for their question, ensuring all key details and meaning are preserved. Adapt the tone as needed for smooth, natural phrasing within the context, always maintaining professionalism and approachability.

4. **Handling Uncertainty**: If any part of the question is unclear, or if you lack specific information to answer confidently, state that you do not know, and suggest an alternative if applicable.

Question: {question}
Context: {context}
Answer:
"""

# Prompt object handed to the QA chain; both placeholders above must be supplied.
PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [None]:
import os
from getpass import getpass

# Do not hardcode the key in the notebook: keep a key that is already set in
# the environment, otherwise ask for it interactively without echoing it.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

In [None]:
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
from langchain.chains import RetrievalQA

# Extra arguments for the document-combining step: use the custom prompt.
chain_type_kwargs = {"prompt": PROMPT}

# Chat model that writes the answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    # NOTE(review): this sets the "dangerous content" category to BLOCK_NONE --
    # confirm that is intended for a student-facing assistant.
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
)

# Retrieval QA chain: the retriever supplies documents, the "stuff" chain type
# passes them to the model through PROMPT, and the source documents are
# returned alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    # Reuse the dict defined above instead of rebuilding an identical one.
    chain_type_kwargs=chain_type_kwargs,
)

In [None]:
from __future__ import annotations
from typing import Iterable
import gradio as gr
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes

In [None]:
def response(message, chat_history):
    """Answer one chat message with the QA chain and extend the conversation.

    Args:
        message: Text submitted by the user.
        chat_history: List of ``{'role': ..., 'content': ...}`` dicts holding
            the conversation so far; ``None`` is treated as an empty history.

    Returns:
        A ``("", history)`` tuple. The empty string is the new value for the
        input box and ``history`` is the conversation to display. Blank
        messages return the history unchanged without querying the chain.
    """
    # Work on a copy so the caller's list is never mutated, and tolerate a
    # missing history.
    history = list(chat_history) if chat_history else []

    # Ignore blank submissions instead of sending an empty query to the chain.
    if not message or not message.strip():
        return "", history

    # Get the bot's answer.
    result = chain.invoke({"query": message})
    bot_message = result['result']

    # Append the user and bot messages as role/content dicts.
    history.append({'role': 'user', 'content': message})
    history.append({'role': 'assistant', 'content': bot_message})

    # Return the cleared input and the updated chat.
    return "", history

# Stylesheet: import the Pretendard web font and apply it to every element.
css = """
@import url("https://cdn.jsdelivr.net/gh/orioncactus/pretendard@v1.3.8/dist/web/static/pretendard.css");

*{
    font-family: "Pretendard", sans-serif !important;
}

"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    # Title and short introduction shown above the chat window.
    gr.Markdown(
        """
        # 나래스톤🌟
        나래스톤은 재학생 분들을 위한 챗봇입니다.
        교내 생활 중 궁금한 점을 자유롭게 물어보세요!
        """
    )

    # Conversation display using role/content message dicts.
    chatbot = gr.Chatbot(type='messages')

    # Input box and send button side by side.
    with gr.Row():
        msg = gr.Textbox(placeholder="궁금한 점을 입력해주세요..", show_label=False, container=False)
        submit_btn = gr.Button("전송", variant="primary")

    # Sample questions that fill the input box.
    example_questions = [
        "스마트IT과 과사 전화번호 뭐야",
        "Hey, do you know where the water cooler is in the Information and Culture Center?",
        "図書館の開館時間を教えてください",
    ]
    gr.Examples(examples=example_questions, inputs=msg, label="이렇게 질문해보세요")

    # Clicking the button and submitting the textbox both run the same handler.
    for register in (submit_btn.click, msg.submit):
        register(fn=response, inputs=[msg, chatbot], outputs=[msg, chatbot])

# Start the app with a public share link.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9c52c7d9d5250204a7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


