In [38]:
import os
import pandas as pd
from src.utils import read_json
import google.generativeai as genai
import google.ai.generativelanguage as glm

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [23]:
# Load the organized results table from a project-relative CSV. Later cells
# read its 'reason' and 'target' columns.
organized_result = pd.read_csv('temp_result/organized_result_1.csv')
# Trailing expression: the cell displays the (rows, columns) shape.
organized_result.shape

(76, 5)

In [9]:
# Read the Google API key from the local JSON config (via the project's
# read_json helper) so the key itself never appears in the notebook.
configs = read_json('.env/configs.json')
api_key = configs['g_key']

# Expose the key both through the environment variable and through the
# google.generativeai client configuration.
os.environ["GOOGLE_API_KEY"] = api_key
genai.configure(api_key=api_key)

In [10]:
# Generation settings. Nothing in the cells visible here passes this dict
# to a model call.
generation_config = dict(
    temperature=0.9,
    top_p=1,
    top_k=1,
    max_output_tokens=2048,
)

# One entry per harm category, all sharing the same blocking threshold.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Print the metadata genai.get_model returns for each model, with a blank
# line before the first entry and after each one.
print()
for model_id in ('models/gemini-pro', 'models/gemini-pro-vision'):
    model_info = genai.get_model(model_id)
    print(model_info, '\n')


Model(name='models/gemini-pro',
      base_model_id='',
      version='001',
      display_name='Gemini 1.0 Pro',
      description='The best model for scaling across a wide range of tasks',
      input_token_limit=30720,
      output_token_limit=2048,
      supported_generation_methods=['generateContent', 'countTokens'],
      temperature=0.9,
      top_p=1.0,
      top_k=None) 

Model(name='models/gemini-pro-vision',
      base_model_id='',
      version='001',
      display_name='Gemini 1.0 Pro Vision',
      description='The best image understanding model to handle a broad range of applications',
      input_token_limit=12288,
      output_token_limit=4096,
      supported_generation_methods=['generateContent', 'countTokens'],
      temperature=0.4,
      top_p=1.0,
      top_k=32) 



In [17]:
# Splitter for long documents: chunk size 1000 with an overlap of 500,
# where length is measured by the built-in len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=500,
    length_function=len,
)

In [24]:
# Two loaders over the same frame: one uses the 'reason' column as the
# document text, the other uses the 'target' column.
reason_loader = DataFrameLoader(organized_result, page_content_column="reason")
target_loader = DataFrameLoader(organized_result, page_content_column="target")

reason_data = reason_loader.load()
target_data = target_loader.load()

# Only the 'reason' documents are chunked here; the count is printed below.
reason_documents = text_splitter.transform_documents(reason_data)
print("已為所有文件進行 chunk ",len(reason_documents),"筆")

已為所有文件進行 chunk  76 筆


In [26]:
# Hugging Face model id for the embedding model (a multilingual
# sentence-transformers checkpoint, going by its name).
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
# Shared embedding object: used to build and reload the FAISS index and to
# embed queries in later cells.
embeddings = HuggingFaceEmbeddings(model_name=model_name)



In [30]:
vector_db_path = "./vector_db/"
chroma_collection_name = "langchain"  # not referenced by the cells visible here

# Create the index folder on first run and report that it was created.
if not os.path.exists(vector_db_path):
    os.makedirs(vector_db_path)
    print(f"Created folder: {vector_db_path}")

# Build a FAISS index over the chunked 'reason' documents and save it.
vector_store = FAISS.from_documents(reason_documents, embeddings)
vector_store.save_local(folder_path=vector_db_path)

# Reload the index from disk. The deserialization flag is an explicit
# opt-in; the files being read are the ones this cell saved just above.
vectorstore = FAISS.load_local(
    folder_path=vector_db_path,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [34]:
# query = "查詢: 行政院原住民委員會的採購案"
# embedding_vector = embeddings.embed_query(query)
# docs = vectorstore.similarity_search_by_vector(embedding_vector, k=1)

# match_content_list = []
# for page in docs:
#     match_content_list.append(page.page_content)
#     print(page.page_content)
#     print(page.metadata["target"])
#     print(page.metadata['document'])

行政院客家委員會、行政院原住民族委員會及僑務委員會將客家電視、原住民電視及臺灣宏觀電視等頻道節目之製播，依政府採購法委由財團法人公共電視文化事業基金會辦理，雖非無據，惟造成公視基金會與客委會、原民會及僑委會針對上開頻道節目之製播各有立場、互生歧見，致紛爭不斷，詎行政院新聞局坐視上開爭端，非但未積極研修法令，亦未尋求妥適解決辦法，核有怠失，爰依法提案糾正
行政院新聞局
52587.pdf


In [35]:
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

# Gemini chat model used by the query chain. The three listed harm
# categories are all set to BLOCK_NONE.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    convert_system_message_to_human=True,
    safety_settings={
        harm_category: HarmBlockThreshold.BLOCK_NONE
        for harm_category in (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    },
)

In [74]:
def query_similarity_content(query, top_k=1):
    """Retrieve the passages most similar to ``query`` and prepare an LLM chain.

    The query is embedded with the module-level ``embeddings`` object, the
    ``top_k`` nearest documents are fetched from the module-level
    ``vectorstore``, and a prompt | llm chain is built around them.

    Args:
        query: The user's question as a string.
        top_k: Number of documents to retrieve (default 1).

    Returns:
        A tuple ``(chain, data, reference)``:
          * ``chain`` - the ``PromptTemplate | llm`` pipeline.
          * ``data`` - dict with the ``reference`` and ``query`` template inputs.
          * ``reference`` - dict mapping hit rank (0-based) to a dict with
            ``reference_content`` (passage text) and ``metadata``.
    """
    embedding_vector = embeddings.embed_query(query)
    docs = vectorstore.similarity_search_by_vector(embedding_vector, k=top_k)

    # Keep every hit, keyed by rank, so the caller can cite its source.
    reference = {
        rank: {
            "reference_content": page.page_content,
            "metadata": page.metadata,
        }
        for rank, page in enumerate(docs)
    }

    template = """
    ### INSTRUCTION: 你是一位資深的監察院資料查詢專家。你的目標是解釋以下 REF 資料，用簡潔準確的方式説明其他人瞭解查詢到的相關內容。",
    ### REF: {reference}
    ### USER: {query}
    ### ASSISTANT: """
    prompt = PromptTemplate.from_template(template)
    chain = prompt | llm

    # Join the retrieved passages themselves. Joining over a single passage
    # string would iterate its characters and put a space between each one.
    # All top_k hits are included; no hits gives an empty reference text.
    reference_text = " ".join(
        entry["reference_content"] for entry in reference.values()
    )
    data = {
        "reference": reference_text,
        "query": query,
    }
    return chain, data, reference

query = "行政院原住民委員會的相關資訊"

# Build the chain for this query, then stream the answer piece by piece
# so it prints as it is generated.
chain, data, reference = query_similarity_content(query)
for chunk in chain.stream(data):
    print(chunk.content, end="", flush=True)

# Cite the source document of the top-ranked hit.
print("\n資料來源:", reference[0]['metadata']['document'])



行政院原住民委員會將原住民電視頻道的節目製作委託給公共電視文化事業基金會辦理，但與基金會在節目製作上意見分歧，導致糾紛不斷。行政院新聞局未積極研議法令或尋求解決方案，因此監察院糾正行政院新聞局。
資料來源: 52587.pdf
