<a href="https://colab.research.google.com/github/estorl03-tech/RAG/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade --quiet openai

In [None]:
pip show openai

In [None]:
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import userdata

# Copy the OpenAI API key from Colab's user-data store into the process
# environment. The client below is built without an explicit key, so it
# presumably reads OPENAI_API_KEY from the environment -- confirm against the
# openai package documentation.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
# Shared API client used by vectorize_text() and ask_question().
client = OpenAI()

def vectorize_text(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    using the ``text-embedding-3-small`` model and returns the ``embedding``
    attribute of the first item in the response data.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

def scrape_article(
    url,
    start_keyword="未来の宇宙航空技術について",
    end_keyword="Designed with WordPress",
    timeout=10,
):
    """Download a page and return the text of its ``entry-content`` section.

    The text of every ``<div class="entry-content">`` element is concatenated
    with newline characters removed. If ``start_keyword`` occurs in that text
    and ``end_keyword`` occurs somewhere after it, only the span from the
    start marker up to (but not including) the end marker is returned, with
    surrounding whitespace stripped. If either marker is missing, the whole
    concatenated text is returned unchanged.

    Args:
        url: Address of the page to fetch.
        start_keyword: Marker that begins the section of interest (the start
            of the Q&A part of the page).
        end_keyword: Marker that ends the section of interest.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Returns:
        The extracted text as a single string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For other request failures such as a
            timeout or connection error.
    """
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than extracting text from an HTTP error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    text_nodes = soup.find_all("div", class_="entry-content")

    # Remove real newline characters ("\n"). Replacing the two-character
    # sequence "/n" would leave line breaks in place and damage text such as
    # URLs that happen to contain "/n".
    joined_text = "".join(node.text.replace("\n", "") for node in text_nodes)

    # Locate the marker that opens the Q&A section.
    start_pos = joined_text.find(start_keyword)
    if start_pos == -1:
        # Marker not found: fall back to the untrimmed text.
        return joined_text

    # Look for the closing marker only after the opening one, so an earlier
    # occurrence of the end marker cannot prevent trimming.
    end_pos = joined_text.find(end_keyword, start_pos + len(start_keyword))
    if end_pos == -1:
        return joined_text

    return joined_text[start_pos:end_pos].strip()

def chunk_text(text, chunk_size, overlap):
    """Split ``text`` into overlapping fixed-size chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk shares its first ``overlap`` characters with the end of the
    previous one. Chunking stops as soon as a chunk reaches the end of
    ``text``; no extra trailing chunk consisting only of already-covered
    characters is produced.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order. Empty when ``text`` is
        empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the window stand still or move
    # backwards (an endless loop); a negative overlap would skip text.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This chunk already covers the tail of the text; any further
            # chunk would only repeat characters contained in it.
            break
    return chunks

def find_most_similar(question_vector, chunk_vectors, chunks):
    """Return the chunk whose vector is most similar to the question vector.

    Similarity is the cosine of the angle between ``question_vector`` and each
    entry of ``chunk_vectors``. All similarities are computed in one NumPy
    operation. When several chunks share the highest score, the earliest one
    is returned.

    Args:
        question_vector: Sequence of floats representing the question.
        chunk_vectors: Sequence of vectors, one per chunk, each with the same
            length as ``question_vector``.
        chunks: Sequence of chunk strings aligned index-by-index with
            ``chunk_vectors``.

    Returns:
        The best-matching chunk, or ``""`` when there are no vector/chunk
        pairs to compare.
    """
    # Only compare as many vectors as there are chunks (and vice versa), so a
    # length mismatch is tolerated rather than raising.
    pair_count = min(len(chunk_vectors), len(chunks))
    if pair_count == 0:
        return ""

    query = np.asarray(question_vector, dtype=float)
    matrix = np.asarray(chunk_vectors[:pair_count], dtype=float)

    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # A zero-length vector has no direction; score it 0 instead of dividing
    # by zero (the numerator is 0 in that case as well).
    denominators[denominators == 0] = 1.0
    similarities = (matrix @ query) / denominators

    # argmax imposes no lower bound on the winning score, so a chunk is
    # returned even when every similarity is -1.
    return chunks[int(np.argmax(similarities))]

def ask_question(question, context):
    """Ask the chat model ``question`` with ``context`` as supporting text.

    Builds a two-message conversation (an empty system message followed by a
    user message that embeds the context and the question under labelled
    headings), sends it through the module-level ``client`` with a 300-token
    completion limit, and returns the text content of the first choice.
    """
    system_message = {"role": "system", "content": ""}
    user_message = {
        "role": "user",
        "content": f"[情報]\n{context}\n\n[質問]\n{question}",
    }
    completion = client.chat.completions.create(
        max_tokens=300,
        messages=[system_message, user_message],
        model="gpt-4o-mini-2024-07-18",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content



# --- Retrieval-augmented question answering over a single web page ---

# Page to index, and the chunking parameters (in characters) for chunk_text().
url = "https://space0024.wordpress.com/"
chunk_size = 1000
overlap = 100

# Fetch the page and extract the text section of interest.
article_text = scrape_article(url)

# Split the text into overlapping chunks and embed each one; this issues one
# embedding request per chunk.
text_chunks = chunk_text(article_text, chunk_size, overlap)
vectors = [vectorize_text(chunk) for chunk in text_chunks]

# The user's question.
question = "宇宙太陽光発電とはどんな計画ですか？"

# Embed the question with the same model so it is comparable to the chunks.
question_vector = vectorize_text(question)

# Pick the single chunk closest to the question to use as context.
best_chunk = find_most_similar(question_vector, vectors, text_chunks)

# Ask the chat model to answer the question using only that chunk as context.
answer = ask_question(question, best_chunk)
print(answer)

宇宙太陽光発電は、「宇宙太陽光利用システム（SSPS）」として知られ、静止軌道上で太陽光を集め、それを地上へ送る構想です。このシステムでは、宇宙空間で集めた太陽エネルギーを効率的に利用し、電力を地上に供給することが目指されています。
