In [4]:
import requests
from bs4 import BeautifulSoup
import os
from time import sleep

def get_html(url):
    response = requests.get(url)
    return response.text

def parse_walkthrough_parts(game_url):
    html_content = get_html(game_url)
    soup = BeautifulSoup(html_content, 'html.parser')
    parts = []

    table = soup.find('table', {'class': 'roundy'})

    if table:
        for row in table.find_all('tr', {'style': 'background: #FFF;'}):
            part = row.find('th')
            keywords = row.find('td')
            if part and keywords:
                part_name = part.get_text(strip=True)
                part_url = 'https://bulbapedia.bulbagarden.net' + part.find('a')['href']
                keywords_text = keywords.get_text(strip=True)
                parts.append((part_name, part_url, keywords_text))

    return parts

def scrape_walkthrough_text(part_url):
    html_content = get_html(part_url)
    soup = BeautifulSoup(html_content, 'html.parser')
    
    walkthrough_div = soup.find('div', class_='mw-parser-output')
    if walkthrough_div:
        walkthrough_text = walkthrough_div.get_text(separator='\n', strip=True)
        return walkthrough_text
    return ""

def scrape_core_game_walkthrough(core_game_url):
    parts = parse_walkthrough_parts(core_game_url)
    walkthrough_data = []

    for part_name, part_url, keywords in parts:
        text = scrape_walkthrough_text(part_url)
        walkthrough_data.append({
            'Part': part_name,
            'URL': part_url,
            'Keywords': keywords,
            'Text': text
        })
        sleep(1)  # To avoid overwhelming the server

    return walkthrough_data

def save_walkthrough_as_text(game_name, walkthrough_data):
    folder_path = os.path.join('walkthroughs', game_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    for part in walkthrough_data:
        filename = os.path.join(folder_path, f"{part['Part'].replace('/', '_')}.txt")
        
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"Part: {part['Part']}\n")
            file.write(f"URL: {part['URL']}\n")
            file.write(f"Keywords: {part['Keywords']}\n\n")
            file.write("Walkthrough Text:\n")
            file.write(part['Text'])
        
        print(f"Saved {filename}")

def main():
    core_games = [
        ('LegendsArceus', 'https://bulbapedia.bulbagarden.net/wiki/Appendix:Legends:_Arceus_walkthrough'),
        ('Scarlet_and_Violet', 'https://bulbapedia.bulbagarden.net/wiki/Appendix:Scarlet_and_Violet_walkthrough')
    ]

    # Create the main walkthroughs folder if it doesn't exist
    if not os.path.exists('walkthroughs'):
        os.makedirs('walkthroughs')

    for game_name, game_url in core_games:
        print(f"Scraping walkthrough for {game_name}...")
        walkthrough_data = scrape_core_game_walkthrough(game_url)
        save_walkthrough_as_text(game_name, walkthrough_data)
        print(f"Finished scraping {game_name}\n")

if __name__ == '__main__':
    main()

Scraping walkthrough for LegendsArceus...
Saved walkthroughs\LegendsArceus\Part 1.txt
Saved walkthroughs\LegendsArceus\Part 2.txt
Saved walkthroughs\LegendsArceus\Part 3.txt
Saved walkthroughs\LegendsArceus\Part 4.txt
Saved walkthroughs\LegendsArceus\Part 5.txt
Saved walkthroughs\LegendsArceus\Part 6.txt
Saved walkthroughs\LegendsArceus\Part 7.txt
Saved walkthroughs\LegendsArceus\Part 8.txt
Saved walkthroughs\LegendsArceus\Part 9.txt
Saved walkthroughs\LegendsArceus\Part 10.txt
Saved walkthroughs\LegendsArceus\Part 11.txt
Saved walkthroughs\LegendsArceus\Part 12.txt
Saved walkthroughs\LegendsArceus\Part 13.txt
Saved walkthroughs\LegendsArceus\Part 14.txt
Saved walkthroughs\LegendsArceus\Part 15.txt
Saved walkthroughs\LegendsArceus\Part 16.txt
Saved walkthroughs\LegendsArceus\Part 17.txt
Saved walkthroughs\LegendsArceus\Requests: Part 1.txt
Saved walkthroughs\LegendsArceus\Requests: Part 2.txt
Saved walkthroughs\LegendsArceus\Requests: Part 3.txt
Saved walkthroughs\LegendsArceus\Request

In [6]:
import os
from typing import List
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from dotenv import load_dotenv

load_dotenv()

def load_and_embed_walkthroughs(file_paths: List[str]) -> FAISS:
    documents = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                # Extract metadata from the content
                lines = content.split('\n')
                metadata = {
                    'game': lines[0].split(': ')[1],
                    'part': lines[1].split(': ')[1],
                    'keywords': lines[2].split(': ')[1],
                    'file_name': os.path.basename(file_path)
                }
                # Create a Document with the content and metadata
                doc = Document(page_content=content, metadata=metadata)
                documents.append(doc)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(texts, embeddings)
    
    return vectorstore

# Test with the first 4 files
test_files = [
    r"walkthrough_rewrites\Black_and_White\Black_and_White - Part 1 - Introduction, Nuvema Town, Juniper's Lab, Route 1, Accumula Town.txt",
    r"walkthrough_rewrites\Black_and_White\Black_and_White - Part 2 - Route 2, Striaton City, The Dreamyard, Striaton Gym.txt",
    r"walkthrough_rewrites\Black_and_White\Black_and_White - Part 3 - Route 3, Wellspring Cave, Nacrene City, Nacrene Gym.txt",
    r"walkthrough_rewrites\Black_and_White\Black_and_White - Part 4 - Pinwheel Forest, Skyarrow Bridge, Castelia City, Castelia Gym.txt"
]

# Create and save the test vectorstore
test_vectorstore = load_and_embed_walkthroughs(test_files)
test_vectorstore.save_local("test_vectorstore")

print("Test vectorstore created and saved successfully!")

# Function to get all walkthrough files
def get_all_walkthrough_files(root_dir: str) -> List[str]:
    all_files = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))
    return all_files

# Uncomment the following lines when ready to process all files
all_walkthrough_files = get_all_walkthrough_files('walkthrough_rewrites')
full_vectorstore = load_and_embed_walkthroughs(all_walkthrough_files)
full_vectorstore.save_local("full_vectorstore")
print("Full vectorstore created and saved successfully!")

Created a chunk of size 1081, which is longer than the specified 1000
Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 1236, which is longer than the specified 1000


Test vectorstore created and saved successfully!
Full vectorstore created and saved successfully!
