In [1]:
# Import required libraries
from dotenv import load_dotenv, find_dotenv
from langchain_community.document_loaders import JSONLoader
import json
from bs4 import BeautifulSoup
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

In [2]:
# Load environment variables from a local .env file.
# OpenAI is used during embedding, so your OpenAI API key must be saved in that .env file
# (kept in the same directory as this notebook).
_ = load_dotenv(dotenv_path=find_dotenv())  # bound to `_` so the cell displays no output

In [4]:
def load_data(json_file_path):
    """
    Read a JSON file from disk and return its parsed contents.

    Args:
        json_file_path (str): Location of the JSON file holding the blog posts.

    Returns:
        dict: The parsed JSON data.
    """
    # The file is read as UTF-8 and closed as soon as parsing finishes.
    with open(json_file_path, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [5]:
def clean_html(content):
    """
    Remove HTML tags from the content using BeautifulSoup.

    Args:
        content (str | None): The original HTML content. ``None`` or an empty
            string is treated as "no content".

    Returns:
        str: The content with HTML tags removed; the extracted text pieces are
            joined with a single space. Returns "" when there is no content.
    """
    # Guard against posts whose content is missing or empty: BeautifulSoup
    # expects markup text and fails when handed None, which would abort the
    # whole cleaning pass over one bad record.
    if not content:
        return ""
    soup = BeautifulSoup(content, "html.parser")
    return soup.get_text(separator=' ')

In [6]:
def metadata_func(record, metadata):
    """
    Copy selected post fields from a record into its metadata.

    Args:
        record (dict): The source record for one post.
        metadata (dict): Metadata dictionary to extend; it is updated in place.

    Returns:
        dict: The same metadata dictionary, now carrying "title",
            "published_date" and "url" (None for any field the record lacks).
    """
    for field in ("title", "published_date", "url"):
        metadata[field] = record.get(field)
    return metadata

In [7]:
# Point at the posts JSON file and read it into memory
json_file_path = "all_posts.json"  # Placeholder: replace with path/to/your/all_posts.json
published_posts = load_data(json_file_path=json_file_path)

In [8]:
# Ensure every post has non-empty categories and tags.
# dict.get(key, default) only falls back when the key is absent, so an
# explicit empty list (or null) would slip through unchanged; `or` also
# replaces those falsy values with the ["None"] placeholder.
for post in published_posts["posts"]:
    post["categories"] = post.get("categories") or ["None"]
    post["tags"] = post.get("tags") or ["None"]

In [9]:
# Strip HTML markup from every post and store the result under "cleaned_content"
for post in published_posts["posts"]:
    post.update(cleaned_content=clean_html(post["content"]))

In [10]:
# Sanity check: show the first post's title and the first 100 characters of its cleaned text
sample_post = published_posts["posts"][0]
print(
    "Sample post after cleaning:",
    f"Title: {sample_post['title']}",
    f"Content: {sample_post['cleaned_content'][:100]}...",
    sep="\n",
)

Sample post after cleaning:
Title: Zion National Park guide for First-timer Families with kids
Content: 
 Nestled in the heart of Utah's canyon country, Zion National Park offers a stunning array of red r...


In [11]:
# Save the cleaned content back to the JSON file.
# Serialize to a string first: opening the file in "w" mode truncates it
# immediately, so a serialization error raised mid-dump would otherwise leave
# the source data file empty or half-written.
serialized_posts = json.dumps(published_posts, ensure_ascii=False, indent=4)
with open(json_file_path, "w", encoding="utf-8") as f:
    f.write(serialized_posts)
print("Cleaned content saved back to JSON file.")

Cleaned content saved back to JSON file.


In [12]:
# Configure a JSONLoader over the saved file: records are selected with the
# jq expression below, document text comes from "cleaned_content", and
# metadata_func adds the per-post metadata
loader = JSONLoader(
    file_path=json_file_path,
    content_key="cleaned_content",
    metadata_func=metadata_func,
    jq_schema=".posts[]",  # Adjust according to your JSON structure
)

In [14]:
# Run the loader and report how many documents it produced
documents = loader.load()
print("Loaded", len(documents), "documents.")

Loaded 6 documents.


In [15]:
# Build the token-based splitter: 256 tokens per chunk, no overlap between chunks
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,  # Number of tokens per chunk
    chunk_overlap=0,  # Overlap between chunks
)

In [16]:
# Break every loaded document into token-based chunks and report the total
all_splits = token_splitter.split_documents(documents=documents)
print("Total document splits:", len(all_splits))

Total document splits: 30


In [17]:
# Spot-check the metadata attached to one chunk (fixed index 10, not a random pick;
# raises IndexError if fewer than 11 chunks were produced)
all_splits[10].metadata

{'source': '/Users/chandler/Documents/chatbot_public/all_posts.json',
 'seq_num': 3,
 'title': 'What I am (still) grateful for after 2 years in the US',
 'published_date': 'Thu, 14 Dec 2023 03:30:00 +0000',
 'url': 'https://www.chandlernguyen.com/blog/2023/12/13/what-i-am-still-grateful-for-after-2-years-in-the-us/'}

In [None]:
# Create the OpenAI embedding client, then build a FAISS vector store from all chunks
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(documents=all_splits, embedding=embeddings)

In [None]:
# Persist the vector store to local disk.
# The path below is a placeholder for the save location; change it to your preference.
db.save_local(folder_path="path/to/save/faiss_index")