In [1]:
import sys
import os
# Notebooks do not define __file__, so the project root is taken to be the
# parent of the current working directory (the folder the notebook runs in).
# This is the same directory the previous abspath('__file__') expression
# produced, just computed explicitly.
project_root = os.path.dirname(os.path.abspath(os.getcwd()))

# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.chunking import document_chunker

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Create a simple text-based chunker since we can't download models due to SSL issues
import os
import re

def simple_text_chunker(directory_path, chunk_size=256, overlap=0):
    """
    Split every ``.txt`` file in a directory into whitespace-delimited word chunks.

    No external model or tokenizer is needed: "words" are simply the result of
    ``str.split()`` and each chunk is those words re-joined with single spaces.

    Args:
        directory_path: Directory to scan (not recursive) for ``.txt`` files.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        dict mapping each file name to its list of chunk strings, with files
        in sorted name order. An empty file maps to an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero range step; a larger overlap
        # gives a negative step, which silently yields no chunks at all.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}"
        )

    step = chunk_size - overlap
    documents = {}

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            # Skip directories that merely happen to be named "*.txt".
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            words = file.read().split()

        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            if start + chunk_size >= len(words):
                # This chunk already reaches the end of the text; any further
                # window would only repeat words from its overlapping tail.
                break

        documents[filename] = chunks

    return documents

# Use the simple chunker instead of the model-based one and print a short summary.
docs = simple_text_chunker(directory_path='../data/text_data', chunk_size=50, overlap=10)
keys = list(docs.keys())
print(f"Number of documents processed: {len(docs)}")

# Guard the preview: indexing keys[0] would raise IndexError if the directory
# held no .txt files, and an empty first file produces an empty chunk list.
if keys:
    first_key = keys[0]
    first_chunks = docs[first_key]
    print(f"First document: {first_key}")
    print(f"Number of chunks in first document: {len(first_chunks)}")
    if first_chunks:
        # Only the first 200 characters are shown to keep the cell output short.
        print(f"First chunk preview: {first_chunks[0][:200]}...")
else:
    print("No .txt documents found; nothing to preview.")

Number of documents processed: 105
First document: ENT-5002.txt
Number of chunks in first document: 1
First chunk preview: Gaming console storage unit in sleek black, featuring designated compartments for systems, controllers, and games. Ventilated to prevent overheating. Manufactured by GameHub. Dimensions: 42"W x 16"D x...


In [None]:
# Alternative approach: Fix SSL certificates on macOS
# Run this in terminal to fix SSL certificate issues permanently:
# /Applications/Python\ 3.x/Install\ Certificates.command
# or
# pip install --upgrade certifi

# For a temporary fix in this session, you can also try:
import os
import requests
from urllib3.util import ssl_

# SECURITY WARNING: the monkey patch below forces cert_reqs=CERT_NONE on every
# call made through ssl_.ssl_wrap_socket, i.e. it requests TLS connections
# without server-certificate validation. Treat it as a throwaway diagnostic
# only; prefer fixing the CA certificates as described above.
# NOTE(review): whether urllib3/requests actually route through this module
# attribute (and honour cert_reqs when a prebuilt SSL context is supplied)
# depends on the installed urllib3 version — confirm before relying on it.
# To undo the patch: ssl_.ssl_wrap_socket = original_ssl_wrap_socket

# If this cell is re-run, ssl_.ssl_wrap_socket is already our wrapper. Recover
# the real function from it so wrappers never stack and the true original
# always stays restorable.
original_ssl_wrap_socket = getattr(ssl_.ssl_wrap_socket, '_unpatched', ssl_.ssl_wrap_socket)

def patched_ssl_wrap_socket(*args, **kwargs):
    """Delegate to the real ssl_wrap_socket, forcing cert_reqs to CERT_NONE."""
    kwargs['cert_reqs'] = ssl_.CERT_NONE
    return original_ssl_wrap_socket(*args, **kwargs)

# Remember the unpatched function on the wrapper so a later re-run can find it.
patched_ssl_wrap_socket._unpatched = original_ssl_wrap_socket
ssl_.ssl_wrap_socket = patched_ssl_wrap_socket

# Retry the model-based chunker now that the SSL patch is installed.
# Any failure is reported and the notebook carries on with the simple chunker.
try:
    print("Attempting to use original document_chunker with SSL patch...")
    docs_original = document_chunker(
        directory_path='../data/text_data',
        model_name='BAAI/bge-small-en-v1.5',
        chunk_size=256,
    )
    print("SUCCESS: Original chunker works with SSL patch!")
    print(f"Processed {len(docs_original)} documents")
except Exception as error:
    # Deliberately broad: this cell is a best-effort probe, not a hard requirement.
    print(f"Still having issues: {error}")
    print("The simple text chunker above is a good alternative for now.")