In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the environment.
# NOTE(review): presumably this supplies credentials needed by later cells
# (e.g. for the hub:// dataset) — confirm which variables are expected.
load_dotenv()

## Preparing the data for processing

### Pipeline 1: Collecting and preparing the documents

In [None]:
# Base name used for file management of this graph
graph_name = "Marketing"

# Name of the text file that lists the URLs to process
ufilename = f"{graph_name}_urls.txt"

import requests
from bs4 import BeautifulSoup
import re

# Read the list of URLs, one per line. Blank lines (for example a trailing
# newline at the end of the file) are skipped so that an empty string is
# never treated as a URL further down the pipeline.
with open(ufilename, 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

print("Read URLs:")
for url in urls:
    print(url)

In [None]:
def clean_text(content):
    """Strip numeric reference markers and punctuation from extracted text.

    Args:
        content: Text to clean.

    Returns:
        The text with bracketed numeric references (``[12]``) removed and
        every character that is not a word character, whitespace or a period
        removed. Whitespace is left as-is.
    """
    # Remove numeric references. Whitespace inside the brackets is tolerated
    # because text extracted with a separator can look like "[ 12 ]"; without
    # this, the next step would strip only the brackets and leave the
    # reference number behind in the text.
    content = re.sub(r'\[\s*\d+\s*\]', '', content)
    # Remove punctuation (except periods)
    content = re.sub(r'[^\w\s\.]', '', content)
    return content

def fetch_and_clean(url, timeout=30):
    """Download a page and return its main content as cleaned text.

    The content node is the ``div`` with class ``mw-parser-output``, falling
    back to the ``div`` with id ``content``. For each unwanted section title,
    the heading's parent element and every sibling that follows it are
    removed before the text is extracted and passed to ``clean_text``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the call indefinitely.

    Returns:
        The cleaned text, or ``None`` if the request failed (the error is
        printed) or no content node was found.
    """
    try:
        # Timeout errors are RequestException subclasses, so they are
        # reported by the handler below like any other fetch failure.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.content, 'html.parser')

        # Prioritise "mw-parser-output" but fall back to "content" node if not found
        content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None

        # Remove specific unwanted sections, including nested ones
        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            section = content.find('span', id=section_title)
            while section:
                # Everything that follows the heading at the same level is dropped
                for sib in section.parent.find_next_siblings():
                    sib.decompose()
                section.parent.decompose()  # Remove the heading itself
                # Look again in case the same id occurs more than once
                section = content.find('span', id=section_title)

        # Extract and clean text
        text = content.get_text(separator=' ', strip=True)  # Use space as separator and strip whitespace
        text = clean_text(text)
        return text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None  # Return None if there's an error
    
# Directory to store the output files (one text file per article)
output_dir = './data/'
os.makedirs(output_dir, exist_ok=True)

# Processing the URLs and skipping invalid ones
reload = False  # Set to True to reprocess all URLs
if reload:
    for url in urls:
        # The last path segment becomes the file name. A trailing slash is
        # stripped first; otherwise the segment would be empty and the text
        # would be written to a bare ".txt" file.
        article_name = url.rstrip('/').split('/')[-1].replace('.html', '')
        filename = os.path.join(output_dir, f"{article_name}.txt")

        clean_article_text = fetch_and_clean(url)
        if clean_article_text:  # Skip failed fetches (None) and empty text
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(clean_article_text)
            # Report only after the file has been closed
            print(f"Saved {filename}")

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the documents found in the ./data directory
reader = SimpleDirectoryReader("./data")
documents = reader.load_data()

# Show the first loaded document as a quick sanity check
print(documents[0])

## Pipeline 2: Creating and populating the Deep Lake vector store

In [None]:
# Setup embedding model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Model used for embeddings:
# https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2
embed_model_kwargs = {
    "model_name": "sentence-transformers/all-MiniLM-L12-v2",
    # NOTE(review): batch size passed explicitly; confirm against the
    # library default before changing it.
    "embed_batch_size": 10,
}
embed_model = HuggingFaceEmbedding(**embed_model_kwargs)

In [None]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex, Settings

# Register the embedding model on the global llama-index settings
Settings.embed_model = embed_model

# One location is used for both the vector store and the dataset
db = "hub://honglin/marketing01"
vector_store_path = dataset_path = db

# When True, an existing dataset at dataset_path is overwritten
ow = True
vector_store = DeepLakeVectorStore(overwrite=ow, dataset_path=dataset_path)

# Create an index over the documents, backed by the vector store
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)