In [9]:
import os
from dotenv import load_dotenv
# Load settings from a local .env file, if one is present.
# NOTE(review): the recorded output of this cell is False, which suggests that
# no .env file was found (or that it set nothing) — confirm whether one is expected.
load_dotenv()

False

## Preparing the data for processing

### Pipeline 1: Collecting and preparing the documents

In [10]:
# Topic label used as the prefix of this run's file names
graph_name = "Marketing"

# Text file that lists the source URLs, one per line
ufilename = f"{graph_name}_urls.txt"

import requests
from bs4 import BeautifulSoup
import re

# Read the URL list, one URL per line. Blank lines are skipped so they do not
# become empty URLs (and an empty output file name) later in the pipeline.
# The encoding is explicit so the result does not depend on the platform default.
with open(ufilename, 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

print("Read URLs:")
for url in urls:
    print(url)

Read URLs:
https://en.wikipedia.org/wiki/Marketing
https://en.wikipedia.org/wiki/24-hour_news_cycle
https://en.wikipedia.org/wiki/Account-based_marketing
https://en.wikipedia.org/wiki/Activism
https://en.wikipedia.org/wiki/Adam_Smith
https://en.wikipedia.org/wiki/Adam_Smith_Institute
https://en.wikipedia.org/wiki/Advertising
https://en.wikipedia.org/wiki/Advertising_agency
https://en.wikipedia.org/wiki/Advertising_mail
https://en.wikipedia.org/wiki/Advertising_management
https://en.wikipedia.org/wiki/Advertising_slogan
https://en.wikipedia.org/wiki/Advocacy
https://en.wikipedia.org/wiki/Advocacy_group
https://en.wikipedia.org/wiki/Affinity_marketing
https://en.wikipedia.org/wiki/Agenda-setting_theory
https://en.wikipedia.org/wiki/Agile_marketing
https://en.wikipedia.org/wiki/Agricultural_Marketing_Service
https://en.wikipedia.org/wiki/Agricultural_marketing
https://en.wikipedia.org/wiki/Airborne_leaflet_propaganda
https://en.wikipedia.org/wiki/Alternative_facts
https://en.wikipedia.org

In [11]:
def clean_text(content):
    """Strip citation markers and most punctuation from ``content``.

    Two regex passes run in order: bracketed numeric markers such as ``[12]``
    are removed first, then every character that is not a word character,
    whitespace, or a period.

    Args:
        content: Text to clean.

    Returns:
        The cleaned text. Whitespace is never collapsed, so a removed token
        can leave consecutive spaces behind.
    """
    removal_patterns = (
        r'\[\d+\]',     # numeric reference markers, e.g. [3]
        r'[^\w\s\.]',   # punctuation other than periods
    )
    for pattern in removal_patterns:
        content = re.sub(pattern, '', content)
    return content

def safe_file_name(s):
    """Turn ``s`` into a string that is safe to use as a file name.

    Spaces become underscores; after that only letters, digits, periods,
    underscores and hyphens are kept. Anything else is dropped silently, so a
    percent-encoded name keeps its hex digits but loses the ``%``
    (``April_Fools%27_Day`` -> ``April_Fools27_Day``).

    Args:
        s: Raw name, e.g. the last path segment of a URL.

    Returns:
        The sanitised name; it can be empty if no character survives.
    """
    extra_allowed = (' ', '.', '_', '-')
    underscored = s.replace(' ', '_')

    kept_chars = []
    for ch in underscored:
        if ch.isalpha() or ch.isdigit() or ch in extra_allowed:
            kept_chars.append(ch)

    return ''.join(kept_chars)

def file_exists_and_has_content(file_path):
    """Return True when ``file_path`` is an existing, non-empty regular file.

    The check uses file metadata only, so it never decodes the content. A
    directory, a missing path, or an empty file all give False instead of
    raising.

    Args:
        file_path: Path to check.

    Returns:
        bool: True if the path is a regular file of at least one byte,
        otherwise False.
    """
    try:
        return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    except OSError:
        # The file disappeared or became inaccessible between the two checks.
        return False

def fetch_and_clean(url, timeout=30):
    """Download a page and return its main text, cleaned with ``clean_text``.

    The main content node is the ``mw-parser-output`` div, falling back to the
    element with id ``content``. For each trailing section (References,
    Bibliography, External links, See also, Notes) the heading and everything
    that follows it at the same level is removed before text extraction.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the whole run.

    Returns:
        The cleaned text, or None if the request failed or no content node
        was found.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.content, 'html.parser')

        # Prioritise "mw-parser-output" but fall back to "content" node if not found
        content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None

        # The section id sits on a <span> in older MediaWiki markup and on the
        # heading element itself in newer markup, so both are accepted.
        heading_tags = ['span', 'h2', 'h3', 'h4', 'h5', 'h6']
        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            # HTML ids cannot contain spaces; MediaWiki anchors use underscores.
            section_id = section_title.replace(' ', '_')
            section = content.find(heading_tags, id=section_id)
            while section:
                # Cut from the heading's wrapper element. If the heading is a
                # direct child of the content node, cut from the heading itself
                # so the content node is never destroyed.
                anchor = section if section.parent is content else section.parent
                for sib in anchor.find_next_siblings():
                    sib.decompose()  # Remove everything after the heading
                anchor.decompose()  # Remove the heading itself
                section = content.find(heading_tags, id=section_id)

        # Extract and clean text
        text = content.get_text(separator=' ', strip=True)  # Use space as separator and strip whitespace
        text = clean_text(text)
        return text
    except requests.RequestException as e:
        # Timeouts, connection errors and HTTP error statuses all land here.
        print(f"Error fetching {url}: {e}")
        return None  # Return None if there's an error
    
# Directory to store the output files
output_dir = './data/'
os.makedirs(output_dir, exist_ok=True)

# Download each article once and cache it as <output_dir>/<article>.txt.
# NOTE: `reload` only switches this download step on or off. An article whose
# file already exists with content is always skipped; delete the file to force
# a fresh download.
reload = True
if reload:
    for url in urls:
        # Use the last path segment as the article name. Trailing slashes are
        # dropped first so they do not produce an empty name.
        article_name = url.rstrip('/').split('/')[-1].replace('.html', '')
        safe_name = safe_file_name(article_name)
        if not safe_name:
            # Blank entry in the URL list, or a name made only of rejected
            # characters: there is nothing to name the file after.
            print(f"Skipped {url!r}: no usable article name")
            continue
        filename = os.path.join(output_dir, f"{safe_name}.txt")

        if file_exists_and_has_content(filename):
            print(f"Existed {filename}")
            continue

        clean_article_text = fetch_and_clean(url)
        if clean_article_text:  # Only write if text is not None or empty
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(clean_article_text)
            print(f"Saved {filename}")

Existed ./data/24-hour_news_cycle.txt
Existed ./data/Account-based_marketing.txt
Existed ./data/Activism.txt
Existed ./data/Adam_Smith_Institute.txt
Existed ./data/Advertising_mail.txt
Existed ./data/Advertising_management.txt
Existed ./data/Advertising_slogan.txt
Existed ./data/Advocacy.txt
Existed ./data/Advocacy_group.txt
Existed ./data/Affinity_marketing.txt
Existed ./data/Agenda-setting_theory.txt
Existed ./data/Agile_marketing.txt
Existed ./data/Agricultural_Marketing_Service.txt
Existed ./data/Agricultural_marketing.txt
Existed ./data/Airborne_leaflet_propaganda.txt
Existed ./data/Alternative_facts.txt
Existed ./data/Alternative_media.txt
Existed ./data/Ambush_marketing.txt
Existed ./data/American_Marketing_Association.txt
Existed ./data/American_business_history.txt
Existed ./data/Annoyance_factor.txt
Existed ./data/Anthropology.txt
Existed ./data/Applied_ethics.txt
Existed ./data/April_Fools27_Day.txt
Existed ./data/Art_director.txt
Existed ./data/Astroturfing.txt
Existed ./da

In [12]:
from llama_index.core import SimpleDirectoryReader

# Read the files under ./data into llama_index documents
reader = SimpleDirectoryReader("./data")
documents = reader.load_data()

# Show the first document as a quick sanity check
print(documents[0])

Doc ID: 6f107f73-6ef5-4ee5-a437-c08f60e24dc8
Text: Investigation and reporting of news concomitant with fastpaced
lifestyles This article is about the fastpaced cycle of news media in
technologically advanced societies. For the longerterm cycle of news
and information see information cycle . Several simultaneous NBC News
broadcasts including MSNBC  NBC s Today and CNBC s Squawk Box
displayed on...


## Pipeline 2: Creating and populating the Deep Lake vector store

In [13]:
# Setup embedding model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Candidate embedding models:
#   https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2  (used here)
#   https://huggingface.co/BAAI/bge-small-en-v1.5
embedding_model_name = "sentence-transformers/all-MiniLM-L12-v2"
embed_model = HuggingFaceEmbedding(
    model_name=embedding_model_name,
    # NOTE(review): the previous comment gave the library default as 100 —
    # confirm against the installed llama_index version.
    embed_batch_size=5,
)

In [14]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex, Settings

import traceback

# Use the HuggingFace model as the embedding model in llama_index's global Settings
Settings.embed_model = embed_model

# Path for vector store and dataset
# db = "hub://honglin/marketing01"
db = "./dataset/marketing01"
vector_store_path = db
dataset_path = db

# Create an index over the documents
# Overwrites the existing dataset if True
ow = True
try:
    vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=ow)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
except Exception as e:
    # Best effort: report the failure and let the notebook continue. Names that
    # were not assigned before the failure (possibly `index`) stay undefined.
    print(f"An error occurred: {e}")
    print(f"Error type: {type(e)}")
    # Format the real traceback; printing e.__traceback__ shows only the
    # traceback object's repr.
    print(f"Error traceback:\n{traceback.format_exc()}")