In [None]:
pip install requests beautifulsoup4 pandas

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Helper function to clean text
def clean_text(text):
    """Normalize scraped text before it is stored in the advice corpus.

    Every character that is not a word character, whitespace, a period or
    a comma is removed; each run of whitespace is then collapsed to a
    single space and the ends are trimmed.

    Args:
        text: Raw text, e.g. the result of ``tag.get_text()``.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    # Strip symbols first: deleting a symbol that sits between two spaces
    # ("save & invest") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,]', '', text)
    # Collapse whitespace last so the result never contains runs of spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Function to scrape financial advice from a given URL
def scrape_url(url, timeout=10):
    """Fetch a page and return the cleaned text of its <p> and <li> tags.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block indefinitely on a stalled host.

    Returns:
        A list of cleaned text snippets longer than 50 characters, in
        document order. An empty list is returned when the request fails
        or the server answers with a status other than 200.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) are
        # reported like a bad status so one dead URL does not abort the
        # caller's loop over the remaining URLs.
        print(f"Failed to retrieve: {url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve: {url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Paragraphs and list items are where the advice text is expected.
    advice = []
    for tag in soup.find_all(['p', 'li']):
        text = clean_text(tag.get_text())
        # Snippets of 50 characters or fewer are skipped as too short to
        # be useful advice.
        if len(text) > 50:
            advice.append(text)

    return advice

# List of URLs to scrape financial advice from
urls = [
    # "https://www.reddit.com/r/personalfinance/wiki/index/#wiki_please_read_this_basic_financial_advice_everyone_should_follow",
    "https://www.investor.gov/free-financial-planning-tools",
    "https://www.investopedia.com/personal-finance-4427760",
    # "https://www.nerdwallet.com/article/investing/how-to-invest",
    "https://www.thebalance.com/investing-4074025",
    "https://www.fidelity.com/learning-center/investment-products/stocks/basics-of-stock-investing"
]

# Collects the advice snippets from every page, in scrape order
all_advice = []

# Scrape each URL; scrape_url returns an empty list for a page it could not
# retrieve, so failed pages simply contribute nothing
for url in urls:
    print(f"Scraping: {url}")
    all_advice.extend(scrape_url(url))

# Save the scraped advice to a text file for RAG use. The encoding is pinned
# to UTF-8 because the cleaned text can still contain non-ASCII letters, which
# a platform-default encoding (e.g. cp1252 on Windows) may fail to encode.
with open('personal_finance_advice_corpus.txt', 'w', encoding='utf-8') as f:
    for advice in all_advice:
        f.write(advice + "\n\n")  # Blank line between pieces of advice

print(f"Scraping completed. {len(all_advice)} pieces of advice saved to 'personal_finance_advice_corpus.txt'.")

