In [2]:
import re
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def fetch_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The response text, or ``None`` if the request failed or the server
        answered with an HTTP error status. The error is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the error path.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None
def extract_author_and_date(input_str):
    """Pull the author name and the "Updated" date out of a byline string.

    The author is the text between "By" and an en dash; the date is the text
    between "Updated" and a following " at".

    Args:
        input_str: Byline text to search.

    Returns:
        ``(author, date)``. Each element is the stripped matched text, or
        ``None`` when that part is missing or empty.
    """
    # Names may contain periods, apostrophes and hyphens ("John Smith, Ph.D.",
    # "Sean O'Reilly"), so those are accepted in addition to word characters,
    # whitespace and commas. The en dash is not part of the class, so the
    # capture still ends at the separator.
    author_pattern = r"By\s*([\w\s,.'’\-]*)\s*–"
    date_pattern = r'Updated\s*([\w\s\d,]*)\sat'

    author_match = re.search(author_pattern, input_str)
    date_match = re.search(date_pattern, input_str)

    # Both captures use `*`, so a match can be empty (e.g. "By –"); report
    # that as None rather than as an empty string.
    author = (author_match.group(1).strip() or None) if author_match else None
    date = (date_match.group(1).strip() or None) if date_match else None

    return author, date
    
def extract_articles_and_content(html_content):
    """Scrape the article list on the landing page and every linked article.

    Args:
        html_content: HTML of the landing page.

    Returns:
        A DataFrame with one row per linked article and the columns
        'Article Title', 'Article Link', 'Article Content', 'Article Anthor'
        and 'Article Date', or ``None`` when the article container is not
        present in ``html_content``. Parts that cannot be located on an
        article page (or the whole page, if its download failed) hold
        "Content not found"; author/date are ``None`` when the byline exists
        but does not match the expected pattern.
    """
    base_url = "https://www.fool.com"
    placeholder = "Content not found"

    soup = BeautifulSoup(html_content, 'html.parser')

    # Container that holds the list of article links.
    article_container = soup.find('div', {'class': 'grid grid-cols-2 md:grid-cols-4 lg:grid-cols-1 text-md gap-16px'})
    if article_container is None:
        print("Article container not found")
        return None

    # One entry per article; all five lists must stay the same length because
    # they become the DataFrame columns.
    article_titles = []
    article_links = []
    article_contents = []
    article_authors = []
    article_dates = []

    # Collect title and link from every div that contains a title anchor.
    for article_div in article_container.find_all('div'):
        title_tag = article_div.find('a', {'class': 'font-medium text-gray-1100'})
        href = title_tag.get('href') if title_tag else None
        if not href:
            # No title anchor, or an anchor without a target: nothing to fetch.
            continue
        article_titles.append(title_tag.text)
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute URLs untouched.
        article_links.append(urljoin(base_url, href))

    for link in article_links:
        page_html = fetch_page(link)
        if page_html is None:
            # The download failed (fetch_page reported it); fill the row with
            # placeholders instead of handing None to BeautifulSoup.
            article_contents.append(placeholder)
            article_authors.append(placeholder)
            article_dates.append(placeholder)
            continue

        article_soup = BeautifulSoup(page_html, 'html.parser')

        content_div = article_soup.find('div', {'class': 'max-w-full w-full mx-auto tailwind-article-body'})
        article_contents.append(content_div.text.strip() if content_div else placeholder)

        byline_div = article_soup.find('div', {'class': 'font-medium text-gray-800 text-md md:text-lg mb-16px mt-20px md:my-24px'})
        if byline_div:
            author, date = extract_author_and_date(byline_div.text.strip())
        else:
            author, date = placeholder, placeholder
        article_authors.append(author)
        article_dates.append(date)

    # NOTE(review): the 'Article Anthor' column name is misspelled but is kept
    # because callers and exported data refer to it.
    df = pd.DataFrame({
        'Article Title': article_titles,
        'Article Link': article_links,
        'Article Content': article_contents,
        'Article Anthor': article_authors,
        'Article Date': article_dates
    })

    return df


# Landing page whose article list is scraped.
url = 'https://www.fool.com/'

# fetch_page returns None when the request fails.
html_content = fetch_page(url)

# Stop here on failure. Skipping the assignment would leave `df` unbound (or
# holding a stale frame from an earlier run), and the cells that use `df`
# would then fail with a less helpful error.
if not html_content:
    raise RuntimeError(f"Could not fetch {url}")

# extract_articles_and_content returns None when the article container is
# missing from the page.
df = extract_articles_and_content(html_content)
if df is None:
    raise RuntimeError(f"No article container found on {url}")


In [3]:
# Tag every row of the first scrape with its section label and source site.
# NOTE(review): 'Popular Artical' looks like a misspelling of "Article"; it is
# left untouched because the value ends up in the exported data.
df = df.assign(**{'Label': 'Popular Artical', 'Article Source': 'Motley'})

In [4]:
def extract_author_and_date2(given_str):
    """Return the date text after the last "Updated " marker, months spelled out.

    Despite the name, only the date is handled here; no author is extracted
    and the result is a plain string, not a parsed date.

    Args:
        given_str: Text containing the date, optionally preceded by
            "Updated ". When the marker is absent the whole string is used.

    Returns:
        The stripped date text with abbreviated month names ("Jan.", "Sept.",
        ...) replaced by their full names.
    """
    date_str = given_str.split("Updated ")[-1].strip()

    # "May" has no abbreviated form, so it needs no entry. Both "Sept." and
    # "Sep." are accepted for September.
    month_names = (
        ("Jan.", "January"),
        ("Feb.", "February"),
        ("Mar.", "March"),
        ("Apr.", "April"),
        ("Jun.", "June"),
        ("Jul.", "July"),
        ("Aug.", "August"),
        ("Sept.", "September"),
        ("Sep.", "September"),
        ("Oct.", "October"),
        ("Nov.", "November"),
        ("Dec.", "December"),
    )
    for abbreviation, full_name in month_names:
        date_str = date_str.replace(abbreviation, full_name)

    return date_str
    
def extract_articles_and_content2(html_content):
    """Scrape every page linked from the fourth 'flex dropdown' navigation menu.

    Args:
        html_content: HTML of the landing page.

    Returns:
        A DataFrame with one row per linked page and the columns
        'Article Title', 'Article Link', 'Article Content', 'Article Anthor'
        and 'Article Date', or ``None`` when the navigation container cannot
        be found. Parts that cannot be located on a page (or the whole page,
        if its download failed) hold "Content not found".
    """
    base_url = "https://www.fool.com/"
    placeholder = "Content not found"
    # Wrapper classes tried in this order; the text of the first one present
    # on the page becomes the article content.
    content_classes = (
        'creditcard_page',
        'banking_page',
        'mortgage_page row',
        'banking_landing_page row',
        'loans_page row',
        'row',
    )

    soup = BeautifulSoup(html_content, 'html.parser')

    dropdowns = soup.find_all('div', {'class': 'flex dropdown'})
    # Index 3 is read below, so at least four dropdowns must exist.
    # NOTE(review): presumably the fourth dropdown is the personal-finance
    # menu — confirm against the live markup.
    if len(dropdowns) < 4:
        print("Article container not found")
        return None

    article_container_2 = dropdowns[3].find('div', {'class': 'flex justify-between topnav-wrapper p-32px xl:px-0px'})
    if article_container_2 is None:
        print("Article container not found")
        return None

    # One entry per link; all five lists must stay the same length because
    # they become the DataFrame columns.
    article_titles = []
    article_links = []
    article_contents = []
    article_authors = []
    article_dates = []

    for anchor in article_container_2.find_all('a'):
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be fetched; skip it entirely
            # so no column gets an entry for it.
            continue

        # Resolve relative hrefs against the site root; absolute URLs pass
        # through unchanged.
        link = urljoin(base_url, href)
        article_links.append(link)

        page_html = fetch_page(link)
        if page_html is None:
            # The download failed (fetch_page reported it); fill the row with
            # placeholders instead of handing None to BeautifulSoup.
            article_titles.append(placeholder)
            article_contents.append(placeholder)
            article_authors.append(placeholder)
            article_dates.append(placeholder)
            continue

        article_soup = BeautifulSoup(page_html, 'html.parser')

        title_div = article_soup.find('div', {'class': 'block pg-title-item px-0'})
        article_titles.append(title_div.text.strip() if title_div else placeholder)

        # Look each wrapper up once and stop at the first hit.
        content_div = None
        for class_name in content_classes:
            content_div = article_soup.find('div', {'class': class_name})
            if content_div is not None:
                break
        article_contents.append(content_div.text.strip() if content_div is not None else placeholder)

        author_div = article_soup.find('div', {'class': 'col-12 col-md author-bio-author'})
        article_authors.append(author_div.text.strip() if author_div else placeholder)

        date_div = article_soup.find('div', {'class': 'col-12 fs-12'})
        if date_div:
            article_dates.append(extract_author_and_date2(date_div.text.strip()))
        else:
            article_dates.append(placeholder)

    # NOTE(review): the 'Article Anthor' column name is misspelled but is kept
    # because callers and exported data refer to it.
    df = pd.DataFrame({
        'Article Title': article_titles,
        'Article Link': article_links,
        'Article Content': article_contents,
        'Article Anthor': article_authors,
        'Article Date': article_dates
    })

    return df


# Landing page whose navigation menu is scraped.
url = 'https://www.fool.com/'

# fetch_page returns None when the request fails.
html_content = fetch_page(url)

# Stop here on failure. Skipping the assignment would leave `df2` unbound (or
# holding a stale frame from an earlier run), and the cells that use `df2`
# would then fail with a less helpful error.
if not html_content:
    raise RuntimeError(f"Could not fetch {url}")

# extract_articles_and_content2 returns None when the navigation container is
# missing from the page.
df2 = extract_articles_and_content2(html_content)
if df2 is None:
    raise RuntimeError(f"No navigation container found on {url}")


In [5]:
# Tag every row of the second scrape with its section label and source site.
# NOTE(review): 'Perspnal Finance' looks like a misspelling of "Personal"; it
# is left untouched because the value ends up in the exported data.
df2 = df2.assign(**{'Label': 'Perspnal Finance', 'Article Source': 'Motley'})

In [6]:
# Stack both scrapes into a single frame with a fresh 0..n-1 index.
df_all = pd.concat([df, df2], ignore_index=True)

In [7]:
# Show the combined frame as this cell's output.
df_all

Unnamed: 0,Article Title,Article Link,Article Content,Article Anthor,Article Date,Label,Article Source
0,Best Stocks to Buy In 2023,https://www.fool.com/investing/top-stocks-to-buy/,There are literally thousands of publicly trad...,"Matthew Frankel, CFP","Sep 11, 2023",Popular Artical,Motley
1,Top AI Stocks to Buy,https://www.fool.com/investing/stock-market/ma...,Computers excel at crunching numbers but not a...,Jeremy Bowman,"Sep 18, 2023",Popular Artical,Motley
2,How to Invest in OpenAI & ChatGPT,https://www.fool.com/investing/stock-market/ma...,"In just a few months, ChatGPT, the disruptive ...",Jeremy Bowman,"Sep 18, 2023",Popular Artical,Motley
3,6 Steps to Learn How to Buy Stocks,https://www.fool.com/investing/how-to-invest/s...,So you've decided to invest in the stock marke...,"Matthew Frankel, CFP","Jul 19, 2023",Popular Artical,Motley
4,Top Marijuana Stocks to Invest In,https://www.fool.com/investing/stock-market/ma...,The marijuana industry is expected to expand a...,Keith Speights,"Jul 17, 2023",Popular Artical,Motley
5,Best ETFs to Buy,https://www.fool.com/investing/how-to-invest/e...,Exchange-traded funds offer investors an appea...,Matthew DiLallo,"Aug 22, 2023",Popular Artical,Motley
6,Top 3 S&P 500 Index Funds,https://www.fool.com/investing/how-to-invest/i...,S&P 500 index funds are passive investments al...,Matthew DiLallo,"Aug 23, 2023",Popular Artical,Motley
7,5 Top Hydrogen Stocks to Watch,https://www.fool.com/investing/stock-market/ma...,Hydrogen is an energy source that emits no gre...,Matthew DiLallo,"Sep 20, 2023",Popular Artical,Motley
8,High-Yield Dividend Stocks to Buy Right Now,https://www.fool.com/investing/stock-market/ty...,There's no official definition of a high-yield...,Matthew DiLallo,"May 30, 2023",Popular Artical,Motley
9,Dictionary of Financial Terms,https://www.fool.com/terms/,Content not found,Content not found,Content not found,Popular Artical,Motley


In [8]:
import json
def save_to_json(df, filename):
    """Write ``df`` to ``filename`` with one JSON record per line.

    Args:
        df: DataFrame to export.
        filename: Destination path handed to ``DataFrame.to_json``.

    Prints a confirmation message once the file has been written.
    """
    # orient='records' with lines=True produces one JSON object per row.
    export_options = {'orient': 'records', 'lines': True}
    df.to_json(filename, **export_options)
    print(f"Data saved to {filename}")

In [9]:
# Export the combined scrape; the relative path resolves against the current
# working directory.
save_to_json(df_all, 'Motley_updated.json')

Data saved to Motley_updated.json
