In [55]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

def fetch_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None
    
def extract_articles_and_content(html_content):
    # Initialize BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find the container that holds all articles
    articles = soup.find('div', {'class': 'tdv2-applet-stream Bdc(#e2e2e6) Pos(r) Z(1)'})
    articles_all = articles.find_all('div', {'class': 'Py(14px) Pos(r)'})
    if articles is None:
        print("Article container not found")
        return None
    
    # Initialize lists to hold scraped data
    article_titles = []
    article_links = []
    article_contents = []
    article_anthor = []
    article_date = [] # This will hold the article contents
    
    # Loop through each article div to get title and href
    for article_div in articles_all:
        title_tag = article_div.find('a', {'class': 'Fw(b) Fz(18px) Lh(23px) LineClamp(2,46px) Fz(17px)--sm1024 Lh(19px)--sm1024 LineClamp(2,38px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'})
        if title_tag:
            article_titles.append(title_tag.text)
            article_links.append(title_tag['href'])
    
    # Create a DataFrame
    df = pd.DataFrame({
        'Article Title': article_titles,
        'Article Link': article_links,
    })
    
    return df

In [56]:
driver = webdriver.Chrome()

# Open the URL
driver.get("https://finance.yahoo.com/topic/personal-finance/")

# Scroll down multiple times
time.sleep(5)

# Scroll down multiple times
try:
    body = driver.find_element(By.TAG_NAME, 'body')  # Locate the body element
    for i in range(7):
        body.send_keys(Keys.END)
        time.sleep(2)  # Wait for content to load
except Exception as e:
    print("Error while scrolling:", e)
    driver.quit()
    exit()
    
# Get page source
page_source = driver.page_source

# Your scraping logic here
df = extract_articles_and_content(page_source)
print(df)
# Close the WebDriver
driver.quit()


                                        Article Title  \
0              How many credit cards should you have?   
1                   Is accident forgiveness worth it?   
2   Powerball jackpot just hit $1.2 billion. Here'...   
3           How does pay-per-mile car insurance work?   
4   What is high-risk car insurance and who needs it?   
..                                                ...   
95                           What is a personal loan?   
96             How to open an online checking account   
97  How to consolidate credit card debt with a per...   
98  Money market account vs. CD: Which is the best...   
99    What assigned risk means for your car insurance   

                                         Article Link  
0   https://finance.yahoo.com/personal-finance/how...  
1   https://finance.yahoo.com/personal-finance/acc...  
2   https://finance.yahoo.com/personal-finance/pow...  
3   https://finance.yahoo.com/personal-finance/pay...  
4   https://finance.yahoo.com/perso

In [57]:
from datetime import datetime
import re
def extract_and_convert_to_datetime(s):
    # Extract the date part using regex
    match = re.search(r'([a-zA-Z]{3}, [a-zA-Z]{3} \d{1,2}, \d{4})', s)
    if match:
        date_str = match.group(1)
        # Convert the date string to a datetime object
        return date_str
    return None

In [58]:
print(extract_and_convert_to_datetime('Updated Fri, Sep 1, 20238 min read'))
print(extract_and_convert_to_datetime('Updated Fri, Sep 29, 20238 min read'))

Fri, Sep 1, 2023
Fri, Sep 29, 2023


In [59]:
def extract_content(df):
    article_contents = []
    article_anthor = []
    article_date = [] 
    for link in df['Article Link']:

        html_content = fetch_page(link)
        article_soup = BeautifulSoup(html_content, 'html.parser')

        article_content_div = article_soup.find('div', {'class': 'caas-body'})
        if article_content_div:
            article_contents.append(article_content_div.text.strip())
        else:
            article_contents.append("Content not found")

        article_anthor_div = article_soup.find('div', {'class': 'caas-attr-item-author'})
        if article_anthor_div:
            article_anthor.append(article_anthor_div.text.strip())
        else:
            article_anthor.append("Content not found")

        article_date_div = article_soup.find('div', {'class': 'caas-attr-time-style'})
        if article_date_div:
            article_date.append(article_date_div.text.strip())
        else:
            article_date.append("Content not found")

    df = pd.DataFrame({
        'Article Content': article_contents,
        'Article Anthor': article_anthor,
        'Article Date': [extract_and_convert_to_datetime(s) for s in article_date],
    })
    
    return df

In [60]:
df_content = extract_content(df[:72])
print(df_content)

                                      Article Content  \
0   The offers on this page are from advertisers w...   
1   Lyudinka via Getty ImagesAccident forgiveness ...   
2   Oleg Blokhin via Getty ImagesAfter Saturday ni...   
3   Lyudinka via Getty ImagesIf you drive only occ...   
4   S-S-S via Getty ImagesIf you’re labeled a high...   
..                                                ...   
67  A deductible makes car insurance premiums chea...   
68  Term life insurance is a type of life insuranc...   
69  Standard car insurance typically does not cove...   
70  Where you keep your money is just as important...   
71  concept of financial wealth, investment succes...   

              Article Anthor      Article Date  
0                 Ivana Pino  Tue, Oct 3, 2023  
1        Stephanie Colestock  Tue, Oct 3, 2023  
2                  Kaz Weida  Tue, Oct 3, 2023  
3               Jess Ullrich  Mon, Oct 2, 2023  
4        Stephanie Colestock  Mon, Oct 2, 2023  
..                   

In [61]:
access_df = df[:72]

In [62]:
access_df['Article Content'] =df_content['Article Content'] 
access_df['Article Anthor'] =df_content['Article Anthor'] 
access_df['Article Date'] =df_content['Article Date'] 
access_df['Article Source'] = 'YahooFinance'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  access_df['Article Content'] =df_content['Article Content']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  access_df['Article Anthor'] =df_content['Article Anthor']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  access_df['Article Date'] =df_content['Article Date']
A value is trying to be set on a 

In [63]:
access_df

Unnamed: 0,Article Title,Article Link,Article Content,Article Anthor,Article Date,Article Source
0,How many credit cards should you have?,https://finance.yahoo.com/personal-finance/how...,The offers on this page are from advertisers w...,Ivana Pino,"Tue, Oct 3, 2023",YahooFinance
1,Is accident forgiveness worth it?,https://finance.yahoo.com/personal-finance/acc...,Lyudinka via Getty ImagesAccident forgiveness ...,Stephanie Colestock,"Tue, Oct 3, 2023",YahooFinance
2,Powerball jackpot just hit $1.2 billion. Here'...,https://finance.yahoo.com/personal-finance/pow...,Oleg Blokhin via Getty ImagesAfter Saturday ni...,Kaz Weida,"Tue, Oct 3, 2023",YahooFinance
3,How does pay-per-mile car insurance work?,https://finance.yahoo.com/personal-finance/pay...,Lyudinka via Getty ImagesIf you drive only occ...,Jess Ullrich,"Mon, Oct 2, 2023",YahooFinance
4,What is high-risk car insurance and who needs it?,https://finance.yahoo.com/personal-finance/hig...,S-S-S via Getty ImagesIf you’re labeled a high...,Stephanie Colestock,"Mon, Oct 2, 2023",YahooFinance
...,...,...,...,...,...,...
67,What you need to know about car insurance dedu...,https://finance.yahoo.com/personal-finance/car...,A deductible makes car insurance premiums chea...,Michelle Lambright Black,"Thu, Sep 7, 2023",YahooFinance
68,What is term life insurance?,https://finance.yahoo.com/personal-finance/wha...,Term life insurance is a type of life insuranc...,"Robin Hartill, CFP®","Thu, Sep 7, 2023",YahooFinance
69,Custom parts and equipment insurance: what it ...,https://finance.yahoo.com/personal-finance/cus...,Standard car insurance typically does not cove...,Karen Aho,"Fri, Sep 8, 2023",YahooFinance
70,How many checking accounts should I have?,https://finance.yahoo.com/personal-finance/how...,Where you keep your money is just as important...,Ivana Pino,"Wed, Sep 6, 2023",YahooFinance


In [64]:
import json
def save_to_json(df, filename):
    df.to_json(filename, orient='records', lines=True)
    print(f"Data saved to {filename}")

In [65]:
save_to_json(access_df, 'YahooFinance.json')

Data saved to YahooFinance.json
