In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re

In [3]:
def fetch_page(url: str, timeout: float = 10) -> str:
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200, otherwise ``""``
        (non-200 status, or the request itself failed).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure and fall back to the same "" sentinel used for
        # non-200 responses, so one bad link does not abort a whole scrape.
        print(f"Request failed for {url}: {exc}")
        return ""
    if response.status_code != 200:
        return ""
    return response.text

def extract_article_info(article_div) -> dict:
    """Pull the title and absolute link out of one list-item element.

    Args:
        article_div: Element exposing ``find``; it is searched for an ``<a>``
            whose class attribute is ``'mdc-link mds-list-group__link'``.

    Returns:
        ``{'Article Title': ..., 'Article Link': ...}`` when a matching anchor
        with an ``href`` is found, otherwise an empty dict.
    """
    from urllib.parse import urljoin

    title_tag = article_div.find('a', {'class': 'mdc-link mds-list-group__link'})
    # An anchor without an href has no article to follow; treat it like a
    # missing anchor instead of raising KeyError.
    href = title_tag.get('href') if title_tag else None
    if not href:
        return {}

    return {
        'Article Title': title_tag.text,
        # urljoin resolves site-relative paths against the site root and
        # leaves already-absolute URLs untouched.
        'Article Link': urljoin("https://www.morningstar.com", href),
    }

def extract_content_and_author(soup: BeautifulSoup) -> dict:
    """Read the body text and the author/metadata text from a parsed article page.

    Args:
        soup: Parsed article page.

    Returns:
        Dict with keys ``'Article Content'`` and ``'Article Author'``. Each
        value is the stripped text of the matching ``<div>``, or a
        "... not found" placeholder when that ``<div>`` is absent.
    """
    body_node = soup.find('div', {'class': 'story__body mdc-story-body__mdc'})
    byline_node = soup.find('div', {'class': 'mdc-metadata__list__mdc'})

    if body_node:
        content_text = body_node.text.strip()
    else:
        content_text = "Content not found"

    if byline_node:
        author_text = byline_node.text.strip()
    else:
        author_text = "Author not found"

    return {
        'Article Content': content_text,
        'Article Author': author_text,
    }


In [4]:
def extract_articles_and_content(html_content: str, class_name) -> pd.DataFrame:
    """Scrape the link-list sections of a landing page into a DataFrame.

    Every ``<ul>`` whose class attribute equals ``class_name`` is located; the
    list items of the first two are turned into records, and each linked
    article page is fetched to add its content and author text.

    Args:
        html_content: HTML of the landing page.
        class_name: Class attribute value identifying the ``<ul>`` containers.

    Returns:
        A DataFrame with one row per article (title, link, content, author).
        Returns ``None`` after printing a message when no container matches.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    articles = soup.find_all('ul', {'class': class_name})

    if not articles:
        print("Article container not found")
        return None

    item_class = 'mdc-list-group__item mds-list-group__item mdc-list-group-content-module'
    article_list = []

    # Only the first two matching lists are scraped. Slicing (rather than
    # indexing [0] and [1]) also handles a page that has a single list.
    for article_group in articles[:2]:
        for article_div in article_group.find_all('li', {'class': item_class}):
            article_info = extract_article_info(article_div)
            if not article_info:
                continue
            # A failed fetch yields "", which parses to an empty document and
            # therefore produces the "... not found" placeholders.
            article_html = fetch_page(article_info['Article Link'])
            article_soup = BeautifulSoup(article_html, 'html.parser')
            article_info.update(extract_content_and_author(article_soup))
            article_list.append(article_info)

    return pd.DataFrame(article_list)


In [5]:
def extract_articles_and_content2(html_content, class_name):
    """Scrape a grid-style topic section of a landing page into a DataFrame.

    The ``<div>`` whose class attribute equals ``class_name`` is located, every
    ``mdc-grid-item__inner`` cell inside it is searched for a title link, and
    each linked article page is fetched to add its content and author text.

    Args:
        html_content: HTML of the landing page.
        class_name: Class attribute value identifying the grid container.

    Returns:
        A DataFrame with columns ``Article Title``, ``Article Link``,
        ``Article Content`` and ``Article Author``. Returns ``None`` after
        printing a message when the container is not found.
    """
    # Initialize BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the container that holds all articles
    articles = soup.find('div', {'class': class_name})
    # Guard before using the container: calling find_all on None would raise
    # AttributeError and the message below would never be printed.
    if articles is None:
        print("Article container not found")
        return None
    articles_all = articles.find_all('div', {'class': 'mdc-grid-item__inner'})

    # A grid cell's title link carries one of these two class attribute values.
    title_classes = (
        'mdc-link mdc-grid-item__title mdc-grid-item__title--link',
        'mdc-link mdc-grid-item__title mdc-grid-item__title--link mdc-grid-item__title--floated-supplemental-content',
    )

    # Loop through each article div to get title and href
    records = []
    for article_div in articles_all:
        for title_class in title_classes:
            title_tag = article_div.find('a', {'class': title_class})
            if title_tag:
                records.append({
                    'Article Title': title_tag.text,
                    'Article Link': "https://www.morningstar.com" + title_tag['href'],
                })

    # Fetch every linked article and add its content/author text. Reusing the
    # shared helper means a missing author is recorded as "Author not found".
    for record in records:
        article_soup = BeautifulSoup(fetch_page(record['Article Link']), 'html.parser')
        record.update(extract_content_and_author(article_soup))

    # Explicit columns keep the frame's shape stable even when no links were found.
    return pd.DataFrame(
        records,
        columns=['Article Title', 'Article Link', 'Article Content', 'Article Author'],
    )


In [6]:
url = 'https://www.morningstar.com/save-for-college'

# Fetch the HTML content of the page
html_content = fetch_page(url)

# Default to None so the names exist even when the fetch fails; the later
# pd.concat drops None entries.
df1 = df2 = None

# If fetching was successful, proceed to extract card info
if html_content:
    # Extract card info and convert it to a Pandas DataFrame
    df1 = extract_articles_and_content(html_content,'mdc-list-group mds-list-group save-for-college__links-group')
    # The extractors return None when their container is missing; only label real frames.
    if df1 is not None:
        df1['Label'] = 'save-for-college'
    df2 = extract_articles_and_content2(html_content,'mdc-topic-grid__inner mdc-topic-grid__inner--5')
    if df2 is not None:
        df2['Label'] = 'save-for-college'

In [7]:
url2 = 'https://www.morningstar.com/start-investing'

# Fetch the HTML content of the page
html_content2 = fetch_page(url2)

# Default to None so the names exist even when the fetch fails; the later
# pd.concat drops None entries.
df3 = df4 = None

# If fetching was successful, proceed to extract card info
if html_content2:
    # Extract card info and convert it to a Pandas DataFrame
    df3 = extract_articles_and_content(html_content2,'mdc-list-group mds-list-group start-investing__links-group')
    # The extractors return None when their container is missing; only label real frames.
    if df3 is not None:
        df3['Label'] = 'start-investing'
    df4 = extract_articles_and_content2(html_content2,'mdc-topic-grid__inner mdc-topic-grid__inner--7')
    if df4 is not None:
        df4['Label'] = 'start-investing'

In [14]:
url3 = 'https://www.morningstar.com/portfolios'

# Fetch the HTML content of the page
html_content3 = fetch_page(url3)

# Default to None so the name exists even when the fetch fails; the later
# pd.concat drops None entries.
df5 = None

# If fetching was successful, proceed to extract card info
if html_content3:
    # Extract card info and convert it to a Pandas DataFrame
    df5 = extract_articles_and_content2(html_content3,'mdc-topic-grid__inner mdc-topic-grid__inner--20')
    # The extractor returns None when its container is missing; only label a real frame.
    if df5 is not None:
        # Label spelling corrected (was 'Protfolio').
        df5['Label'] = 'Portfolio'

In [15]:
# Stack the per-section frames into one table with a fresh 0..n-1 index.
section_frames = [df1, df2, df3, df4, df5]
df = pd.concat(section_frames, ignore_index=True)

In [16]:
# Remove newline and tab characters from the scraped title and author text.
for text_column in ('Article Title', 'Article Author'):
    df[text_column] = df[text_column].str.replace(r'[\n\t]', '', regex=True)

In [17]:
# Every URL scraped in this notebook is on morningstar.com, so tag all rows with the same source.
df['Article Source'] = 'Morningstar'

In [18]:
def extract_date(s):
    """Return the first date of the form ``Mon D, YYYY`` found in ``s``.

    The month must begin with a three-letter English abbreviation (Jan..Dec)
    and may continue with lowercase letters, so full month names match too.

    Returns:
        The matched date text, or ``None`` when ``s`` contains no such date.
    """
    found = re.search(r'((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{1,2},\s\d{4})', s)
    if not found:
        return None
    return found.group()
def extract_author(s, date):
    """Return ``s`` with every occurrence of ``date`` removed and the ends stripped.

    When ``date`` is falsy (e.g. ``None``) ``s`` is returned unchanged.
    """
    if not date:
        return s
    without_date = s.replace(date, '')
    return without_date.strip()

In [19]:
def _author_without_date(row):
    """Strip the row's extracted date out of its author text."""
    return extract_author(row['Article Author'], row['Article Date'])


# The scraped author text also carries the publication date: extract the date
# into its own column first, then remove it from the author column.
df['Article Date'] = df['Article Author'].apply(extract_date)
df['Article Author'] = df.apply(_author_without_date, axis=1)

In [20]:
# Show the assembled table as this cell's output.
df

Unnamed: 0,Article Title,Article Link,Article Content,Article Author,Label,Article Source,Article Date
0,Do 529 Plans Have Contribution Limits?,https://www.morningstar.com/articles/1094567/d...,"Susan Dziubinski: Hi, I'm Susan Dziubinski wit...","Patricia Oey, Susan Dziubinski",save-for-college,Morningstar,"May 20, 2022"
1,5 Charts Tracking Recent 529 Savings Plan Tre...,https://www.morningstar.com/articles/1095648/5...,The impetus for introducing 529 college saving...,Patricia Oey,save-for-college,Morningstar,"May 25, 2022"
2,Myths About 529 Plans,https://www.morningstar.com/articles/1094568/m...,"Susan Dziubinski: Hi, I'm Susan Dziubinski wit...","Patricia Oey, Susan Dziubinski",save-for-college,Morningstar,"May 20, 2022"
3,What Is a 529 College Savings Plan?,https://www.morningstar.com/articles/1095553/w...,Content not found,Author not found,save-for-college,Morningstar,
4,Vanguard Maintains Its Lead in 529 Plans,https://www.morningstar.com/articles/1094478/v...,Assets in 529 savings accounts grew over the c...,Patricia Oey,save-for-college,Morningstar,"May 17, 2022"
5,How to Allocate Assets for College Savings,https://www.morningstar.com/articles/844386/ho...,A version of this article was originally publi...,Christine Benz,save-for-college,Morningstar,"Jan 19, 2022"
6,3 Themes Shaping the Future of 529 Savings Pl...,https://www.morningstar.com/articles/1040262/3...,"As a graduation month, May augurs promise. Stu...",Madeline Hume,save-for-college,Morningstar,"May 20, 2021"
7,When Should You Start Saving for College?,https://www.morningstar.com/articles/1010710/w...,"Susan Dziubinski: Hi, I'm Susan Dziubinski wit...","Madeline Hume, Susan Dziubinski",save-for-college,Morningstar,"Dec 1, 2020"
8,What Bad Returns at the Wrong Time Can Mean f...,https://www.morningstar.com/articles/1003382/w...,I’ve written a couple of articles about how se...,Amy Arnott,save-for-college,Morningstar,"Oct 5, 2020"
9,How to Avoid Paying Extra Taxes on Your Colle...,https://www.morningstar.com/articles/983931/ho...,Editor’s note: Read the latest on how the coro...,Margaret Giles,save-for-college,Morningstar,"May 11, 2020"


In [21]:
import json
def save_to_json(df, filename):
    """Write ``df`` to ``filename`` as line-delimited JSON and print a confirmation.

    Args:
        df: DataFrame to export; each row becomes one JSON object on its own line.
        filename: Destination path passed to ``DataFrame.to_json``.
    """
    export_options = {'orient': 'records', 'lines': True}
    df.to_json(filename, **export_options)
    print(f"Data saved to {filename}")

In [22]:
# Persist the final table; save_to_json writes one JSON record per line.
save_to_json(df, 'morning_invest_updated.json')

Data saved to morning_invest_updated.json
