# Web Scrapers

In [50]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

## Scraper for News About China

In [51]:
def scrape_china():
    """Scrape The Diplomat's East Asia listing and the text of each article.

    Fetches the region listing page, follows every ``a.td-post`` link that
    contains an ``h4`` title, and collects the paragraph text found inside
    the ``section#tda-gated-body`` element of each article page.  When the
    listing page is retrieved successfully the rows are also written to
    ``data/china.csv``.

    Returns:
        pandas.DataFrame: One row per article with the columns ``title``,
        ``link`` and ``content``.  ``content`` is an empty string when the
        article page could not be fetched or has no body section.  If the
        listing page does not return HTTP 200, a message is printed, no CSV
        is written and an empty frame with the same columns is returned.

    Raises:
        requests.RequestException: If the request for the listing page fails
            at the network level (connection error, timeout, ...).
    """
    import os
    from urllib.parse import urljoin

    columns = ['title', 'link', 'content']
    # Without a timeout a stalled server would block the scraper forever.
    timeout = 30

    # Listing page for the East Asia region
    url = 'https://thediplomat.com/regions/east-asia/'

    # Set up headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        # Return an empty frame instead of falling through to an unbound name.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Each article teaser on the listing page is an <a class="td-post">
    articles = soup.find_all('a', class_='td-post')

    article_data = []
    for article in articles:
        # .get() so an anchor without href is skipped rather than raising KeyError
        link = article.get('href')
        title_tag = article.find('h4')
        if title_tag is None or not link:
            continue

        title = title_tag.text.strip()
        # urljoin handles both site-relative and already-absolute hrefs
        full_link = urljoin(url, link)

        # Reset per article so a failed fetch never reuses the previous text
        content = ""
        try:
            response_article = requests.get(full_link, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable article should not abort the whole scrape
            print(f"Failed to retrieve {full_link}: {exc}")
            response_article = None

        # Check the article response itself, not the listing response
        if response_article is not None and response_article.status_code == 200:
            soup_article = BeautifulSoup(response_article.content, 'html.parser')
            section = soup_article.find("section", id="tda-gated-body")
            if section is not None:
                content = ''.join(
                    para.text.strip() + '\n' for para in section.find_all("p")
                )

        article_data.append({'title': title, 'link': full_link, 'content': content})

    df = pd.DataFrame(article_data, columns=columns)

    # Make sure the output directory exists before writing the CSV
    os.makedirs('data', exist_ok=True)
    df.to_csv('data/china.csv', index=False)

    return df

# Run the scraper and keep the resulting DataFrame for inspection in the next cell.
df_china = scrape_china()

In [59]:
df_china.head()

Unnamed: 0,title,link,content
0,First Known Survivor of China’s Forced Organ H...,https://thediplomat.com/2024/08/first-known-su...,In a chilling revelation that underscores the ...
1,China Hits Back at the US in Response to Dopin...,https://thediplomat.com/2024/08/china-hits-bac...,China is trying to fight fire with fire in the...
2,SEATO’s 70th Anniversary: Lessons for Asia’s E...,https://thediplomat.com/2024/08/seatos-70th-an...,September 2024 marks the 70th anniversary of t...
3,Nagasaki’s Atomic Bomb Commemoration Overshado...,https://thediplomat.com/2024/08/nagasakis-atom...,"For the Japanese, August is a special month. T..."
4,Japan Issues First ‘Megaquake’ Warning,https://thediplomat.com/2024/08/japan-issues-f...,Japanese Prime Minister Kishida Fumio canceled...


## Scraper for News About Retail

In [54]:
# Ad-hoc request to the listing page (no custom headers). scrape_retail()
# below issues its own request and does not read this module-level variable.
response = requests.get('https://chainstoreage.com/market-segment/retail/discount-store')

In [118]:
def scrape_retail():
    """Scrape Chain Store Age's discount-store listing and each article's text.

    Fetches the listing page, follows every anchor carrying the class
    ``astro-hh3v2ni7`` or ``heading__link``, and collects the text of all
    ``p`` and ``ul`` elements inside ``article.content`` on each article
    page.  When the listing page is retrieved successfully the rows are also
    written to ``data/retail.csv``.

    For ``heading__link`` anchors the anchor text itself is the title; for the
    other anchors the title is taken from a nested ``h3``.

    Returns:
        pandas.DataFrame: One row per matched anchor with the columns
        ``title``, ``link`` and ``content``.  ``content`` is an empty string
        when the article page could not be fetched or has no
        ``article.content`` element.  If the listing page does not return
        HTTP 200, a message is printed, no CSV is written and an empty frame
        with the same columns is returned.

    Raises:
        requests.RequestException: If the request for the listing page fails
            at the network level (connection error, timeout, ...).
    """
    import os
    from urllib.parse import urljoin

    columns = ['title', 'link', 'content']
    # Without a timeout a stalled server would block the scraper forever.
    timeout = 30

    # Listing page for the discount-store market segment
    url = 'https://chainstoreage.com/market-segment/retail/discount-store'

    # Set up headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        # Return an empty frame instead of falling through to an unbound name.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Anchors matching either class are treated as article links.
    # NOTE: both selectors can point at the same article, so links may repeat.
    articles = soup.find_all('a', class_=['astro-hh3v2ni7', 'heading__link'])

    article_data = []
    for article in articles:
        # .get() so an anchor without href is skipped rather than raising KeyError
        link = article.get('href')
        if "heading__link" in article.get("class", []):
            title_tag = article
        else:
            title_tag = article.find('h3')
        if title_tag is None or not link:
            continue

        title = title_tag.text.strip()
        # urljoin handles both site-relative and already-absolute hrefs
        full_link = urljoin(url, link)

        # Reset per article so a failed fetch never reuses the previous text
        content = ""
        try:
            response_article = requests.get(full_link, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable article should not abort the whole scrape
            print(f"Failed to retrieve {full_link}: {exc}")
            response_article = None

        # Check the article response itself, not the listing response
        if response_article is not None and response_article.status_code == 200:
            soup_article = BeautifulSoup(response_article.content, 'html.parser')
            section = soup_article.find("article", class_="content")
            if section is not None:
                content = ''.join(
                    para.text.strip() + '\n'
                    for para in section.find_all(["p", "ul"])
                )

        article_data.append({'title': title, 'link': full_link, 'content': content})

    df_retail = pd.DataFrame(article_data, columns=columns)

    # Make sure the output directory exists before writing the CSV
    os.makedirs('data', exist_ok=True)
    df_retail.to_csv('data/retail.csv', index=False)

    return df_retail

In [121]:
df_retail

Unnamed: 0,title,link,content
0,Walmart breaks record with Q2 online grocery s...,https://chainstoreage.com/walmart-breaks-recor...,facebooktwitterlinkedInemail\nWalmart Inc. is ...
1,Walmart Inc. is more dominant in the U.S. onli...,https://chainstoreage.com/walmart-breaks-recor...,facebooktwitterlinkedInemail\nWalmart Inc. is ...
2,Big Lots closing about 300 stores; California ...,https://chainstoreage.com/big-lots-closing-abo...,facebooktwitterlinkedInemail\nBig Lots is down...
3,Big Lots is shrinking its brick-and-mortar foo...,https://chainstoreage.com/big-lots-closing-abo...,facebooktwitterlinkedInemail\nBig Lots is down...
4,Dollar General launches omnichannel back-to-sc...,https://chainstoreage.com/dollar-general-launc...,facebooktwitterlinkedInemail\nDollar General i...
5,Family Dollar optimizes assortment with predic...,https://chainstoreage.com/family-dollar-optimi...,facebooktwitterlinkedInemail\nFamily Dollar is...
6,Target lauches back-to-school personalization ...,https://chainstoreage.com/target-lauches-back-...,facebooktwitterlinkedInemail\nTarget Corp. is ...
7,Walmart creates metaverse shop for college stu...,https://chainstoreage.com/walmart-creates-meta...,facebooktwitterlinkedInemail\nWalmart is expan...
8,Walmart targets food waste with in-store 'depa...,https://chainstoreage.com/walmart-targets-food...,facebooktwitterlinkedInemail\nWalmart Inc. is ...
9,Ollie's to launch its first-ever credit card —...,https://chainstoreage.com/ollies-launch-its-fi...,facebooktwitterlinkedInemail\nThe country's la...


## Scraper for Market News

In [132]:
# Ad-hoc request for a single article page (no custom headers). scrape_market()
# below issues its own requests and does not read this module-level variable.
response = requests.get('https://www.fool.com/investing/2024/08/11/down-79-this-growth-stock-could-double-in-the-hous/')

In [155]:
def scrape_market():
    """Scrape The Motley Fool's investing-news listing and each article's text.

    Fetches the listing page, takes the first 15 ``a.text-gray-1100`` anchors,
    follows those that contain an ``h5`` title, and collects the text of all
    ``p`` and ``ul`` elements inside the article's
    ``div.foolcom-grid-content-sidebar`` container.  When the listing page is
    retrieved successfully the rows are also written to ``data/market.csv``.

    Returns:
        pandas.DataFrame: One row per article with the columns ``title``,
        ``link`` and ``content``.  ``content`` is an empty string when the
        article page could not be fetched.  Articles whose page has neither
        one nor two matching containers are skipped entirely.  If the listing
        page does not return HTTP 200, a message is printed, no CSV is
        written and an empty frame with the same columns is returned.

    Raises:
        requests.RequestException: If the request for the listing page fails
            at the network level (connection error, timeout, ...).
    """
    import os
    from urllib.parse import urljoin

    columns = ['title', 'link', 'content']
    # Without a timeout a stalled server would block the scraper forever.
    timeout = 30

    # Listing page for investing news
    url = 'https://www.fool.com/investing-news/'

    # Set up headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        # Return an empty frame instead of falling through to an unbound name.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Article teasers on the listing page are <a class="text-gray-1100">
    articles = soup.find_all('a', class_='text-gray-1100')

    article_data = []
    # Only the first 15 matching anchors are considered
    for article in articles[:15]:
        # .get() so an anchor without href is skipped rather than raising KeyError
        link = article.get('href')
        title_tag = article.find('h5')
        if title_tag is None or not link:
            continue

        title = title_tag.text.strip()
        # urljoin handles both site-relative and already-absolute hrefs
        full_link = urljoin(url, link)

        # Reset per article so a failed fetch never reuses the previous text
        content = ""
        try:
            response_article = requests.get(full_link, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable article should not abort the whole scrape
            print(f"Failed to retrieve {full_link}: {exc}")
            response_article = None

        # Check the article response itself, not the listing response
        if response_article is not None and response_article.status_code == 200:
            soup_article = BeautifulSoup(response_article.content, 'html.parser')
            sections = soup_article.find_all("div", class_="foolcom-grid-content-sidebar")
            # With two containers the second one is used; with one, that one.
            if len(sections) == 2:
                section = sections[1]
            elif len(sections) == 1:
                section = sections[0]
            else:
                # Unexpected page layout: drop this article
                continue
            content = ''.join(
                para.text.strip() + '\n'
                for para in section.find_all(["p", "ul"])
            )

        article_data.append({'title': title, 'link': full_link, 'content': content})

    df_market = pd.DataFrame(article_data, columns=columns)

    # Make sure the output directory exists before writing the CSV
    os.makedirs('data', exist_ok=True)
    df_market.to_csv('data/market.csv', index=False)

    return df_market

In [156]:
df_market

Unnamed: 0,title,link,content
0,"Down 79%, This Growth Stock Could Double in th...",https://www.fool.com/investing/2024/08/11/down...,A recovery in the housing market could help re...
1,"Down 54%, Is It Time to Buy the Dip on This Gr...",https://www.fool.com/investing/2024/08/11/down...,GXO Logistics slipped on its earnings report. ...
2,Down Between 12% and 24% From Their 52-Week Hi...,https://www.fool.com/investing/2024/08/11/down...,Buying excellent companies when they are out o...
3,"Down 79%, This Growth Stock Could Double in th...",https://www.fool.com/investing/2024/08/11/down...,A recovery in the housing market could help re...
4,Is Now the Time to Buy the 2 Worst-Performing ...,https://www.fool.com/investing/2024/08/11/is-n...,One of these stocks is a seemingly more promis...
5,Prediction: 1 Top Artificial Intelligence (AI)...,https://www.fool.com/investing/2024/08/11/pred...,A couple of solid AI-related growth opportunit...
6,Where Will Amazon Stock Be in 3 Years?,https://www.fool.com/investing/2024/08/11/wher...,Can this tech conglomerate hold on to its loft...
7,Market Sell-Off: Is It Time to Buy the Dip on ...,https://www.fool.com/investing/2024/08/11/mark...,Investors might want to take advantage of the ...
8,Elon Musk Just Said Tesla Has a $200 Trillion ...,https://www.fool.com/investing/2024/08/11/elon...,Tesla's next big opportunity has nothing to do...
9,Mark Zuckerberg Has a Bold Vision for Meta's C...,https://www.fool.com/investing/2024/08/11/mark...,Artificial intelligence could enhance Meta's e...
