## Let's use the requests and BeautifulSoup library to scrape data from the html structure of a webpage
### Remember!
- requests fetches the raw data from the webpage 
- BeautifulSoup converts the raw data into a parsable format

In [7]:
# install packages
pip install -r requirements.txt

Collecting blinker==1.6.2 (from -r requirements.txt (line 5))
  Downloading blinker-1.6.2-py3-none-any.whl (13 kB)
Collecting click==8.1.6 (from -r requirements.txt (line 8))
  Obtaining dependency information for click==8.1.6 from https://files.pythonhosted.org/packages/1a/70/e63223f8116931d365993d4a6b7ef653a4d920b41d03de7c59499962821f/click-8.1.6-py3-none-any.whl.metadata
  Downloading click-8.1.6-py3-none-any.whl.metadata (3.0 kB)
Collecting Flask==2.3.2 (from -r requirements.txt (line 13))
  Downloading Flask-2.3.2-py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.9/96.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting itsdangerous==2.1.2 (from -r requirements.txt (line 17))
  Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Collecting numpy==1.25.2 (from -r requirements.txt (line 25))
  Obtaining dependency information for numpy==1.25.2 from https://files.pythonhosted.org/packages/86/a1/b8ef999c32f26a97b5f714887e21f96c12ae99

In [14]:
from bs4 import BeautifulSoup
import requests

url = "http://localhost:5000"  # url of the website

# I would explain how to connect and what local host is
response = requests.get(url)

#print the content of the response (the html of the website)

print(response.content)


ConnectionError: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10b2056d0>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
#initialize the BeautifulSoup object with the content of the response
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
#print the content in a more readable format
print(soup.prettify())

In [None]:
# print the title tag
print(soup.title)


In [None]:
# print the string in the title tag
print(soup.title.string)

In [None]:
# print the parent tag of the title tag
print(soup.title.parent.name)

In [None]:
# get the content of the unordered list
print(soup.ul.text)

## Let's save our output!

### First we can have a look at exploring and printing different parts of the text from the html 

In [4]:

# Let's say we want to extract recipe titles and descriptions
recipe_divs = soup.find_all('div', class_='recipe')

# Let's print out the first recipe title and description
titles_description = []
for recipe_div in recipe_divs:
    #here we save the content of the h2 tag in the variable title
    title = recipe_div.find('h2').text
    #here we save the content of the p tag in the variable description
    description = recipe_div.find('p').text
    #here we print the title and description
    print(f"Recipe Title: {title}")
    print(f"Description: {description}")
    #this is just to print a line between each recipe
    print("-" * 30)
    #we will also append this data to a list so that we can export it to a dataframe later
    titles_description.append([title, description])


Recipe Title: Chocolate Chip Cookies
Description: These delicious cookies are a classic treat.
------------------------------
Recipe Title: Vegetable Stir-Fry
Description: This colorful stir-fry is packed with fresh veggies.
------------------------------


## Second we can structure our ouput in a dataframe and then export it to a .csv using the pandas library

A dataframe is a data structure that has columns and rows, think Excel spreadsheet!


In [8]:
import pandas as pd
#turn our list of lists into a dataframe
df = pd.DataFrame(titles_description)
#check the first rows of the dataframe
print(df.head())
#output our dataframe to a csv
df.to_csv('titles_description.csv', header= ['Title', 'Description'],index=False)

                        0                                                  1
0  Chocolate Chip Cookies       These delicious cookies are a classic treat.
1      Vegetable Stir-Fry  This colorful stir-fry is packed with fresh ve...


In [34]:
# maybe then we could give a real case example?
url_coop = "https://www.coop.ch/de/"

# I would explain how to connect and what local host is
response = requests.get(url_coop)

#print the content of the response (the html of the website)

print(response.content)



It is important to check: robots.txt
https://www.coop.ch/robots.txt

In [35]:
#initialize the BeautifulSoup object with the content of the response
soup = BeautifulSoup(response.content, 'html.parser')

In [36]:
#print the content in a more readable format
print(soup.prettify())

<!DOCTYPE html>
<!--[if IE 9]><html class="ie9 "><![endif]-->
<!--[if gt IE 9 | !IE]><!-->
<html class="no-js" data-error-modal-paragraph="" data-error-modal-title="" lang="de">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <script>
   window.utag_cfg_ovrd = window.utag_cfg_ovrd || {}; window.utag_cfg_ovrd.noview = true;
  </script>
  <script src="//tags.tiqcdn.com/utag/coop-ch/coop-ch/prod/utag.sync.js" type="text/javascript">
  </script>
  <title>
   Der Online-Supermarkt von Coop | coop.ch
  </title>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link crossorigin="" href="https://connect.facebook.net" rel="preconnect"/>
  <link href="https://connect.facebook.net" rel="dns-prefetch"/>
  <link crossorigin="" href="https://bat.bing.com" rel="preconnect"/>
  <link href="https://bat.bing.com" rel="dns-prefetch"/>
  <link crossorigin="" href="https://adservice.google.com" rel="preconnect"/>
  <link href="https://adservice.google.com" rel="dns-prefetch"/

In [37]:
# print the string in the title tag
print(soup.title.string)

Der Online-Supermarkt von Coop | coop.ch


In [None]:
# I could show an example of how to get a product price