# Data Scraper with .csv output
Inspired by the YouTube tutorials by Alex The Analyst:

- [BeautifulSoup + Requests | Web Scraping in Python](https://www.youtube.com/watch?v=bargNl2WeN4)
- [Find and Find_All | Web Scraping in Python](https://www.youtube.com/watch?v=xjA1HjvmoMY)

## First try with requests and BeautifulSoup

### Import modules

In [30]:
# !pip install beautifulsoup4 requests
import requests
from bs4 import BeautifulSoup

### Load one page with 'requests' from URL and check the document

In [None]:
# Page to fetch for the first experiments.
url = "https://www.scrapethissite.com/pages/forms/"
# Always pass a timeout: without one, requests can wait forever on a stalled server.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the raw HTML returned by the server.
response.text

### Create the BeautifulSoup object and check the document

In [None]:
# Name the parser explicitly: the generic "html" feature lets bs4 choose whichever
# parser happens to be installed, so the parsed tree can differ between machines.
soup = BeautifulSoup(response.text, "html.parser")
# Last expression of the cell: shows the parsed document.
soup

### Prettify the document data

In [None]:
# Render the parsed document with one tag per line and nested indentation, then show it.
pretty_html = soup.prettify()
print(pretty_html)

### Try to fetch the title

In [34]:
# Look up the first <title> element, then display its text as the cell result.
title_tag = soup.find("title")
title_tag.text

'Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping'

### Try to read the pagination links

In [35]:
# Locate the <ul class="pagination"> element and collect its anchor tags.
pagination = soup.find("ul", class_="pagination")
page_links = pagination.find_all("a")
for link in page_links:
    # Output is intentionally disabled; uncomment to print each page URL:
    # print(url + "?" + link["href"].split("?")[1])
    pass

### Try to read the table

In [None]:
# Find the element <table class="table"> and collect all of its rows.
table = soup.find("table", class_="table")
rows = table.find_all("tr")

for row in rows:
    ths = row.find_all("th")
    tds = row.find_all("td")

    # Print only the parts a row actually contains, so rows without header
    # cells do not emit "[]" and rows without data cells do not emit a blank line.
    if ths:
        print([th.text.strip() for th in ths])
    if tds:
        print("|".join(td.text.strip() for td in tds))

## Let's combine all the parts into our complete scraper

### Import the modules again

In [37]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Configure the URL to load and the output filename

In [None]:
# Scrape every page of the table and write the collected rows to a CSV file.
output_filename = "data/hockey.csv"

url = "https://www.scrapethissite.com/pages/forms/"
pagination = None  # URLs still to visit; None until the first page was parsed

headers = []  # column names, taken from the first header row encountered
data = []  # one list of cell texts per data row, accumulated over all pages

while True:
    print("Load page:", url)
    # A timeout keeps the loop from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)

    if response.status_code != 200:
        # Stop scraping, but say why; rows collected so far are still written.
        print(f"Request failed with status {response.status_code}, stopping early!")
        break

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # Runs only on the first iteration: build the list of follow-up pages.
    if pagination is None:
        ul_pagination = soup.find("ul", class_="pagination")
        anchors = ul_pagination.find_all("a") if ul_pagination is not None else []
        # Drop the first and the last link -- presumably the page already
        # loaded and a "next" arrow; verify against the site's markup.
        pagination = [url + "?" + anker["href"].split("?")[1] for anker in anchors][1:-1]

    # Read the table and its rows; a page without a table contributes no rows.
    table = soup.find("table", class_="table")
    if table is None:
        print("No table found on this page, skipping it!")
        rows = []
    else:
        rows = table.find_all("tr")

    # Loop over all rows and extract the headers (once) and the data.
    for row in rows:
        ths = row.find_all("th")
        if ths and not headers:
            headers = [th.text.strip() for th in ths]
            continue

        # Read all columns of the row into a list and append it to data.
        tds = row.find_all("td")
        if tds:  # rows without <td> cells (e.g. repeated header rows) are skipped
            data.append([td.text.strip() for td in tds])

    # Move on to the next page, if any is left.
    if not pagination:
        print("No more pages! Have a nice day!")
        break

    print("Take a little nap!")
    time.sleep(1.5)  # the server owner allows only one request per second
    url = pagination.pop(0)


# Passing the headers to the constructor also works when no data rows were
# collected; assigning df.columns afterwards would raise a length mismatch.
df = pd.DataFrame(data, columns=headers or None)

print(f"Write to CSV file ({output_filename})")
# Create the target directory first so the scrape result is not lost to an OSError.
Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_filename, index=False, header=True)