# Intro to Web Scraping (Demo)

Explore the site: https://www.scrapethissite.com/pages/simple/ 

Task: How many countries are included?

Make a dataframe containing each country's name, capital, population, and area

Export the data as a CSV file



## Make request

In [None]:
import requests
url = 'https://www.scrapethissite.com/pages/simple/'
# Always pass a timeout: without one, requests waits indefinitely
# if the server never sends a response
response = requests.get(url, timeout=10)

In [None]:
# Check the HTTP status code of the response (200 means the request succeeded)
response.status_code

In [None]:
# Preview the first 1000 bytes of the raw response body
response.content[:1000]

## Make Soup Object

In [None]:
from bs4 import BeautifulSoup

# Parse the raw HTML. The parser is named explicitly so the resulting tree
# does not depend on which optional parsers (lxml, html5lib) happen to be
# installed, and so Beautiful Soup does not warn about guessing one.
soup = BeautifulSoup(response.content, 'html.parser')

# To view the soup object, use .prettify()

soup.prettify()

In [None]:
# Printing renders the newlines, which makes the output easier to read
# (only the first 1000 characters are printed here)

print(soup.prettify()[:1000])

## Navigate the soup

In [None]:
# Find the title (the <title> tag, including its markup)
soup.title

In [None]:
# Adding .text returns just the content of the title, without the tags

soup.title.text

In [None]:
# Find the body (the <body> tag and everything nested inside it)
soup.body

In [None]:
# Find all h1 tags and count how many were found
h1_headers = soup.find_all('h1')
len(h1_headers)

In [None]:
# Display the matched h1 tags themselves
h1_headers

In [None]:
# Collect every <div> whose class attribute is "col-md-4 country"
# (one per country on the page), then count them
country_divs = soup.find_all("div", class_="col-md-4 country")
len(country_divs)

In [None]:
# Take the first country div as a sample to work out how to find
# each piece of information within it
country_div = country_divs[0]
country_div

In [None]:
# Render the sample div's markup as HTML in the notebook output
from IPython.display import HTML
HTML(str(country_div))

### Extract the country name

In [None]:
# Locate the <h3> heading that carries the "country-name" class
country_name = country_div.find('h3', class_='country-name')
country_name

In [None]:
# .text gives the heading's content without the surrounding tag
country_name.text

In [None]:
# Strip leading and trailing whitespace from the name
country_name.text.strip()

### Extract the capital

In [None]:
# Locate the <span> that carries the "country-capital" class
capital = country_div.find('span', class_='country-capital')
capital

In [None]:
# .text gives the span's content without the surrounding tag
capital.text

### Extract the Population

In [None]:
# Locate the <span> that carries the "country-population" class
population = country_div.find('span', class_='country-population')
population

In [None]:
# .text gives the span's content as a string, without the surrounding tag
population.text

### Extract the Area

In [None]:
# Locate the <span> that carries the "country-area" class
area = country_div.find('span', class_='country-area')
area

In [None]:
# .text gives the span's content as a string, without the surrounding tag
area.text

## Make a dictionary with all of the info

In [None]:
# Pull the four fields out of the current country div as plain strings.
# The name is stripped of surrounding whitespace; the other fields are
# used exactly as they appear in the markup.
country_name = country_div.find('h3', class_='country-name').text.strip()
capital = country_div.find('span', class_='country-capital').text
population = country_div.find('span', class_='country-population').text
area = country_div.find('span', class_='country-area').text

# Bundle the fields into a single record keyed by column name
country_dict = dict(
    Country=country_name,
    Capital=capital,
    Population=population,
    Area=area,
)

country_dict

## Loop through all of the countries

Create a list of dictionaries

In [None]:
# Accumulate one record (dictionary) per country div
country_data_list = []

for country_div in country_divs:
    # Same extraction as for the single sample div: the name is stripped
    # of surrounding whitespace, the other fields are taken as-is
    country_name = country_div.find('h3', class_='country-name').text.strip()
    capital = country_div.find('span', class_='country-capital').text
    population = country_div.find('span', class_='country-population').text
    area = country_div.find('span', class_='country-area').text

    # Keys here become the column names of the dataframe later on
    country_dict = dict(
        Country=country_name,
        Capital=capital,
        Population=population,
        Area=area,
    )

    country_data_list.append(country_dict)

In [None]:
# Confirm all countries are included (should equal the number of country divs)
len(country_data_list)

In [None]:
# Inspect a particular country (the record at index 10)
country_data_list[10]

## Convert List of dictionaries to a dataframe

In [None]:
import pandas as pd

In [None]:
# Each dictionary becomes one row; the dictionary keys become the columns
country_df = pd.DataFrame(country_data_list)
# Preview the first five rows
country_df.head()

In [None]:
# Export the df as a csv
from pathlib import Path

output_path = Path("../Data/country_info_scraped.csv")
# Create the target directory first: to_csv raises OSError when the
# directory it is asked to write into does not exist
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the dataframe's row index out of the file
country_df.to_csv(output_path, index=False)