# Day 10 - Web Scrapping
## Import all libraries needed
bs4 stands for beautifulsoup4

In [24]:
import bs4 as bs
import requests

## Get the website using requests.get()

In [25]:
raw = requests.get('https://scrapethissite.com/pages/simple/')
#note: make sure that you put the link as a string

## Access the content of the website

In [26]:
raw.text
#note: this 

'<!doctype html>\n<html lang="en">\n  <head>\n    <meta charset="utf-8">\n    <title>Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping</title>\n    <link rel="icon" type="image/png" href="/static/images/scraper-icon.png" />\n\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <meta name="description" content="A single page that lists information about all the countries in the world. Good for those just get started with web scraping.">\n\n    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" crossorigin="anonymous">\n    <link href=\'https://fonts.googleapis.com/css?family=Lato:400,700\' rel=\'stylesheet\' type=\'text/css\'>\n    <link rel="stylesheet" type="text/css" href="/static/css/styles.css"

## Lets find out what data type source.text is

In [27]:
type(raw.text)

str

## Use BeautifulSoup to make things more readable

In [28]:
soup = bs.BeautifulSoup(raw.text)
#reders a more beautifully formatted source

print(soup)

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="A single page that lists information about all the countries in the world. Good for those just get started with web scraping." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robots"/>
<link h

## Apply soup.find_all on the h3 tag

In [62]:
country_raw = soup.find_all('h3')
#identify all of the texts that have h3
#create a list called h3_raw

print(country_raw)

[<h3 class="country-name">
<i class="flag-icon flag-icon-ad"></i>
                            Andorra
                        </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ae"></i>
                            United Arab Emirates
                        </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-af"></i>
                            Afghanistan
                        </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ag"></i>
                            Antigua and Barbuda
                        </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ai"></i>
                            Anguilla
                        </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-al"></i>
                            Albania
                        </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-am"></i>
                            Armenia
                        </h3>, <h3 class="country-name">
<i class="flag-icon

## Testing the information from h3_raw 

In [63]:
country_raw[3]
#a demonstration of the h3_raw list

<h3 class="country-name">
<i class="flag-icon flag-icon-ag"></i>
                            Antigua and Barbuda
                        </h3>

## Extracting a single country name

In [64]:
country_raw[3].get_text()
#note the excess new line characters and the excess white space -- data is not yet fully "cleaned"
#note: this creates a string

example = country_raw[3].get_text()
example.replace('/n','').strip()

print(example)



                            Antigua and Barbuda
                        


## Extracting all country names and saving to a list

In [65]:
country_processed = []
for x in range(len(country_raw)):
    temp = country_raw[x].get_text()
    temp1 = temp.replace('/n','').strip()
    country_processed.append(temp1)
    
country_processed

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina',
 'American Samoa',
 'Austria',
 'Australia',
 'Aruba',
 'Åland',
 'Azerbaijan',
 'Bosnia and Herzegovina',
 'Barbados',
 'Bangladesh',
 'Belgium',
 'Burkina Faso',
 'Bulgaria',
 'Bahrain',
 'Burundi',
 'Benin',
 'Saint Barthélemy',
 'Bermuda',
 'Brunei',
 'Bolivia',
 'Bonaire',
 'Brazil',
 'Bahamas',
 'Bhutan',
 'Bouvet Island',
 'Botswana',
 'Belarus',
 'Belize',
 'Canada',
 'Cocos [Keeling] Islands',
 'Democratic Republic of the Congo',
 'Central African Republic',
 'Republic of the Congo',
 'Switzerland',
 'Ivory Coast',
 'Cook Islands',
 'Chile',
 'Cameroon',
 'China',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Cape Verde',
 'Curacao',
 'Christmas Island',
 'Cyprus',
 'Czech Republic',
 'Germany',
 'Djibouti',
 'Denmark',
 'Dominica',
 'Dominican Republic',
 'Algeria',
 'Ecuador',
 'Estonia',
 'Egypt',
 'Western Sahara',
 'Eritrea',
 

## Getting all of the other country data and turning them into lists

In [66]:
#getting all of the capitol data
capital_raw = soup.find_all('span', attrs={'class': 'country-capital'})

capital_processed = []
for x in range(len(capital_raw)):
    temp = capital_raw[x].get_text()
    temp1 = temp.replace('/n','').strip()
    capital_processed.append(temp1)
        
#getting all of the population data
population_raw = soup.find_all('span', attrs={'class': 'country-population'})

population_processed = []
for x in range(len(population_raw)):
    temp = population_raw[x].get_text()
    temp1 = temp.replace('/n','').strip()
    population_processed.append(temp1)
    
#getting all of the area data
area_raw = soup.find_all('span', attrs={'class': 'country-area'})

area_processed = []
for x in range(len(area_raw)):
    temp = area_raw[x].get_text()
    temp1 = temp.replace('/n','').strip()
    area_processed.append(temp1)
area_processed

['468.0',
 '82880.0',
 '647500.0',
 '443.0',
 '102.0',
 '28748.0',
 '29800.0',
 '1246700.0',
 '1.4E7',
 '2766890.0',
 '199.0',
 '83858.0',
 '7686850.0',
 '193.0',
 '1580.0',
 '86600.0',
 '51129.0',
 '431.0',
 '144000.0',
 '30510.0',
 '274200.0',
 '110910.0',
 '665.0',
 '27830.0',
 '112620.0',
 '21.0',
 '53.0',
 '5770.0',
 '1098580.0',
 '328.0',
 '8511965.0',
 '13940.0',
 '47000.0',
 '49.0',
 '600370.0',
 '207600.0',
 '22966.0',
 '9984670.0',
 '14.0',
 '2345410.0',
 '622984.0',
 '342000.0',
 '41290.0',
 '322460.0',
 '240.0',
 '756950.0',
 '475440.0',
 '9596960.0',
 '1138910.0',
 '51100.0',
 '110860.0',
 '4033.0',
 '444.0',
 '135.0',
 '9250.0',
 '78866.0',
 '357021.0',
 '23000.0',
 '43094.0',
 '754.0',
 '48730.0',
 '2381740.0',
 '283560.0',
 '45226.0',
 '1001450.0',
 '266000.0',
 '121320.0',
 '504782.0',
 '1127127.0',
 '337030.0',
 '18270.0',
 '12173.0',
 '702.0',
 '1399.0',
 '547030.0',
 '267667.0',
 '244820.0',
 '344.0',
 '69700.0',
 '91000.0',
 '78.0',
 '239460.0',
 '6.5',
 '2166086.0