## Data Acquisition

### Webscraping World Happiness

In [2]:
from io import StringIO

import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Wikipedia article whose first table holds the World Happiness Report ranking.
wrld_happiness_url = 'https://en.wikipedia.org/wiki/World_Happiness_Report'

# Without a timeout, requests waits indefinitely on a stalled connection,
# which would hang a Restart & Run All.
response = requests.get(wrld_happiness_url, timeout=30)
# Fail fast on 4xx/5xx so later cells never try to parse an error page.
response.raise_for_status()

In [4]:
# Sanity check: the request should have returned HTTP 200 before the body is parsed.
response.status_code

200

In [5]:
# Decoded HTML body of the response, as a str.
page = response.text

In [6]:
# Parse the downloaded HTML using the lxml parser backend.
soup = BeautifulSoup(markup=page, features="lxml")

In [7]:
# Collect every <table> element found in the parsed page.
tables = soup.find_all(name="table")

In [8]:
# Preview only the start of the first table: enough to see the header
# structure without dumping every row's markup into the notebook output.
# Raise the limit temporarily when exploring the markup in more depth.
print(tables[0].prettify()[:1500])

<table class="wikitable sortable">
 <tr valign="top">
  <th style="width: 10px;">
   Overall Rank
  </th>
  <th style="width: 10px;">
   Change in rank
  </th>
  <th style="width: 250px;">
   Country
  </th>
  <th>
   <abbr title="Happiness score">
    Score
   </abbr>
  </th>
  <th style="width: 10px;">
   Change in score
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: GDP">
    GDP per capita
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Social support">
    Social support
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Healthy life expectancy">
    Healthy life expectancy
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Freedom to make life choices">
    Freedom to make life choices
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Generosity">
    Generosity
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Perceptions of co

In [9]:
#rows=[row for row in tables[1].find_all('tr')]

In [10]:
#rows=rows[1:20]

In [11]:
# NOTE(review): disabled manual row-parsing attempt. It would key each row by
# the href of the link in its second cell and store the text of the remaining
# cells. The active cells below load the table with pd.read_html instead, so
# this can presumably be deleted — confirm nothing still depends on it.
#countries = {}
#for row in rows:
#    items=row.find_all('td')
#    country=items[1].find('a')['href']
#    countries[country]=[i.text for i in items[2:]]

In [12]:
#countries

In [13]:
# Parse the first <table> into a DataFrame, using the first row as the header
# and the first column as the index.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas; a file-like object is accepted by
# both old and new versions.
# NOTE(review): the recorded .info() output reports 157 rows for ranks 1-155
# and an object-dtype Score column, so the table presumably contains some
# non-data rows — confirm and clean before any numeric analysis.
happiness_2017 = pd.read_html(StringIO(str(tables[0])), header=0, index_col=0)[0]

In [14]:
# Peek at the first rows to check that the header and index were parsed as intended.
happiness_2017.head()

Unnamed: 0_level_0,Change in rank,Country,Score,Change in score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Trust,Residual
Overall Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,Norway,7.537,0.039,1.616,1.534,0.797,0.635,0.362,0.316,2.277
2,-1,Denmark,7.522,-0.004,1.482,1.551,0.793,0.626,0.355,0.401,2.314
3,0,Iceland,7.504,0.003,1.481,1.611,0.834,0.627,0.476,0.154,2.323
4,-2,Switzerland,7.494,-0.015,1.565,1.517,0.858,0.62,0.291,0.367,2.277
5,0,Finland,7.469,0.056,1.444,1.54,0.809,0.618,0.245,0.383,2.43


In [26]:
# Inspect column dtypes and non-null counts to spot parsing problems.
happiness_2017.info()

<class 'pandas.core.frame.DataFrame'>
Index: 157 entries, 1 to 155
Data columns (total 11 columns):
Change in rank                  154 non-null object
Country                         157 non-null object
Score                           157 non-null object
Change in score                 152 non-null float64
GDP per capita                  155 non-null float64
Social support                  155 non-null float64
Healthy life expectancy         155 non-null float64
Freedom to make life choices    155 non-null float64
Generosity                      155 non-null float64
Trust                           155 non-null float64
Residual                        155 non-null float64
dtypes: float64(8), object(3)
memory usage: 19.7+ KB


### Webscraping Country Fact URLs

Set URL and capture response. Check status of response.

In [16]:
# Landing page of the CIA World Factbook.
wrld_fact_home_url = 'https://www.cia.gov/library/publications/resources/the-world-factbook/'
# Bound the wait: without a timeout a stalled connection blocks the notebook forever.
response = requests.get(wrld_fact_home_url, timeout=30)
# Fail fast on 4xx/5xx so later cells never try to parse an error page.
response.raise_for_status()
response.status_code

200

Grab html and make a BeautifulSoup object with it.

In [17]:
# Decode the body explicitly as UTF-8. The page declares charset=utf-8 in its
# <meta> tags, but response.text falls back to ISO-8859-1 when the HTTP headers
# name no charset. That presumably explains the garbled "â" in the previously
# recorded <title> output, and it would also corrupt accented country names.
page = response.content.decode('utf-8', errors='replace')
soup = BeautifulSoup(page,'lxml')
# Preview only the start of the document rather than printing the whole page.
# Raise the limit temporarily when exploring the markup in more depth.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <!--<![endif]-->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="css/publications.css" rel="stylesheet" type="text/css"/>
  <link href="css/publications-detail.css" rel="stylesheet" type="text/css"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title>
   The World Factbook â Central Intelligence Agency
  </title>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="Apr 01, 2016" name="LastModified"/>
  <link href="css/jobcart.css" rel="stylesheet"/>
  <link href="css/smallscreen.css" rel="

Find the HTML containing the country-name-to-URL mappings and put them in a list.
They live in a dropdown menu on the page: each option's value is the relative URL and its text is the country name.

In [22]:
# Walk down to the country dropdown step by step: the element with
# id="cntrySelect", the first <select> inside it, then all of its <option>s.
select_container = soup.find(id='cntrySelect')
dropdown = select_container.find('select')
cntry_select = dropdown.find_all('option')
cntry_select

[<option value="">Please select a country to view</option>,
 <option value="geos/xx.html"> World </option>,
 <option value="geos/af.html"> Afghanistan </option>,
 <option value="geos/ax.html"> Akrotiri </option>,
 <option value="geos/al.html"> Albania </option>,
 <option value="geos/ag.html"> Algeria </option>,
 <option value="geos/aq.html"> American Samoa </option>,
 <option value="geos/an.html"> Andorra </option>,
 <option value="geos/ao.html"> Angola </option>,
 <option value="geos/av.html"> Anguilla </option>,
 <option value="geos/ay.html"> Antarctica </option>,
 <option value="geos/ac.html"> Antigua and Barbuda </option>,
 <option value="geos/xq.html"> Arctic Ocean </option>,
 <option value="geos/ar.html"> Argentina </option>,
 <option value="geos/am.html"> Armenia </option>,
 <option value="geos/aa.html"> Aruba </option>,
 <option value="geos/at.html"> Ashmore and Cartier Islands </option>,
 <option value="geos/zh.html"> Atlantic Ocean </option>,
 <option value="geos/as.html"> Au

Put these mappings into a dictionary, with the country name as the key and the relative URL (wrapped in a one-element list) as the value.

In [77]:
# Map each country name (option text, whitespace-stripped) to a one-element
# list holding its relative URL (the option's value attribute).
# The dropdown starts with a "Please select a country to view" placeholder
# whose value is empty; options without a value are skipped so the mapping
# only contains entries that point at a page.
country_url = {
    option.text.strip(): [option['value']]
    for option in cntry_select
    if option.get('value')
}
country_url

{'Afghanistan': ['geos/af.html'],
 'Akrotiri': ['geos/ax.html'],
 'Albania': ['geos/al.html'],
 'Algeria': ['geos/ag.html'],
 'American Samoa': ['geos/aq.html'],
 'Andorra': ['geos/an.html'],
 'Angola': ['geos/ao.html'],
 'Anguilla': ['geos/av.html'],
 'Antarctica': ['geos/ay.html'],
 'Antigua and Barbuda': ['geos/ac.html'],
 'Arctic Ocean': ['geos/xq.html'],
 'Argentina': ['geos/ar.html'],
 'Armenia': ['geos/am.html'],
 'Aruba': ['geos/aa.html'],
 'Ashmore and Cartier Islands': ['geos/at.html'],
 'Atlantic Ocean': ['geos/zh.html'],
 'Australia': ['geos/as.html'],
 'Austria': ['geos/au.html'],
 'Azerbaijan': ['geos/aj.html'],
 'Bahamas, The': ['geos/bf.html'],
 'Bahrain': ['geos/ba.html'],
 'Baker Island': ['geos/um.html'],
 'Bangladesh': ['geos/bg.html'],
 'Barbados': ['geos/bb.html'],
 'Belarus': ['geos/bo.html'],
 'Belgium': ['geos/be.html'],
 'Belize': ['geos/bh.html'],
 'Benin': ['geos/bn.html'],
 'Bermuda': ['geos/bd.html'],
 'Bhutan': ['geos/bt.html'],
 'Bolivia': ['geos/bl.html

We only want to include countries that are in the happiness_2017 dataframe.

In [100]:
# Look up the relative URL for every country in the happiness table.
# .get() returns None for names that have no exact match in country_url,
# so those countries stay in the dict as keys but carry no URL.
country_url2 = {
    country: country_url.get(country)
    for country in happiness_2017.Country
}

# Surface the unmatched names instead of leaving the None values hidden in
# the dict: they need a manual name mapping before their pages can be scraped.
countries_without_url = [
    country for country, url in country_url2.items() if url is None
]
countries_without_url

In [96]:
# Inspect the mapping; a value of None marks a country name that was not a key in country_url.
country_url2

{'Afghanistan': ['geos/af.html'],
 'Albania': ['geos/al.html'],
 'Algeria': ['geos/ag.html'],
 'Angola': ['geos/ao.html'],
 'Argentina': ['geos/ar.html'],
 'Armenia': ['geos/am.html'],
 'Australia': ['geos/as.html'],
 'Austria': ['geos/au.html'],
 'Azerbaijan': ['geos/aj.html'],
 'Bahrain': ['geos/ba.html'],
 'Bangladesh': ['geos/bg.html'],
 'Belarus': ['geos/bo.html'],
 'Belgium': ['geos/be.html'],
 'Belize': ['geos/bh.html'],
 'Benin': ['geos/bn.html'],
 'Bhutan': ['geos/bt.html'],
 'Bolivia': ['geos/bl.html'],
 'Bosnia and Herzegovina': ['geos/bk.html'],
 'Botswana': ['geos/bc.html'],
 'Brazil': ['geos/br.html'],
 'Bulgaria': ['geos/bu.html'],
 'Burkina Faso': ['geos/uv.html'],
 'Burundi': ['geos/by.html'],
 'Cambodia': ['geos/cb.html'],
 'Cameroon': ['geos/cm.html'],
 'Canada': ['geos/ca.html'],
 'Central African Republic': ['geos/ct.html'],
 'Chad': ['geos/cd.html'],
 'Chile': ['geos/ci.html'],
 'China': ['geos/ch.html'],
 'Colombia': ['geos/co.html'],
 'Congo (Brazzaville)': None

In [99]:
# True when country_url2 has one key per row of happiness_2017.Country, i.e.
# no duplicate names were collapsed into a single key. It does not show that
# every country has a URL: keys whose value is None are counted too.
len(country_url2) == len(happiness_2017.Country)

True

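So country_url2 has one entry for every country in happiness_2017, along with the URLs needed to get some more data. Note that countries not found under the same name in the Factbook dropdown (e.g. 'Congo (Brazzaville)') map to None and still need a manual name mapping before their pages can be scraped.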

Next step, go to each url in country_url2 and gather more features.

### Webscraping Country Facts