In [34]:
import numpy as np
import pandas as pd
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Web Scraping

- **Spletna stran:** https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
- **Tabela:** Sovereign states and dependencies by population        

### Naloga

Tabelo na zgornji strani pretvorite v urejen pandas dataframe, ki vsebuje naslednje stolpce (pozor na ustrezen tip in index):
- Rank (Index) - int
- Country name - object
- Population - int
- Date - Datetime
- % of world population - int


In [35]:
def getHTMLContent(link):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    link : str
        URL of the page to fetch; passed unchanged to ``urlopen``.

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with the built-in ``'html.parser'`` backend.

    Notes
    -----
    Errors raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    # The response is read completely while the soup is being built, so it
    # can be closed as soon as the constructor returns; the ``with`` block
    # guarantees the connection is released even if parsing fails.
    with urlopen(link) as html:
        return BeautifulSoup(html, 'html.parser')

In [36]:
# Download the Wikipedia population page and parse it into a BeautifulSoup tree.
content = getHTMLContent('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population')
# Collect every <table> element found on the page.
tables = content.find_all('table')
# Print the indented HTML of each table to inspect the markup
# (exploratory step; the output is very long).
for table in tables:
    print(table.prettify())

<table class="wikitable sortable" style="text-align:right">
 <tbody>
  <tr>
   <th data-sort-type="number">
    Rank
   </th>
   <th>
    Country
    <br/>
    <small>
     (or dependent territory)
    </small>
   </th>
   <th>
    Population
   </th>
   <th>
    Date
   </th>
   <th>
    % of world
    <br/>
    population
   </th>
   <th class="unsortable">
    Source
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon" style="display:inline-block;width:25px;">
     <img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/23px-Flag_of_the_People%27s_Republic_of_China.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/35px-Flag_of_the_People%27s_Republic_of_China.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_

In [37]:
# Pick the table whose class attribute is exactly 'wikitable sortable' and
# split it into its <tr> rows.
table = content.find('table', {'class': 'wikitable sortable'})
rows = table.find_all('tr')

# List of all links
for row in rows:
    cells = row.find_all('td')
    # Rows with at most one <td> carry no country cell and are skipped.
    if len(cells) > 1:
        country_link = cells[1].find('a')
        # find() returns None when the cell holds no <a>; skip such a row
        # instead of failing with AttributeError on .get().
        if country_link is not None:
            print(country_link.get('href'))

/wiki/China
/wiki/India
/wiki/United_States
/wiki/Indonesia
/wiki/Brazil
/wiki/Pakistan
/wiki/Nigeria
/wiki/Bangladesh
/wiki/Russia
/wiki/Mexico
/wiki/Japan
/wiki/Philippines
/wiki/Egypt
/wiki/Ethiopia
/wiki/Vietnam
/wiki/Democratic_Republic_of_the_Congo
/wiki/Germany
/wiki/Iran
/wiki/Turkey
/wiki/France
/wiki/Thailand
/wiki/United_Kingdom
/wiki/Italy
/wiki/South_Africa
/wiki/Tanzania
/wiki/Myanmar
/wiki/Kenya
/wiki/South_Korea
/wiki/Colombia
/wiki/Spain
/wiki/Argentina
/wiki/Algeria
/wiki/Ukraine
/wiki/Sudan
/wiki/Uganda
/wiki/Iraq
/wiki/Poland
/wiki/Canada
/wiki/Morocco
/wiki/Uzbekistan
/wiki/Saudi_Arabia
/wiki/Malaysia
/wiki/Peru
/wiki/Venezuela
/wiki/Afghanistan
/wiki/Ghana
/wiki/Angola
/wiki/Nepal
/wiki/Yemen
/wiki/Mozambique
/wiki/Ivory_Coast
/wiki/North_Korea
/wiki/Australia
/wiki/Madagascar
/wiki/Cameroon
/wiki/Taiwan
/wiki/Niger
/wiki/Sri_Lanka
/wiki/Burkina_Faso
/wiki/Mali
/wiki/Romania
/wiki/Chile
/wiki/Syria
/wiki/Kazakhstan
/wiki/Guatemala
/wiki/Malawi
/wiki/Zambia
/wiki/N

In [39]:
def getAdditionalDetails(url):
    """Read selected infobox values from a Wikipedia article.

    The article at ``'https://en.wikipedia.org' + url`` is fetched with
    ``getHTMLContent`` and its ``infobox geography vcard`` table is scanned
    row by row. Collection starts at a ``mergedtoprow`` row whose first link
    reads 'Area', or reads 'GDP' with a first ``<span>`` of '(nominal)'.
    While collecting, the first ``<td>`` text of each following ``mergedrow``
    or ``mergedbottomrow`` row is stored; collection stops after the first
    such row whose ``<div>`` label is neither '•\\xa0Total area' nor
    '•\\xa0Total'.

    Parameters
    ----------
    url : str
        Site-relative article path, e.g. ``'/wiki/China'``.

    Returns
    -------
    list of str
        The collected cell texts in document order. If anything raises while
        fetching or scanning, the error is printed and ``[]`` is returned.
    """
    try:
        page = getHTMLContent('https://en.wikipedia.org' + url)
        infobox = page.find('table', {'class': 'infobox geography vcard'})
        collected = []
        collecting = False
        for tr in infobox.find_all('tr'):
            row_class = tr.get('class')
            if row_class == ['mergedtoprow'] and not collecting:
                # Section heading row: decide whether the rows below it
                # belong to a section we want.
                anchor = tr.find('a')
                heading = anchor.get_text().strip() if anchor else None
                if heading == 'Area':
                    collecting = True
                elif heading == 'GDP':
                    # Only the nominal GDP section is wanted.
                    if tr.find('span').get_text().strip() == '(nominal)':
                        collecting = True
                elif heading == 'Population':
                    collecting = False
            elif (row_class == ['mergedrow'] or row_class == ['mergedbottomrow']) and collecting:
                collected.append(tr.find('td').get_text().strip('\n'))
                row_label = tr.find('div').get_text().strip()
                # A "total" row is followed by one more row of interest;
                # any other label ends the section.
                if row_label not in ('•\xa0Total area', '•\xa0Total'):
                    collecting = False
        return collected
    except Exception as error:
        print(f'Error occured: {error}')
        return []

In [40]:
# Build one record per country: the text of every cell in its table row,
# followed by the values returned by getAdditionalDetails for its article.
data_content = []
for row in rows:
    cells = row.find_all('td')
    if len(cells) <= 1:
        # No country cell in this row (e.g. the heading row made of <th>).
        continue
    print(cells[1].get_text())
    country_link = cells[1].find('a')
    if country_link is None:
        # No article link to follow; skip instead of failing on .get().
        continue
    country_info = [cell.text.strip('\n') for cell in cells]
    additional_details = getAdditionalDetails(country_link.get('href'))
    # Keep a country only when exactly four extra values were returned, so
    # every record lines up with the four extra column headings below.
    if len(additional_details) == 4:
        country_info += additional_details
        data_content.append(country_info)

# Define column headings
headers = [header.get_text().strip('\n') for header in rows[0].find_all('th')]
headers += ['Total Area', 'Percentage Water', 'Total Nominal GDP', 'Per Capita GDP']

# Passing the headings to the constructor keeps the columns in place even
# when no country was collected (assigning .columns afterwards would raise
# a length-mismatch error on an empty frame).
dataset = pd.DataFrame(data_content, columns=headers)

drop_columns = ['Rank', 'Date', 'Source']
dataset = dataset.drop(columns=drop_columns)

# Save first, so a problem with the preview can never prevent the export.
dataset.to_csv("Dataset.csv", index = False)

# Preview up to three random rows; as the last expression of the cell it is
# actually displayed, and the min() avoids the ValueError that sample(3)
# raises on a frame with fewer than three rows.
dataset.sample(min(3, len(dataset)))

 China[Note 2]

 India[Note 3]

 United States[Note 4]

 Indonesia

 Brazil

 Pakistan

 Nigeria

 Bangladesh

 Russia[Note 5]

 Mexico

 Japan

 Philippines

 Egypt

 Ethiopia

 Vietnam

 Democratic Republic of the Congo

 Germany

 Iran

 Turkey

 France[Note 6]

 Thailand

 United Kingdom[Note 7]

 Italy

 South Africa

 Tanzania[Note 8]

 Myanmar

 Kenya

 South Korea

 Colombia

 Spain

 Argentina

 Algeria

 Ukraine[Note 9]

 Sudan

 Uganda

 Iraq

 Poland

 Canada

 Morocco[Note 10]

 Uzbekistan

 Saudi Arabia

 Malaysia

 Peru

 Venezuela

 Afghanistan

 Ghana

 Angola

 Nepal

 Yemen

 Mozambique

 Ivory Coast

 North Korea

 Australia

 Madagascar

 Cameroon

 Taiwan[Note 11]

 Niger

 Sri Lanka

 Burkina Faso

 Mali

 Romania

 Chile

 Syria

 Kazakhstan

 Guatemala

 Malawi

 Zambia

 Netherlands

 Ecuador

 Cambodia

 Senegal

 Chad

 Somalia[Note 12]

 Zimbabwe

 South Sudan

 Rwanda

 Guinea

 Benin

 Haiti

 Tunisia

 Bolivia

 Belgium

 Cuba

 Burundi

 Greece

 Czech 