In [149]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Web Scraping

- **Spletna stran:** https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
- **Tabela:** Sovereign states and dependencies by population        

### Naloga

Tabelo na zgornji strani pretvorite v urejen pandas dataframe, ki vsebuje naslednje stolpce (pozor na ustrezen tip in index):
- Rank: (Index) - int
- Country name: - object
- Population - int
- Date - Datetime
- % of world population - int


In [150]:
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population')

In [151]:
len(tables)

2

In [152]:
sov_states = tables[0]

In [153]:
sov_states.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
Rank                               240 non-null object
Country(or dependent territory)    240 non-null object
Population                         240 non-null int64
Date                               240 non-null object
% of worldpopulation               240 non-null object
Source                             240 non-null object
dtypes: int64(1), object(5)
memory usage: 11.3+ KB


In [154]:
sov_states.head()

Unnamed: 0,Rank,Country(or dependent territory),Population,Date,% of worldpopulation,Source
0,1,China[Note 2],1397600000,"June 2, 2019",18.1%,Official population clock
1,2,India[Note 3],1347960000,"June 2, 2019",17.5%,Official population clock
2,3,United States[Note 4],329306000,"June 2, 2019",4.27%,Official population clock
3,4,Indonesia,268074600,"July 1, 2019",3.48%,Official annual projection
4,5,Brazil,209985000,"June 2, 2019",2.72%,Official population clock


In [155]:
# Nekatere države imajo enak Rank, druga navedena država bo imela vrednost --
sov_states['Rank'].replace('–', np.nan, inplace=True)

In [156]:
# Naredimo, da imata obe državi isti Rank
sov_states['Rank'].fillna(method='ffill', inplace=True)

In [157]:
# Spremenimo Rank iz object v int
sov_states['Rank'] = sov_states['Rank'].astype('int16')

In [158]:
# Spremenimo Date iz object v datetime
sov_states['Date'] = pd.to_datetime(sov_states['Date'])

In [159]:
# % vsebuje tudi znak %, ki ga je potrebno odstraniti, če želimo spremeniti v int
def clean_per(item):
    return item[:item.find('%')] 


In [160]:
sov_states['% of worldpopulation'] = sov_states['% of worldpopulation'].apply(clean_per)

In [161]:
sov_states['% of worldpopulation'] = sov_states['% of worldpopulation'].astype('float16').astype('int16')

In [162]:
sov_states.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
Rank                               240 non-null int16
Country(or dependent territory)    240 non-null object
Population                         240 non-null int64
Date                               240 non-null datetime64[ns]
% of worldpopulation               240 non-null int16
Source                             240 non-null object
dtypes: datetime64[ns](1), int16(2), int64(1), object(2)
memory usage: 8.5+ KB


In [163]:
sov_states.set_index('Rank', inplace=True)
sov_states.drop('Source', axis=1, inplace=True)


In [165]:
sov_states.rename(columns={'Country(or dependent territory)': 'Country'}, inplace=True)

In [166]:
sov_states.head()

Unnamed: 0_level_0,Country,Population,Date,% of worldpopulation
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,China[Note 2],1397600000,2019-06-02,18
2,India[Note 3],1347960000,2019-06-02,17
3,United States[Note 4],329306000,2019-06-02,4
4,Indonesia,268074600,2019-07-01,3
5,Brazil,209985000,2019-06-02,2


In [146]:
def clean_br(item):
    return item[:item.find('[')] 

In [167]:
sov_states['Country'] = sov_states['Country'].apply(clean_br)

In [168]:
sov_states.head()

Unnamed: 0_level_0,Country,Population,Date,% of worldpopulation
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,China,1397600000,2019-06-02,18
2,India,1347960000,2019-06-02,17
3,United States,329306000,2019-06-02,4
4,Indonesi,268074600,2019-07-01,3
5,Brazi,209985000,2019-06-02,2
