## Data Collection

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas.io.html import read_html

### 1. GDP Per Capita

In [2]:
url_1 = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita'

table_1 = read_html(url_1, index_col = 0, attrs={'class': 'wikitable sortable'})

In [3]:
df1 = table_1[0]
df1.set_index('Country/Territory', inplace = True)

### 2. Population / Land Area / Migrants / Fertility Rate / Median Age / Urban Population

In [4]:
url_2 = 'https://www.worldometers.info/world-population/population-by-country/'
table_id = 'example2'

response = requests.get(url_2)
soup = BeautifulSoup(response.text, 'html.parser')

table_2 = soup.find('table', attrs={'id': table_id})
df2 = pd.read_html(str(table_2))
df2 = df2[0]
df2.drop(columns = ['#', 'Yearly Change', 'Net Change', 'Density (P/Km²)', 'World Share'], inplace=True)
df2.set_index('Country (or dependency)', inplace = True)

### 3. Temperature

In [5]:
url_3 = 'https://en.wikipedia.org/wiki/List_of_countries_by_average_yearly_temperature'

table_3 = read_html(url_3, index_col = 0, attrs={'class': 'wikitable sortable'})

In [6]:
df3 = table_3[0]
df3.rename(columns = {'Average yearly temperature (1961–1990, degrees Celsius)': 'average temperature'}, inplace = True)

### 4. Global Peace Index

In [7]:
url_4 = 'https://en.wikipedia.org/wiki/Global_Peace_Index'

table_4 = read_html(url_4, index_col = 0, attrs = {'class': 'wikitable sortable'})

In [8]:
df4 = table_4[0]
df4 = df4[['2019 rank']]
df4.rename(columns = {'2019 rank': 'peace_rk'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### 5. Corruption Index

In [9]:
url_5 = 'https://tradingeconomics.com/country-list/corruption-rank'

table_5 = read_html(url_5, index_col = 0, attrs = {'class': 'table table-hover'} )

In [10]:
df5 = table_5[0]
df5 = df5[['Last']]
df5.rename(columns = {'Last': 'corrupt_rk'}, inplace = True)

### 6. Hapiness Score

In [11]:
url_6 = 'https://en.wikipedia.org/wiki/World_Happiness_Report'

table_6 = read_html(url_6, index_col = 0, attrs = {'class': 'wikitable sortable'})

In [12]:
df6 = table_6[0]
df6 = df6[['Country or region', 'Score']]
df6.rename(columns = {'Score': 'hapiness_score'}, inplace = True)
df6.set_index('Country or region', inplace = True)

### 7. Alcohol Consumption

In [13]:
url_7 = 'https://en.wikipedia.org/wiki/List_of_countries_by_alcohol_consumption_per_capita'
table_id2 = 'WHO2010'

response = requests.get(url_7)
soup = BeautifulSoup(response.text, 'html.parser')

table_7 = soup.find('table', attrs={'id': table_id2})
df7 = pd.read_html(str(table_7))
df7 = df7[0]
df7 = df7[['Country', 'Total']]
df7.rename(columns = {'Total': 'alcohol_consumption'}, inplace=True)
df7.set_index('Country', inplace = True)

### 8. Life Expectancy

In [14]:
url_8 = 'https://www.worldometers.info/demographics/life-expectancy/'
table_id3 = 'example2'

response = requests.get(url_8)
soup = BeautifulSoup(response.text, 'html.parser')

table_8 = soup.find('table', attrs={'id': table_id3})
df8 = pd.read_html(str(table_8))
df8 = df8[0]
df8 = df8[['Country', 'Life Expectancy (both sexes)']]
df8.rename(columns = {'Life Expectancy (both sexes)': 'life_exp'}, inplace = True)
df8.set_index('Country', inplace = True)

### 9. Years of Schooling / Education Index

In [15]:
url_9 = 'https://en.wikipedia.org/wiki/Education_Index'

table_9 = read_html(url_9, index_col = 0, attrs = {'class': 'wikitable sortable mw-collapsible'})

In [16]:
df9 = table_9[0]
df9 = df9[['Country', 'EducationIndex', 'Mean years of schooling']]
df9.rename(columns = {'EducationIndex': 'educ_index', 'Mean years of schooling': 'school_mean'}, inplace = True)
df9.set_index('Country', inplace = True)

### 10. Railway Network

In [17]:
url_10 = 'https://en.wikipedia.org/wiki/List_of_countries_by_rail_transport_network_size'

table_10 = read_html(url_10, index_col = 0, attrs = {'class': 'wikitable sortable'})

In [18]:
df10 = table_10[0]

In [19]:
df10 = df10[['Country', 'Length(km)']]
df10.rename(columns = {'Length(km)': 'railway_len'}, inplace = True)
df10.set_index('Country', inplace = True)

### 11. Road Network

In [20]:
url_11 = 'https://en.wikipedia.org/wiki/List_of_countries_by_road_network_size'

table_11 = read_html(url_11, index_col = 0, attrs = {'class': 'wikitable sortable'})

In [21]:
df11 = table_11[0]
df11.columns = [''] * len(df11.columns)
df11.columns = ['country', 'roads', 'expressways', 'date']
df11.rename(columns = {'expressways': 'highway_len'}, inplace = True)
df11 = df11[['country', 'highway_len']]
df11.set_index('country', inplace = True)

## Combining the Data

In [22]:
all_df = [df2, df3, df4, df5, df6, df7, df8, df9, df10, df11]

df = df1.join(all_df, how = 'outer')

In [23]:
df.to_excel('capita.xlsx')