# Import Libraries
Importing the required libraries is the first step of any project.
We're going to import commonly used web scraping and data analysis libraries. Make sure all of them are installed.

In [1]:
import requests # for getting web contents
from bs4 import BeautifulSoup # for scraping web contents
import pandas as pd # for data analysis

# URL
For web scraping, we first need the URL of the page that holds the data.

In [2]:
# address of the page whose table we want to scrape
URL = 'https://www.worldometers.info/gdp/gdp-by-country/'

# Download the page. The timeout keeps this cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# (4xx/5xx) instead of letting an error page be parsed as if it were the data.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# parse the downloaded HTML
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# The targeted table body is the last <tbody> on the page; take all its rows.
country_rows = soup.find_all('tbody')[-1].find_all('tr')

urls = []
countries = []

for row in country_rows:
    # The second cell of each row supplies both the country name (its text)
    # and the link to follow (the href of the <a> inside it).
    name_cell = row.find_all(['th', 'td'])[1]
    name = name_cell.get_text()
    link = name_cell.find('a')['href']
    countries.append(name)
    urls.append(link)

In [4]:
# function to scrape values from each url
def url_scrraper(url, country, timeout=30):
    """Scrape the first table head and body of a page into the notebook lists.

    Downloads ``url`` and reads the first ``<thead>`` and the first
    ``<tbody>`` found in the page. Each header row is appended to the global
    list ``headings`` with the label ``'Country'`` inserted at index 1, and
    each body row is appended to the global list ``content`` with ``country``
    inserted at index 1. Cell texts are stripped of surrounding whitespace.

    Parameters
    ----------
    url : str
        Address of the page to download.
    country : str
        Value inserted as the second field of every body row.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
        Results are delivered only by appending to the module-level lists
        ``headings`` and ``content``, which must exist before the call.

    Raises
    ------
    requests.RequestException
        On a network failure, a timeout, or an HTTP error status.
    IndexError
        If the page contains no ``<thead>`` or no ``<tbody>``.
    """
    page = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than parsing an error page as data.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # Header rows: only <th> cells are read, because they hold the column names.
    html_thead = soup.find_all('thead')[0]
    for tr in html_thead.find_all('tr'):
        row = [cell.text.strip() for cell in tr.find_all(['th'])]
        # keep header and body rows aligned: both get an extra field at index 1
        row.insert(1, 'Country')
        headings.append(row)

    # Body rows: read both <th> and <td> cells.
    html_tbody = soup.find_all('tbody')[0]
    for tr in html_tbody.find_all('tr'):
        row = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
        row.insert(1, country)
        content.append(row)

In [5]:
# site root that is prefixed to every link collected earlier
base_url = 'https://www.worldometers.info'
content = []

for link, country_name in zip(urls, countries):
    # start from an empty header list on every pass, so that after the loop
    # `headings` holds only the header rows of the last page scraped
    headings = []
    # a list-valued link contributes only its first entry
    path = link[0] if type(link) is list else link
    url = base_url + path
    url_scrraper(url, country_name)

In [6]:
# Build the DataFrame from a shallow copy of the scraped rows, labelling the
# columns with the first header row left in `headings`.
column_names = headings[0]
data = pd.DataFrame(list(content), columns=column_names)

# Data Analysis

## Look at Example Records

In [7]:
# preview the first rows of the data (head() defaults to five rows)
data.head()

Unnamed: 0,Year,Country,GDP Nominal (Current USD),GDP Real (Inflation adj.),GDP change,GDP per capita,Pop. change,Population
0,2017,United States,"$19,485,394,000,000","$17,348,625,758,200",2.27%,"$53,366",0.64 %,325084756
1,2016,United States,"$18,707,189,000,000","$16,972,347,070,400",1.49%,"$52,543",0.67 %,323015995
2,2015,United States,"$18,219,297,000,000","$16,710,458,234,000",2.86%,"$52,077",0.69 %,320878310
3,2014,United States,"$17,521,747,000,000","$16,242,525,613,600",2.57%,"$50,969",0.72 %,318673411
4,2013,United States,"$16,784,851,000,000","$15,853,794,839,100",1.68%,"$50,107",0.75 %,316400538


## Descriptive Statistics

In [8]:
# Generate descriptive statistics for every column. All columns are still text
# (object dtype) at this point, so the summary shows count, unique, top and
# freq rather than mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,Year,Country,GDP Nominal (Current USD),GDP Real (Inflation adj.),GDP change,GDP per capita,Pop. change,Population
count,4326,4326,4326,4326,4326,4326,4326,4326
unique,24,189,4325,4308,1514,3658,655,4325
top,2017,Kiribati,"$234,648,370,497",$0,0.00%,$0,0.50 %,9323
freq,189,24,2,14,22,14,23,2


## Summary of data-type, columns, non-null values, memory usage.

In [9]:
# print each column's non-null count and dtype, plus overall memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4326 entries, 0 to 4325
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Year                        4326 non-null   object
 1   Country                     4326 non-null   object
 2   GDP Nominal (Current USD)   4326 non-null   object
 3   GDP Real  (Inflation adj.)  4326 non-null   object
 4   GDP change                  4326 non-null   object
 5   GDP per capita              4326 non-null   object
 6   Pop. change                 4326 non-null   object
 7   Population                  4326 non-null   object
dtypes: object(8)
memory usage: 270.5+ KB


## Column labels

In [10]:
# show the column labels of the data
data.columns

Index(['Year', 'Country', 'GDP Nominal (Current USD)',
       'GDP Real  (Inflation adj.)', 'GDP change', 'GDP per capita',
       'Pop. change', 'Population'],
      dtype='object')

# Data Cleaning

In [11]:
# look at the first ten rows to see which symbols need to be cleaned
data.head(10)

Unnamed: 0,Year,Country,GDP Nominal (Current USD),GDP Real (Inflation adj.),GDP change,GDP per capita,Pop. change,Population
0,2017,United States,"$19,485,394,000,000","$17,348,625,758,200",2.27%,"$53,366",0.64 %,325084756
1,2016,United States,"$18,707,189,000,000","$16,972,347,070,400",1.49%,"$52,543",0.67 %,323015995
2,2015,United States,"$18,219,297,000,000","$16,710,458,234,000",2.86%,"$52,077",0.69 %,320878310
3,2014,United States,"$17,521,747,000,000","$16,242,525,613,600",2.57%,"$50,969",0.72 %,318673411
4,2013,United States,"$16,784,851,000,000","$15,853,794,839,100",1.68%,"$50,107",0.75 %,316400538
5,2012,United States,"$16,197,007,000,000","$15,567,037,390,000",2.22%,"$49,570",0.79 %,314043885
6,2011,United States,"$15,542,582,000,000","$15,224,554,065,500",1.60%,"$48,862",0.83 %,311584047
7,2010,United States,"$14,992,052,000,000","$14,992,052,000,000",2.53%,"$48,516",0.88 %,309011475
8,2009,United States,"$14,418,739,000,000","$14,594,842,181,900",-2.78%,"$47,648",0.93 %,306307567
9,2008,United States,"$14,718,582,000,000","$15,011,490,541,400",-0.29%,"$49,464",0.96 %,303486012


## Rename Column Name

In [12]:
# Map the scraped column labels to shorter ones that state the unit.
# NOTE: 'GDP Real  (Inflation adj.)' contains two spaces, exactly as in the
# scraped header.
column_renames = {
    'GDP Nominal (Current USD)': 'GDP (in USD)',
    'GDP Real  (Inflation adj.)': 'GDP Real (in USD)',
    'GDP change': 'GDP change (%)',
    'GDP per capita': 'GDP per capita (in USD)',
    'Pop. change': 'Pop. change (%)',
}
data = data.rename(columns=column_renames)

## Remove unwanted symbols (such as $, % and thousands separators) from the numeric columns

In [13]:
# Remove formatting characters from the value columns.
# regex=False makes every pattern a literal string: '$' is a regex
# metacharacter (end-of-string anchor) and the default of `regex` is not the
# same in every pandas version, so being explicit keeps the result identical
# everywhere and avoids the FutureWarning some versions emit.
usd_columns = ['GDP (in USD)', 'GDP Real (in USD)', 'GDP per capita (in USD)']
percent_columns = ['GDP change (%)', 'Pop. change (%)']

# remove $ sign
for column in usd_columns:
    data[column] = data[column].str.replace('$', '', regex=False)

# remove thousand comma
for column in usd_columns + ['Population']:
    data[column] = data[column].str.replace(',', '', regex=False)

# remove % sign; some values are written like '0.64 %', so also strip the
# whitespace that would otherwise be left behind
for column in percent_columns:
    data[column] = data[column].str.replace('%', '', regex=False).str.strip()

# Save Data into CSV

In [14]:
# write the cleaned table to a CSV file, leaving out the row index
output_file = 'Country wise GDP from 1994 to 2017.csv'
data.to_csv(output_file, index=False)