# Import Libraries
Importing libraries is the first step of any project.
We're going to import commonly used web scraping and data analysis libraries. Make sure all of them are installed.

In [1]:
import requests # for getting web contents
from bs4 import BeautifulSoup # for scraping web contents
import pandas as pd # for data analysis

# URL
For web scraping, we first need the URL of the page that contains the data.

In [2]:
# address of the web page whose table we want to scrape
URL = 'https://www.worldometers.info/gdp/gdp-by-country/'

# download the page; the timeout keeps the request from hanging forever
# when the server never answers
page = requests.get(URL, timeout=30)

# stop here on an HTTP error (4xx/5xx) instead of parsing an error page
page.raise_for_status()

# parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# find the table
# our targeted table is the last one on the page

# take the last <thead>, because it may contain the headings (column names)
html_thead = soup.find_all('thead')[-1]

# all rows of the table head
html_tr = list(html_thead.find_all('tr'))

# one list per header row, holding the whitespace-stripped text of each <th>
headings = [
    [cell.text.strip() for cell in header_row.find_all(['th'])]
    for header_row in html_tr
]

In [4]:
# take the last <tbody>, i.e. the body of the targeted table
html_tbody = soup.find_all('tbody')[-1]

# all rows of the table body
html_text = list(html_tbody.find_all('tr'))

# one list per body row, holding the whitespace-stripped text of each
# <th>/<td> cell
content = [
    [cell.text.strip() for cell in body_row.find_all(['th', 'td'])]
    for body_row in html_text
]

In [5]:
# build the dataframe: scraped body rows become records and the first
# header row supplies the column names
column_labels = headings[0]
data = pd.DataFrame(content, columns=column_labels)

# Data Analysis

## Look at Example Records

In [6]:
# preview the first rows of the data (head() returns 5 rows by default)
data.head()

Unnamed: 0,#,Country,"GDP (nominal, 2017)",GDP (abbrev.),GDP growth,Population (2017),GDP per capita,Share of World GDP
0,1,United States,"$19,485,394,000,000",$19.485 trillion,2.27%,325084756,"$59,939",24.08%
1,2,China,"$12,237,700,479,375",$12.238 trillion,6.90%,1421021791,"$8,612",15.12%
2,3,Japan,"$4,872,415,104,315",$4.872 trillion,1.71%,127502725,"$38,214",6.02%
3,4,Germany,"$3,693,204,332,230",$3.693 trillion,2.22%,82658409,"$44,680",4.56%
4,5,India,"$2,650,725,335,364",$2.651 trillion,6.68%,1338676785,"$1,980",3.28%


## Descriptive Statistics

In [7]:
# generate descriptive statistics of the data; every column still holds text
# at this point, so the summary shows count, unique, top and freq rather than
# mean, std, min, quartiles and max
data.describe()

Unnamed: 0,#,Country,"GDP (nominal, 2017)",GDP (abbrev.),GDP growth,Population (2017),GDP per capita,Share of World GDP
count,189,189,189,189,189,189,189,189
unique,189,189,189,188,166,189,188,63
top,47,Slovenia,"$1,510,084,751",$314 billion,1.87%,96418,"$3,494",0.00%
freq,1,1,1,2,3,1,2,35


## Summary of data-type, columns, non-null values, memory usage.

In [8]:
# print a concise summary: column names, non-null counts, dtypes, memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189 entries, 0 to 188
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   #                    189 non-null    object
 1   Country              189 non-null    object
 2   GDP (nominal, 2017)  189 non-null    object
 3   GDP (abbrev.)        189 non-null    object
 4   GDP  growth          189 non-null    object
 5   Population (2017)    189 non-null    object
 6   GDP  per capita      189 non-null    object
 7   Share of World GDP   189 non-null    object
dtypes: object(8)
memory usage: 11.9+ KB


## Column labels

In [9]:
# get the column labels of the data
# (note that some scraped labels contain two consecutive spaces)
data.columns

Index(['#', 'Country', 'GDP (nominal, 2017)', 'GDP (abbrev.)', 'GDP  growth',
       'Population (2017)', 'GDP  per capita', 'Share of World GDP'],
      dtype='object')

# Data Cleaning

## Rename Column Name

In [10]:
# give the columns clearer names that state the unit of each value
# NOTE: 'GDP  growth' and 'GDP  per capita' are spelled with two spaces,
# exactly as they were scraped; a single space would not match
new_column_names = {
    '#': 'Rank',
    'GDP (nominal, 2017)': 'GDP(in US$)',
    'GDP  growth': 'GDP growth %',
    'GDP  per capita': 'GDP per capita(in US$)',
    'Share of World GDP': 'Share of World GDP %',
}
data = data.rename(columns=new_column_names)

In [11]:
# confirm the column labels after renaming
data.columns

Index(['Rank', 'Country', 'GDP(in US$)', 'GDP (abbrev.)', 'GDP growth %',
       'Population (2017)', 'GDP per capita(in US$)', 'Share of World GDP %'],
      dtype='object')

## Remove unwanted symbols (currency signs, percent signs and thousands separators)

In [12]:
# remove extra characters from columns

# for each column, the symbols to delete so that only the numeral remains:
# '$' (currency sign), '%' (percent sign), ',' (thousands separator)
symbols_to_strip = {
    'GDP(in US$)': ['$', ','],
    'GDP per capita(in US$)': ['$', ','],
    'GDP growth %': ['%'],
    'Share of World GDP %': ['%'],
    'Population (2017)': [','],
}

for column, symbols in symbols_to_strip.items():
    for symbol in symbols:
        # regex=False forces a literal match: '$' is a regex metacharacter
        # (end-of-string anchor) and the default of `regex` differs between
        # pandas versions, so relying on the default is not portable
        data[column] = data[column].str.replace(symbol, '', regex=False)

# Save Data into CSV

In [13]:
# write the cleaned table to a CSV file; index=False leaves out the
# dataframe's row index so only the scraped columns are stored
csv_file_name = 'GDP by Country in 2017.csv'
data.to_csv(csv_file_name, index=False)