#### Import dependencies

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import time
from splinter import Browser
import re

#### Data to be extracted:

##### people and society
     population
##### energy
     consumption
     fossil fuels
     nuclear fuels
     hydroelectric
     other renewable sources
     carbondioxide emissions from consumption of energy



In [2]:
executable_path = {"executable_path": "chromedriver.exe"}
browser = Browser("chrome", **executable_path, headless=False)

#### Extract population data

In [3]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/335rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [4]:
country_name = []
country_pop = []
population_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"

    country_name.append(name)

    population = i.findAll("td")[2].text
    population = "".join(population.split(","))
    country_pop.append(float(population))

    rank = i.findAll("td")[0].text
    population_rank.append(rank)

population_df = pd.DataFrame({
    "name": country_name,
    "population": country_pop,
    "population_rank": population_rank
})


population_df.head()

Unnamed: 0,name,population,population_rank
0,China,1384689000.0,1
1,India,1296834000.0,2
2,United States,329256500.0,3
3,Indonesia,262787400.0,4
4,Brazil,208846900.0,5


#### Extract electricity consumption data

In [5]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/253rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [6]:
country_name = []
country_econsumption = []
econsumption_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"
    country_name.append(name)

    econsumption = i.findAll("td")[2].text
    econsumption = "".join(econsumption.split(","))
    country_econsumption.append(float(econsumption))

    rank = i.findAll("td")[0].text
    econsumption_rank.append(rank)

econsumption_df = pd.DataFrame({
    "name": country_name,
    "econsumption": country_econsumption,
    "econsumption_rank": econsumption_rank
})

econsumption_df.head()

Unnamed: 0,name,econsumption,econsumption_rank
0,China,5564000000000.0,1
1,United States,3902000000000.0,2
2,India,1137000000000.0,3
3,Japan,943700000000.0,4
4,Russia,909600000000.0,5


#### Extract electricity from fossil fuel data

In [7]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/257rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [8]:
country_name = []
country_fossil = []
fossil_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"
    country_name.append(name)

    fossil = i.findAll("td")[2].text
    country_fossil.append(float(fossil))

    rank = i.findAll("td")[0].text
    fossil_rank.append(rank)

fossil_df = pd.DataFrame({
    "name": country_name,
    "fossil": country_fossil,
    "fossil_rank": fossil_rank
})

fossil_df.head()

Unnamed: 0,name,fossil,fossil_rank
0,Bahamas,100.0,1
1,Bahrain,100.0,2
2,Bermuda,100.0,3
3,Botswana,100.0,4
4,Brunei,100.0,5


#### Extract electricity from nuclear fuels data

In [9]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/258rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [10]:
country_name = []
country_nuclear = []
nuclear_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    name = i.find("td", class_="region").find("a").text
    country_name.append(name)
    
    nuclear = i.findAll("td")[2].text
    country_nuclear.append(float(nuclear))
    
    rank = i.findAll("td")[0].text
    nuclear_rank.append(rank)

nuclear_df = pd.DataFrame({
    "name": country_name,
    "nuclear": country_nuclear,
    "nuclear_rank": nuclear_rank
})

nuclear_df.head()

Unnamed: 0,name,nuclear,nuclear_rank
0,France,50.0,1
1,Belgium,28.0,2
2,Slovakia,27.0,3
3,Ukraine,23.0,4
4,Hungary,22.0,5


#### Extract electricity from hydroelectric plants data

In [11]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/259rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [12]:
country_name = []
country_hydro = []
hydro_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"
    country_name.append(name)

    hydro = i.findAll("td")[2].text
    country_hydro.append(float(hydro))

    rank = i.findAll("td")[0].text
    hydro_rank.append(rank)

hydro_df = pd.DataFrame({
    "name": country_name,
    "hydroelectric": country_hydro,
    "hydro_rank": hydro_rank
})

hydro_df.head()

Unnamed: 0,name,hydroelectric,hydro_rank
0,Lesotho,100.0,1
1,Bhutan,99.0,2
2,Paraguay,99.0,3
3,CD,98.0,4
4,Albania,95.0,5


#### Extract electricity from other renewable sources data

In [13]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/260rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [14]:
country_name = []
country_other = []
other_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"
    country_name.append(name)

    other = i.findAll("td")[2].text
    country_other.append(float(other))

    rank = i.findAll("td")[0].text
    other_rank.append(rank)

other_df = pd.DataFrame({
    "name": country_name,
    "other": country_other,
    "other_rank": other_rank

})

other_df.head()

Unnamed: 0,name,other,other_rank
0,TL,100.0,1
1,Luxembourg,67.0,2
2,Denmark,54.0,3
3,Germany,52.0,4
4,Uruguay,42.0,5


#### Extract electricity from carbondioxide emissions from consumption of energy data

In [15]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/274rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [16]:
country_name = []
country_emissions = []
emissions_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"
    country_name.append(name)

    emissions = i.findAll("td")[2].text
    emissions = "".join(emissions.split(","))
    country_emissions.append(float(emissions))

    rank = i.findAll("td")[0].text
    emissions_rank.append(rank)

emissions_df = pd.DataFrame({
    "name": country_name,
    "emissions": country_emissions,
    "emissions_rank": emissions_rank
})

emissions_df.head()

Unnamed: 0,name,emissions,emissions_rank
0,China,11670000000.0,1
1,United States,5242000000.0,2
2,India,2383000000.0,3
3,Russia,1847000000.0,4
4,Japan,1268000000.0,5


#### Extract GDP data

In [17]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/208rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [18]:
country_name = []
country_gdp = []
gdp_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow: 
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"
    country_name.append(name)

    gdp = i.findAll("td")[2].text
    gdp = "".join(gdp.split(","))
    country_gdp.append(float(gdp[1:]))

    rank = i.findAll("td")[0].text
    gdp_rank.append(rank)

gdp_df = pd.DataFrame({
    "name": country_name,
    "gdp": country_gdp,
    "gdp_rank": gdp_rank
})

gdp_df.head()

Unnamed: 0,name,gdp,gdp_rank
0,China,23210000000000.0,1
1,United States,19490000000000.0,2
2,India,9474000000000.0,3
3,Japan,5443000000000.0,4
4,Germany,4199000000000.0,5


#### Extract GDP per capita data

In [19]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/211rank.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [20]:
country_name = []
country_gdppc = []
gdppc_rank = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow: 
    name = i.find("td", class_="region").find("a").text
    if name == "Congo, Democratic Republic of the":
        name = "CD"
    if name == "Congo, Republic of the":
        name = "CG"
    if name == "Cote d'Ivoire":
        name = "CI"
    if name == "South Sudan":
        name = "SS"
    if name == "Svalbard":
        name = "Svalbard and Jan Mayen"
    if name == "Burma":
        name = "Myanmar"
    if name == "Korea, South":
        name = "South Korea"
    if name == "Korea, North":
        name = "North Korea"
    if name == "Bahamas, The":
        name = "Bahamas"
    if name == "Czechia":
        name = "CZ"
    if name == "Kosovo":
        name = "XK"
    if name == "Eswatini":
        name = "SZ"
    if name == "Timor-Leste":
        name = "TL"
    country_name.append(name)

    gdppc = i.findAll("td")[2].text
    gdppc = "".join(gdppc.split(","))
    country_gdppc.append(float(gdppc[1:]))

    rank = i.findAll("td")[0].text
    gdppc_rank.append(rank)

gdppc_df = pd.DataFrame({
    "name": country_name,
    "gdppc": country_gdppc,
    "gdppc_rank": gdppc_rank
})

gdppc_df.head()

Unnamed: 0,name,gdppc,gdppc_rank
0,Liechtenstein,139100.0,1
1,Qatar,124100.0,2
2,Monaco,115700.0,3
3,Macau,110000.0,4
4,Luxembourg,105100.0,5


#### Extract coordinates data

In [21]:
url = 'https://www.cia.gov/library/publications/the-world-factbook/fields/277.html'
browser.visit(url)

time.sleep(2)

html = browser.html

soup = bs(html, "html.parser")

In [22]:
country_name = []
country_lat = []
country_lon = []

tbody = soup.find("tbody")
trow = tbody.findAll("tr")

for i in trow:
    lat_lon = []

    try:
        coord = i.find("div", {"id": "field-geographic-coordinates"})\
            .find("div", class_ = "category_data subfield text").text.strip()
        lat_lon = coord.split(",")

        if lat_lon[0][-1] == "S":
            lat = -float(lat_lon[0][:-2].replace(" ", "."))
        else:
            lat = float(lat_lon[0][:-2].replace(" ", "."))

        if lat_lon[1][-1] == "W":
            lon = -float(lat_lon[1][1:-2].replace(" ", "."))
        else:
            lon = float(lat_lon[1][1:-2].replace(" ", "."))

        name = i.find("td", class_="country").find("a").text
        if name == "Congo, Democratic Republic of the":
            name = "CD"
        if name == "Congo, Republic of the":
            name = "CG"
        if name == "Cote d'Ivoire":
            name = "CI"
        if name == "South Sudan":
            name = "SS"
        if name == "Svalbard":
            name = "Svalbard and Jan Mayen"
        if name == "Burma":
            name = "Myanmar"
        if name == "Korea, South":
            name = "South Korea"
        if name == "Korea, North":
            name = "North Korea"
        if name == "Bahamas, The":
            name = "Bahamas"
        if name == "Czechia":
            name = "CZ"
        if name == "Kosovo":
            name = "XK"
        if name == "Eswatini":
            name = "SZ"
        if name == "Timor-Leste":
            name = "TL"

        country_name.append(name)
        country_lat.append(lat)
        country_lon.append(lon)

    except:
        try:
            coord = i.find("div", {"id": "field-geographic-coordinates"})\
                .find("div", class_ = "category_data subfield text").text.strip()

            search = re.search(':(.+?);', coord).group()
            if search != None:
                try:
                    lat_lon = search[2:-1].split(",")

                    if lat_lon[0][-1] == "S":
                        lat = -float(lat_lon[0][:-2].replace(" ", "."))
                    else:
                        lat = float(lat_lon[0][:-2].replace(" ", "."))


                    if lat_lon[1][-1] == "W":
                        lon = -float(lat_lon[1][1:-2].replace(" ", "."))
                    else:
                        lon = float(lat_lon[1][1:-2].replace(" ", "."))

                    name = i.find("td", class_="country").find("a").text
                    if name == "Congo, Democratic Republic of the":
                        name = "CD"
                    if name == "Congo, Republic of the":
                        name = "CG"
                    if name == "Cote d'Ivoire":
                        name = "CI"
                    if name == "South Sudan":
                        name = "SS"
                    if name == "Svalbard":
                        name = "Svalbard and Jan Mayen"
                    if name == "Burma":
                        name = "Myanmar"
                    if name == "Korea, South":
                        name = "South Korea"
                    if name == "Korea, North":
                        name = "North Korea"
                    if name == "Bahamas, The":
                        name = "Bahamas"
                    if name == "Czechia":
                        name = "CZ"
                    if name == "Kosovo":
                        name = "XK"
                    if name == "Eswatini":
                        name = "SZ"
                    if name == "Timor-Leste":
                        name = "TL"

                    country_name.append(name)
                    country_lat.append(lat)
                    country_lon.append(lon)

                except:
                    print(f"Whoops! I'm stupid...")
            else:
                print(f"Whoops! I'm stupid...")
        except:
            print(f"Whoops! I'm stupid...")

coord_df = pd.DataFrame({
    "name": country_name,
    "lat": country_lat,
    "lon": country_lon
})

coord_df.head()

Whoops! I'm stupid...
Whoops! I'm stupid...


Unnamed: 0,name,lat,lon
0,Afghanistan,33.0,65.0
1,Akrotiri,34.37,32.58
2,Albania,41.0,20.0
3,Algeria,28.0,3.0
4,American Samoa,-14.2,-170.0


##### Merge data

In [32]:
merge1_df = population_df.merge(econsumption_df, how="left", on="name")
merge2_df = merge1_df.merge(fossil_df, how="left", on="name")
merge3_df = merge2_df.merge(nuclear_df, how="left", on="name")
merge4_df = merge3_df.merge(hydro_df, how="left", on="name")
merge5_df = merge4_df.merge(other_df, how="left", on="name")
merge6_df = merge5_df.merge(emissions_df, how="left", on="name")
merge7_df = merge6_df.merge(gdp_df, how="left", on="name")
merge8_df = merge7_df.merge(gdppc_df, how="left", on="name")
energy_df = merge8_df.merge(coord_df, how="left", on="name")
    
energy_df.head()

Unnamed: 0,name,population,population_rank,econsumption,econsumption_rank,fossil,fossil_rank,nuclear,nuclear_rank,hydroelectric,...,other,other_rank,emissions,emissions_rank,gdp,gdp_rank,gdppc,gdppc_rank,lat,lon
0,China,1384689000.0,1,5564000000000.0,1,62.0,124,2.0,25,18.0,...,18.0,47,11670000000.0,1,23210000000000.0,1,16700.0,105,35.0,105.0
1,India,1296834000.0,2,1137000000000.0,3,71.0,104,2.0,26,12.0,...,16.0,53,2383000000.0,3,9474000000000.0,3,7200.0,156,20.0,77.0
2,United States,329256500.0,3,3902000000000.0,2,70.0,111,9.0,18,7.0,...,14.0,65,5242000000.0,2,19490000000000.0,2,59800.0,19,38.0,-97.0
3,Indonesia,262787400.0,4,213400000000.0,20,85.0,71,0.0,110,9.0,...,6.0,99,540700000.0,12,3250000000000.0,7,12400.0,127,-5.0,120.0
4,Brazil,208846900.0,5,509100000000.0,8,17.0,197,1.0,28,64.0,...,18.0,46,513800000.0,13,3248000000000.0,8,15600.0,108,-10.0,-55.0


In [34]:
# Replace null values with "0"
energy_df.fillna(0, inplace=True)

In [35]:
energy_dict = energy_df.to_dict("list")

In [36]:
energy_dict

{'name': ['China',
  'India',
  'United States',
  'Indonesia',
  'Brazil',
  'Pakistan',
  'Nigeria',
  'Bangladesh',
  'Russia',
  'Japan',
  'Mexico',
  'Ethiopia',
  'Philippines',
  'Egypt',
  'Vietnam',
  'CD',
  'Iran',
  'Turkey',
  'Germany',
  'Thailand',
  'France',
  'United Kingdom',
  'Italy',
  'Myanmar',
  'Tanzania',
  'South Africa',
  'South Korea',
  'Spain',
  'Kenya',
  'Colombia',
  'Argentina',
  'Ukraine',
  'Sudan',
  'Algeria',
  'Uganda',
  'Iraq',
  'Poland',
  'Canada',
  'Afghanistan',
  'Morocco',
  'Saudi Arabia',
  'Malaysia',
  'Venezuela',
  'Peru',
  'Angola',
  'Uzbekistan',
  'Nepal',
  'Yemen',
  'Ghana',
  'Mozambique',
  'CI',
  'Madagascar',
  'Cameroon',
  'North Korea',
  'Taiwan',
  'Australia',
  'Sri Lanka',
  'Romania',
  'Niger',
  'Malawi',
  'Burkina Faso',
  'Syria',
  'Kazakhstan',
  'Mali',
  'Chile',
  'Netherlands',
  'Guatemala',
  'Ecuador',
  'Cambodia',
  'Zambia',
  'Chad',
  'Senegal',
  'Zimbabwe',
  'Rwanda',
  'Guinea',


In [38]:
energy_df.to_csv("stuff.csv",index=False)