# DSCI510 FINAL PROJECT by Dhandeep Suglani
***Data Collection through Webscraping***


Github link- https://github.com/dhand33p/USCDSCI510

---

**Sources**

https://www.worldometers.info/co2-emissions/co2-emissions-by-country/

https://www.iqair.com/us/world-most-polluted-countries

https://www.worldometers.info/gdp/gdp-per-capita/#:~:text=Gross%20Domestic%20Product%20(GDP)%20per,the%20Nominal%20GDP%20per%20capita


In [12]:
import requests
import bs4
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

### Most CO2 Emission Countries in percent share of world -Web Scraping
#### Data Cleaning  - Need to parse through the string and only select Country Name and percentage of CO2 Emission in Share of World

In [13]:
url1 = "https://www.worldometers.info/co2-emissions/co2-emissions-by-country/"
req = requests.get(url1).text
soup = BeautifulSoup(req, "html.parser")
results = soup.find('table').find_all("tr") # Parsing through the table, and selecting all table row
list_count = []
for i in range(0, len(results)):
    country = results[i].text
    list_count.append(country)
dict_list = {}
for i in range(1, len(list_count)):
    temp = list_count[i]
    name = temp[3:20]
    result = ''.join([x for x in name if not x.isdigit()])
    fin_wrd = (re.sub(r"[^a-zA-Z]+", ' ', result)).strip()
    per_share = float(temp[-7:-2].replace(" ", ""))
    dict_list[fin_wrd] = per_share

#len(list_count) # 210 Countries
df1 = pd.DataFrame([dict_list])
em = df1.T #switch rows and col
new = em.reset_index()
per_co2 = new.rename(columns={0: "CO2_emis_2016", "index" :"Country"})
per_co2

Unnamed: 0,Country,CO2_emis_2016
0,China,29.18
1,United States,14.02
2,India,7.09
3,Russia,4.65
4,Japan,3.47
...,...,...
204,Anguilla,0.00
205,Saint Helena,0.00
206,Saint Pierre,0.00
207,Faeroe Islands,0.00


### Most Polluted Countries Web Scraping
#### Need to select country name and latest pollution percentages in respect to WHO guidelines

In [14]:
url2 = "https://www.iqair.com/us/world-most-polluted-countries"
req2 = requests.get(url2).text
soup1 = BeautifulSoup(req2, "html.parser")
results1 = soup1.find('table').find("tbody")   # Fing the tbody section 
test = results1.find_all("tr")               # Select all table rows from the table 
list_cn = []
for i in range(0, len(test)):
    country = test[i].text
    list_cn.append(country)
 # 117 countries
dic_poll = {}
for i in range(1, len(list_cn)):
    temp = list_cn[i]
    char = ''.join([x for x in temp if not x.isdigit()])
    country = (re.sub(r"[^a-zA-Z]+", ' ', char)).strip()
    nums = re.sub(r"[^0-9.]+", ' ', temp).strip()
    list_nums = nums.split()
    if len(list_nums[1]) > 1 :
        dic_poll[country] = float(list_nums[1])
    elif list_nums[1] == ".":
        dic_poll[country] = 4.5
        
df1 = pd.DataFrame([dic_poll])
pol = df1.T #switch rows and col
new = pol.reset_index()
per_poll = new.rename(columns={0: "Poll_2021", "index" :"Country"})
per_poll.iloc[[88]]
per_poll.loc[88,['Country']] = "United States"
per_poll.iloc[88]
per_poll

Unnamed: 0,Country,Poll_2021
0,Chad,75.9
1,Pakistan,66.8
2,Tajikistan,59.4
3,India,58.1
4,Oman,53.9
...,...,...
109,Bonaire Saint Eustatius and Saba,5.1
110,Cape Verde,5.1
111,Puerto Rico,4.8
112,U S Virgin Islands,4.5


## GDP Per Capita BY Country
Need to select country name and its respective GDP Per Capita


In [15]:
url3 = "https://www.worldometers.info/gdp/gdp-per-capita/#:~:text=Gross%20Domestic%20Product%20(GDP)%20per,the%20Nominal%20GDP%20per%20capita."
req3 = requests.get(url3).text
soup2 = BeautifulSoup(req3, 'html.parser')
results2 = soup2.find('table').find_all("tr") # Parsing through the table, and selecting all table row
list_ct = []
for i in range(0, len(results2)):
    cry = results2[i].text
    list_ct.append(cry)

dict_l = {}
for i in range(1, len(list_ct)):
    temp = list_ct[i]
    ch = ''.join([x for x in temp if not x.isdigit()])
    country = (re.sub(r"[^a-zA-Z]+", ' ', ch)).strip()
    n = ''.join([x for x in temp if not x.isalpha()])
    list_nums = n.split()
    num = list_nums[1]
    cntry = country.replace(" N A N A", "")
    if len(num) >= 3:
        nu = float(num.replace("$", '').replace(',', '')) 
    dict_l[cntry] = nu
        

df1 = pd.DataFrame([dict_l])
pol = df1.T #switch rows and col
indqw = pol.reset_index()
gdp = indqw.rename(columns={0: "GDP_Per_Cap", "index" :"Country"})
gdp

Unnamed: 0,Country,GDP_Per_Cap
0,Qatar,128647.0
1,Macao,115367.0
2,Luxembourg,107641.0
3,Singapore,94105.0
4,Brunei,79003.0
...,...,...
185,Northern Mariana Islands,727.0
186,Andorra,727.0
187,Guam,727.0
188,Cuba,727.0


### Combining two dataframes into one and into CSV file - merged them based on country name

In [16]:
all_col = pd.merge(per_co2, per_poll, on="Country")
alcnt = pd.merge(all_col,gdp,  on = "Country")
alcnt.to_csv('cntryinfo.csv', index=False)
alcnt.head()

Unnamed: 0,Country,CO2_emis_2016,Poll_2021,GDP_Per_Cap
0,China,29.18,32.6,16842.0
1,United States,14.02,10.3,59928.0
2,India,7.09,58.1,7166.0
3,Russia,4.65,12.3,25763.0
4,Japan,3.47,9.1,42067.0
