# Mashing up data sources with ChatGPT

## Import Modules

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Load the list of countries first

In [27]:
# Configure the URL to crawl
url = "https://www.scrapethissite.com/pages/simple/"

# Load the raw HTML. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error response into
# a visible exception instead of quietly exiting the session.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Name the parser explicitly so the parse result does not depend on which
# optional parser packages happen to be installed in the environment.
soup = BeautifulSoup(response.text, "html.parser")
countries = soup.find_all("div", class_="country")

# Fail loudly if the page layout changed and nothing matched; otherwise every
# later cell would run on an empty table.
if not countries:
    raise RuntimeError(f"No country entries found at {url}")

# Ready for takeoff...
# One record per country block; every value is kept as the stripped text
# found in the page (i.e. all fields are strings at this point).
data = [
    {
        "country": country.find("h3", class_="country-name").text.strip(),
        "capital": country.find("span", class_="country-capital").text.strip(),
        "population": country.find("span", class_="country-population").text.strip(),
        "area_km2": country.find("span", class_="country-area").text.strip(),
    }
    for country in countries
]

# Alphabetical order by country name; `data` is reused later to build the prompt.
data = sorted(data, key=lambda d: d['country'])

df_scrap = pd.DataFrame(data)
df_scrap

Unnamed: 0,country,capital,population,area_km2
0,Afghanistan,Kabul,29121286,647500.0
1,Albania,Tirana,2986952,28748.0
2,Algeria,Algiers,34586184,2381740.0
3,American Samoa,Pago Pago,57881,199.0
4,Andorra,Andorra la Vella,84000,468.0
...,...,...,...,...
245,Western Sahara,Laâyoune / El Aaiún,273008,266000.0
246,Yemen,Sanaa,23495361,527970.0
247,Zambia,Lusaka,13460305,752614.0
248,Zimbabwe,Harare,11651858,390580.0


## Now try to enrich that data with another source...
...maybe with ChatGPT?

### Generate the prompt to ask ChatGPT

In [28]:
# Comma-separated country names, in the same order as the scraped records
country_names = ", ".join(entry["country"] for entry in data)

# The prompt is assembled line by line; the wording (including trailing
# spaces) is kept exactly as it was sent to ChatGPT.
prompt_lines = [
    "Could you please generate a file in CSV format with further infomation about the following countries? ",
    "Please add these columns: ",
    "- country: the name of the country itself",
    "- gnp: gross national product",
    "- unemployment_rate: the rate of unemployed but employable people in percent",
    "- lat: Latitude",
    "- lon: Longitude",
    '- languages: a list of official languages separated by ";"',
    "- number_airports: the number of official airports",
    "- highest_elevation: the highest point in meter",
    "",
    f"And here the list of countries needed for the CSV file: {country_names}",
]

print("\n".join(prompt_lines))

Could you please generate a file in CSV format with further infomation about the following countries? 
Please add these columns: 
- country: the name of the country itself
- gnp: gross national product
- unemployment_rate: the rate of unemployed but employable people in percent
- lat: Latitude
- lon: Longitude
- languages: a list of official languages separated by ";"
- number_airports: the number of official airports
- highest_elevation: the highest point in meter

And here the list of countries needed for the CSV file: Afghanistan, Albania, Algeria, American Samoa, Andorra, Angola, Anguilla, Antarctica, Antigua and Barbuda, Argentina, Armenia, Aruba, Australia, Austria, Azerbaijan, Bahamas, Bahrain, Bangladesh, Barbados, Belarus, Belgium, Belize, Benin, Bermuda, Bhutan, Bolivia, Bonaire, Bosnia and Herzegovina, Botswana, Bouvet Island, Brazil, British Indian Ocean Territory, British Virgin Islands, Brunei, Bulgaria, Burkina Faso, Burundi, Cambodia, Cameroon, Canada, Cape Verde, Cayma

### Take the file and proceed...
ChatGPT replies very politely, but sadly only with mocked data:

"To generate a CSV file with the additional information for the list of countries you've provided, I will create mock data for the following columns: GNP, unemployment rate, latitude (lat), longitude (lon), languages, number of airports, and highest elevation. Since it isn't possible to pull real-time data directly from sources, I'll make up some representative data for this task.

Let me start creating this file for you.

The CSV file with the requested information about the countries is ready. You can download it using the link below:

Download the CSV file"

### Load the CSV file and merge both

In [29]:
# Read the CSV file produced by ChatGPT and preview it
csv_path = "web_scraping/data/countries_info.csv"
df_csv = pd.read_csv(csv_path)
df_csv

Unnamed: 0,country,gnp,unemployment_rate,lat,lon,languages,number_airports,highest_elevation
0,Afghanistan,1.5,1,45.1,-79.9,Spanish;Portuguese,1,550
1,Albania,3.0,2,45.2,-79.8,English;French,2,600
2,Algeria,4.5,3,45.3,-79.7,Spanish;Portuguese,3,650
3,American Samoa,6.0,4,45.4,-79.6,English;French,4,700
4,Andorra,7.5,5,45.5,-79.5,Spanish;Portuguese,5,750
...,...,...,...,...,...,...,...,...
245,Western Sahara,369.0,6,69.6,-55.4,English;French,6,12800
246,Yemen,370.5,7,69.7,-55.3,Spanish;Portuguese,7,12850
247,Zambia,372.0,8,69.8,-55.2,English;French,8,12900
248,Zimbabwe,373.5,9,69.9,-55.1,Spanish;Portuguese,9,12950


In [30]:
# Left join: keep every scraped country and attach the CSV columns where the
# country name matches. The join key is named once, and validate="many_to_one"
# makes pandas raise if the CSV lists a country more than once, which would
# otherwise silently duplicate rows in the result.
df_full = pd.merge(
    df_scrap,
    df_csv,
    how="left",
    on="country",
    validate="many_to_one",
)
df_full

Unnamed: 0,country,capital,population,area_km2,gnp,unemployment_rate,lat,lon,languages,number_airports,highest_elevation
0,Afghanistan,Kabul,29121286,647500.0,1.5,1,45.1,-79.9,Spanish;Portuguese,1,550
1,Albania,Tirana,2986952,28748.0,3.0,2,45.2,-79.8,English;French,2,600
2,Algeria,Algiers,34586184,2381740.0,4.5,3,45.3,-79.7,Spanish;Portuguese,3,650
3,American Samoa,Pago Pago,57881,199.0,6.0,4,45.4,-79.6,English;French,4,700
4,Andorra,Andorra la Vella,84000,468.0,7.5,5,45.5,-79.5,Spanish;Portuguese,5,750
...,...,...,...,...,...,...,...,...,...,...,...
245,Western Sahara,Laâyoune / El Aaiún,273008,266000.0,369.0,6,69.6,-55.4,English;French,6,12800
246,Yemen,Sanaa,23495361,527970.0,370.5,7,69.7,-55.3,Spanish;Portuguese,7,12850
247,Zambia,Lusaka,13460305,752614.0,372.0,8,69.8,-55.2,English;French,8,12900
248,Zimbabwe,Harare,11651858,390580.0,373.5,9,69.9,-55.1,Spanish;Portuguese,9,12950


## Proceed with the data however you want

In [37]:
# Export the merged table as JSON and XML.
# orient="records" never writes the index, so no `index` argument is passed:
# older pandas releases reject index=False for this orient with a ValueError.
df_full.to_json("web_scraping/data/countries_full.json", orient="records")
# One <country> element per row inside a <countries> root, without the index.
df_full.to_xml("web_scraping/data/countries_full.xml", root_name="countries", row_name="country", index=False)