In [1]:
import pandas as pd
DATA_PATH = "../datasets/"

Load world population for computations:

In [2]:
world_pop = pd.read_csv(DATA_PATH + "world_population.csv")
# Rename column to match naming of other tables
world_pop = world_pop.rename(columns = {'CCA3': 'Code'}) 

In [3]:
print(f"Data available for {len(world_pop['Code'].unique())} countries.")
world_pop.head()

Data available for 234 countries.


Unnamed: 0,Rank,Code,Country/Territory,Capital,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km²),Density (per km²),Growth Rate,World Population Percentage
0,36,AFG,Afghanistan,Kabul,Asia,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,1.0257,0.52
1,138,ALB,Albania,Tirana,Europe,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,0.9957,0.04
2,34,DZA,Algeria,Algiers,Africa,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,1.0164,0.56
3,213,ASM,American Samoa,Pago Pago,Oceania,44273,46189,51368,54849,58230,47818,32886,27075,199,222.4774,0.9831,0.0
4,203,AND,Andorra,Andorra la Vella,Europe,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,1.01,0.0


#### Temperatures

In [4]:
temperatures = pd.read_csv(DATA_PATH + "average-monthly-surface-temperature.csv")
# Rename columns based on their actual meaning explained on Kaggle
temperatures = temperatures.rename(columns = {'Average surface temperature': 'Monthly average surface temperature',
                                             'Average surface temperature.1': 'Yearly average surface temperature',
                                             'Day': 'Month', 'year': 'Year'})
temperatures['Month'] = pd.to_datetime(temperatures['Month']).dt.strftime("%m")

In [5]:
print(f"{temperatures.shape[0]} data points")
temperatures.head()

198900 data points


Unnamed: 0,Entity,Code,Year,Month,Monthly average surface temperature,Yearly average surface temperature
0,Afghanistan,AFG,1940,1,-2.032494,11.327695
1,Afghanistan,AFG,1940,2,-0.733503,11.327695
2,Afghanistan,AFG,1940,3,1.999134,11.327695
3,Afghanistan,AFG,1940,4,10.199754,11.327695
4,Afghanistan,AFG,1940,5,17.942135,11.327695


In [6]:
# Compute average monthly temperature for the last 10 years
temperatures_processed = temperatures[temperatures['Year'] > 2004]
temperatures_processed = temperatures_processed.groupby(['Code', 'Month'])["Monthly average surface temperature"].mean().reset_index()
temperatures_processed = temperatures_processed.rename(columns = {"Monthly average surface temperature": "Temperature"})
temperatures_processed.to_csv(DATA_PATH + 'processed/temperatures.csv')

In [7]:
print(temperatures_processed.shape)
temperatures_processed.head()

(2340, 3)


Unnamed: 0,Code,Month,Temperature
0,AFG,1,-1.049607
1,AFG,2,1.579556
2,AFG,3,7.712855
3,AFG,4,13.639239
4,AFG,5,19.318636


#### Inbound arrivals

In [8]:
inbound_arrivals = pd.read_csv(DATA_PATH + "23-international-tourist-trips-per-1000-people.csv")
print(inbound_arrivals.shape)
inbound_arrivals.head()

(4933, 4)


Unnamed: 0,Entity,Code,Year,Inbound arrivals (tourists) per 1000 people
0,Albania,ALB,2007,356.84418
1,Albania,ALB,2008,422.46985
2,Albania,ALB,2009,583.8489
3,Albania,ALB,2010,752.04175
4,Albania,ALB,2011,851.1856


In [9]:
# Compute average over last 5 available years, before COVID restrictions (2020)
inbound_arrivals_processed = inbound_arrivals[inbound_arrivals['Year'] < 2020].sort_values(
    by='Year', ascending=False).groupby('Code').head(5).groupby(
    'Code')['Inbound arrivals (tourists) per 1000 people'].mean().reset_index()
# Transform into absolute values to get country popularity
inbound_arrivals_processed = inbound_arrivals_processed.rename(
    columns = {"Inbound arrivals (tourists) per 1000 people": "Popularity"})
inbound_arrivals_processed = pd.merge(inbound_arrivals_processed, world_pop, how='inner', on='Code')
inbound_arrivals_processed['Popularity'] = inbound_arrivals_processed['Popularity'] * inbound_arrivals_processed['2022 Population'] / 1000
inbound_arrivals_processed = inbound_arrivals_processed[['Code', 'Popularity']].round()
inbound_arrivals_processed['Popularity'] = inbound_arrivals_processed['Popularity'].astype(int)
inbound_arrivals_processed.to_csv(DATA_PATH + "processed/popularity.csv")

In [10]:
inbound_arrivals_processed.sort_values(by='Popularity', ascending=False).head()

Unnamed: 0,Code,Popularity
60,FRA,87498049
189,USA,80166013
55,ESP,79783254
35,CHN,61819695
86,ITA,56588143


#### Trip budget

In [11]:
expenditures = pd.read_csv(DATA_PATH + "21-average-expenditures-of-international-tourists-domestically.csv")
print(expenditures.shape)
expenditures.head()

(1260, 4)


Unnamed: 0,Entity,Code,Year,Inbound Tourism Expenditure (adjusted for inflation and cost of living)
0,Australia,AUS,1995,12904206000
1,Australia,AUS,1996,13947016000
2,Australia,AUS,1997,14575643000
3,Australia,AUS,1998,14679026000
4,Australia,AUS,1999,16038053000


In [12]:
# Compute average over last 5 available years, before COVID restrictions (2020)
expenditures_processed = expenditures[expenditures['Year'] < 2020].sort_values(
    by='Year', ascending=False).groupby('Code').head(5).groupby(
    'Code')['Inbound Tourism Expenditure (adjusted for inflation and cost of living)'].mean().reset_index()
# Divide by absolute number of inbound arrivals to get average trip budget
expenditures_processed = expenditures_processed.rename(
    columns = {"Inbound Tourism Expenditure (adjusted for inflation and cost of living)": "Budget"})
expenditures_processed = pd.merge(expenditures_processed, absolute_arrivals, how='inner', on='Code')
expenditures_processed['Budget'] /= expenditures_processed['Popularity']
expenditures_processed = expenditures_processed[['Code', 'Budget']]
expenditures_processed.to_csv(DATA_PATH + "processed/budget.csv")

NameError: name 'absolute_arrivals' is not defined

In [None]:
print(expenditures_processed.shape)
expenditures_processed.head()

#### Hotel occupancy

In [13]:
hotels = pd.read_csv(DATA_PATH + "15-foreign-guests-in-hotels-and-similar-establishments.csv")
print(hotels.shape)
hotels.head()

(2879, 4)


Unnamed: 0,Entity,Code,Year,Foreign guests in tourism accommodation (hotels and similar establishments)
0,Albania,ALB,1995,41000
1,Albania,ALB,1996,64000
2,Albania,ALB,1997,23000
3,Albania,ALB,1998,22000
4,Albania,ALB,1999,26000


In [14]:
# Compute average over last 5 available years, before COVID restrictions (2020)
hotels_processed = hotels[hotels['Year'] < 2020].sort_values(
    by='Year', ascending=False).groupby('Code').head(5).groupby(
    'Code')['Foreign guests in tourism accommodation (hotels and similar establishments)'].mean().reset_index()
hotels_processed = hotels_processed.rename(
    columns = {"Foreign guests in tourism accommodation (hotels and similar establishments)": "Hotel guests"})
hotels_processed["Hotel guests"] = hotels_processed["Hotel guests"].astype(int)
hotels_processed.to_csv(DATA_PATH + "processed/hotels.csv")

In [15]:
print(hotels_processed.shape)
hotels_processed.sort_values(by='Hotel guests', ascending=False).head()

(139, 2)


Unnamed: 0,Code,Hotel guests
23,CHN,97944800
124,THA,56006600
40,ESP,51890400
63,JPN,51055600
60,ITA,45213800


#### UNSECO World Heritage Sites

In [16]:
uwh_sites = pd.read_xml(DATA_PATH + "whc.unesco.org.xml")
print(uwh_sites.shape)
uwh_sites.head()

(1223, 21)


Unnamed: 0,category,criteria_txt,danger,date_inscribed,extension,http_url,id_number,image_url,iso_code,justification,...,location,longitude,region,revision,secondary_dates,short_description,site,states,transboundary,unique_number
0,Natural,(ix),,2007,0,https://whc.unesco.org/en/list/1133,1133,https://whc.unesco.org/uploads/sites/site_1133...,"al,at,be,ba,bg,hr,cz,fr,de,it,pl,ro,sk,si,es,c...",,...,,22.183333,Europe and North America,0,201120172021.0,<p>This transnational property includes 93 com...,Ancient and Primeval Beech Forests of the Carp...,"Albania,Austria,Belgium,Bosnia and Herzegovina...",1,2513
1,Mixed,(i)(iii)(iv)(vii),,1979,1,https://whc.unesco.org/en/list/99,99,https://whc.unesco.org/uploads/sites/site_99.jpg,"al,mk",,...,Ohrid (municipality),20.704167,Europe and North America,0,20191980.0,"<p>A superlative natural phenomenon, Lake Ohri...",Natural and Cultural Heritage of the Ohrid region,"Albania,North Macedonia",1,2313
2,Cultural,(i)(ii)(vi),,2016,0,https://whc.unesco.org/en/list/1321,1321,https://whc.unesco.org/uploads/sites/site_1321...,"ar,be,fr,de,in,jp,ch",,...,,6.829336,Latin America and the Caribbean,0,,"<p>Chosen from the work of Le Corbusier, the 1...","The Architectural Work of Le Corbusier, an Out...","Argentina,Belgium,France,Germany,India,Japan,S...",1,2085
3,Cultural,(ii)(iii)(iv)(vi),,2014,0,https://whc.unesco.org/en/list/1459,1459,https://whc.unesco.org/uploads/sites/site_1459...,"ar,bo,cl,co,ec,pe",,...,,-69.591667,Latin America and the Caribbean,0,,<p>This site is an extensive Inca communicatio...,"Qhapaq Ñan, Andean Road System","Argentina,Bolivia (Plurinational State of),Chi...",1,2003
4,Cultural,(iv),,1983,1,https://whc.unesco.org/en/list/275,275,https://whc.unesco.org/uploads/sites/site_275.jpg,"ar,br",,...,"State of Rio Grande do Sul, Brazil; Province o...",-54.265833,Latin America and the Caribbean,0,1984.0,<p>The ruins of São Miguel das Missões in Braz...,Jesuit Missions of the Guaranis: San Ignacio M...,"Argentina,Brazil",1,326


In [17]:
import pycountry

# Step 1: Copy and preprocess all rows
sites = uwh_sites.copy()

# Step 2: Split and standardize iso_code
sites["iso_code"] = sites["iso_code"].str.lower().str.split(",")

# Step 3: Explode list to individual codes
sites = sites.explode("iso_code")

# Step 4: Convert to 3-letter ISO codes using pycountry
def convert_to_iso3(code):
    try:
        return pycountry.countries.get(alpha_2=code.upper()).alpha_3
    except:
        return None

sites["Code"] = sites["iso_code"].apply(convert_to_iso3)

# Step 5: Drop rows with failed conversions
sites = sites.dropna(subset=["Code"])

# Step 6: Create two filtered DataFrames and count
natural_counts = (
    sites[sites["category"].isin(["Natural", "Mixed"])]["Code"]
    .value_counts()
    .sort_index()
    .reset_index()
    .rename(columns={"index": "Code", "count": "Natural sites"})
)
natural_counts.to_csv(DATA_PATH + "processed/natural_sites.csv")

cultural_counts = (
    sites[sites["category"].isin(["Cultural", "Mixed"])]["Code"]
    .value_counts()
    .sort_index()
    .reset_index()
    .rename(columns={"index": "Code", "count": "Cultural sites"})
)
cultural_counts.to_csv(DATA_PATH + "processed/cultural_sites.csv")

In [18]:
print(natural_counts.shape)
natural_counts.sort_values(by='Natural sites', ascending=False).head()

(114, 2)


Unnamed: 0,Code,Natural sites
19,CHN,19
2,AUS,16
106,USA,13
17,CAN,12
86,RUS,11


In [19]:
print(cultural_counts.shape)
cultural_counts.sort_values(by='Cultural sites', ascending=False).head()

(156, 2)


Unnamed: 0,Code,Cultural sites
66,ITA,54
34,DEU,51
41,ESP,46
46,FRA,46
26,CHN,44


#### Country Landmarks

In [53]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from tqdm import tqdm

# Initialize SPARQL wrapper
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

# Example input
# world_pop = pd.read_csv("your_data.csv")  # columns: Code, Country/Territory

# Helper: Get QID of a country by its ISO code
def get_qid_from_iso3(iso3_code):
    query = f"""
    SELECT ?country WHERE {{
      ?country wdt:P298 "{iso3_code}".
    }}
    LIMIT 1
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    bindings = results['results']['bindings']
    return bindings[0]['country']['value'].split('/')[-1] if bindings is not None else None

# Helper: Get landmark image from QID
def get_iconic_landmark_image(country_qid):
    query = f"""
    SELECT ?landmarkLabel ?image WHERE {{
      ?landmark wdt:P17 wd:{country_qid}.
      ?landmark wdt:P18 ?image.
      ?landmark wdt:P31 ?type.
      FILTER(?type IN (
        wd:Q570116,   # tourist attraction
        wd:Q839954,   # cultural property
        wd:Q41176,    # building
        wd:Q488383    # geographical feature
      ))
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    bindings = results["results"]["bindings"]
    return bindings[0]["image"]["value"] if bindings is not None else None

# Final DataFrame
results = []

for _, row in tqdm(world_pop.iterrows(), total=len(world_pop)):
    code = row["Code"]
    country_name = row["Country/Territory"]
    try:
        qid = get_qid_from_iso3(code)
        if qid is not None:
            img_url = get_iconic_landmark_image(qid)
            if img_url is not None:
                results.append({"Code": code, "Country/Territory": country_name, "Image URL": img_url})
    except Exception as e:
        print(f"Error with {country_name}: {e}")

landmark_df = pd.DataFrame(results)

  2%|▉                                                         | 4/234 [00:02<02:27,  1.56it/s]

Error with American Samoa: list index out of range


  3%|█▋                                                        | 7/234 [00:04<02:08,  1.77it/s]

Error with Anguilla: list index out of range


 10%|█████▌                                                   | 23/234 [00:29<03:44,  1.06s/it]

Error with Bermuda: list index out of range


 10%|█████▊                                                   | 24/234 [00:30<03:19,  1.05it/s]

Error with Bhutan: list index out of range


 12%|███████                                                  | 29/234 [00:38<05:08,  1.50s/it]

Error with British Virgin Islands: list index out of range


 19%|██████████▋                                              | 44/234 [01:07<07:36,  2.40s/it]

Error with Comoros: list index out of range


 19%|██████████▉                                              | 45/234 [01:08<05:42,  1.81s/it]

Error with Cook Islands: list index out of range


 24%|█████████████▋                                           | 56/234 [01:26<05:40,  1.91s/it]

Error with DR Congo: list index out of range


 28%|███████████████▊                                         | 65/234 [01:33<01:48,  1.55it/s]

Error with Falkland Islands: list index out of range


 30%|█████████████████                                        | 70/234 [01:54<11:46,  4.31s/it]

Error with French Guiana: list index out of range


 30%|█████████████████▎                                       | 71/234 [01:54<08:30,  3.13s/it]

Error with French Polynesia: list index out of range


 33%|██████████████████▊                                      | 77/234 [02:18<10:05,  3.85s/it]

Error with Gibraltar: list index out of range


 34%|███████████████████▍                                     | 80/234 [02:21<04:44,  1.85s/it]

Error with Grenada: list index out of range


 35%|███████████████████▋                                     | 81/234 [02:21<03:41,  1.45s/it]

Error with Guadeloupe: list index out of range


 35%|███████████████████▉                                     | 82/234 [02:22<02:59,  1.18s/it]

Error with Guam: list index out of range


 46%|█████████████████████████▊                              | 108/234 [03:11<01:50,  1.14it/s]

Error with Kiribati: list index out of range


 49%|███████████████████████████▎                            | 114/234 [03:17<01:59,  1.01it/s]

Error with Lesotho: list index out of range


 51%|████████████████████████████▋                           | 120/234 [03:22<01:37,  1.17it/s]

Error with Macau: list index out of range


 54%|██████████████████████████████▍                         | 127/234 [03:29<01:23,  1.27it/s]

Error with Marshall Islands: list index out of range


 55%|██████████████████████████████▋                         | 128/234 [03:29<01:09,  1.52it/s]

Error with Martinique: list index out of range


 56%|███████████████████████████████▎                        | 131/234 [03:31<00:54,  1.89it/s]

Error with Mayotte: list index out of range


 59%|█████████████████████████████████                       | 138/234 [03:39<01:11,  1.34it/s]

Error with Montserrat: list index out of range


 61%|██████████████████████████████████▏                     | 143/234 [03:43<01:03,  1.43it/s]

Error with Nauru: list index out of range


 62%|██████████████████████████████████▋                     | 145/234 [03:44<01:05,  1.36it/s]

Error with Netherlands: list index out of range


 62%|██████████████████████████████████▉                     | 146/234 [03:45<00:55,  1.58it/s]

Error with New Caledonia: list index out of range


 65%|████████████████████████████████████▏                   | 151/234 [03:55<02:25,  1.76s/it]

Error with Niue: list index out of range


 66%|████████████████████████████████████▊                   | 154/234 [03:57<01:16,  1.05it/s]

Error with Northern Mariana Islands: list index out of range


 73%|████████████████████████████████████████▋               | 170/234 [04:22<00:55,  1.15it/s]

Error with Reunion: list index out of range


 74%|█████████████████████████████████████████▍              | 173/234 [04:34<02:37,  2.58s/it]

Error with Rwanda: list index out of range


 74%|█████████████████████████████████████████▋              | 174/234 [04:34<01:54,  1.91s/it]

Error with Saint Barthelemy: list index out of range


 76%|██████████████████████████████████████████▎             | 177/234 [04:35<00:54,  1.04it/s]

Error with Saint Martin: list index out of range


 76%|██████████████████████████████████████████▌             | 178/234 [04:36<00:43,  1.28it/s]

Error with Saint Pierre and Miquelon: list index out of range


 82%|█████████████████████████████████████████████▉          | 192/234 [04:46<00:44,  1.07s/it]

Error with Solomon Islands: list index out of range


 84%|██████████████████████████████████████████████▉         | 196/234 [04:52<00:46,  1.23s/it]

Error with South Sudan: list index out of range


 90%|██████████████████████████████████████████████████▎     | 210/234 [05:33<00:23,  1.04it/s]

Error with Tokelau: list index out of range


 90%|██████████████████████████████████████████████████▍     | 211/234 [05:34<00:18,  1.24it/s]

Error with Tonga: list index out of range


 92%|███████████████████████████████████████████████████▋    | 216/234 [05:40<00:19,  1.09s/it]

Error with Turks and Caicos Islands: list index out of range


 93%|███████████████████████████████████████████████████▉    | 217/234 [05:41<00:14,  1.14it/s]

Error with Tuvalu: list index out of range


 95%|█████████████████████████████████████████████████████▎  | 223/234 [06:36<01:51, 10.16s/it]

Error with United States Virgin Islands: list index out of range


 98%|███████████████████████████████████████████████████████ | 230/234 [06:41<00:06,  1.51s/it]

Error with Wallis and Futuna: list index out of range


 99%|███████████████████████████████████████████████████████▎| 231/234 [06:42<00:03,  1.18s/it]

Error with Western Sahara: list index out of range


100%|████████████████████████████████████████████████████████| 234/234 [06:44<00:00,  1.73s/it]


In [57]:
landmark_df = landmark_df.rename(columns={"Country/Territory": "Country"})
print(landmark_df.shape)
landmark_df.head()

(193, 3)


Unnamed: 0,Code,Country,Image URL
0,AFG,Afghanistan,http://commons.wikimedia.org/wiki/Special:File...
1,ALB,Albania,http://commons.wikimedia.org/wiki/Special:File...
2,DZA,Algeria,http://commons.wikimedia.org/wiki/Special:File...
3,AND,Andorra,http://commons.wikimedia.org/wiki/Special:File...
4,AGO,Angola,http://commons.wikimedia.org/wiki/Special:File...


In [60]:
landmark_df.to_csv(DATA_PATH + "processed/country_landmarks.csv")