In [88]:
import pandas as pd
# Relative directory prefix for the input CSV files. File names are appended
# with plain string concatenation, so the trailing slash is required.
DATA_PATH = "datasets/"

# Exploratory data analysis

### World population

Pre-processing:

In [162]:
# Load the world population table in a single chained expression.
world_pop = (
    pd.read_csv(DATA_PATH + "world_population.csv")
    # Expose 'CCA3' as 'Code' so the column name matches the other tables
    .rename(columns={'CCA3': 'Code'})
)

In [159]:
# Report how many distinct values the 'Code' column holds, then preview the table.
print(f"Data available for {world_pop['Code'].unique().size} countries.")
world_pop.head(5)

Data available for 234 countries.


Unnamed: 0,Rank,Code,Country/Territory,Capital,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km²),Density (per km²),Growth Rate,World Population Percentage
0,36,AFG,Afghanistan,Kabul,Asia,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,1.0257,0.52
1,138,ALB,Albania,Tirana,Europe,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,0.9957,0.04
2,34,DZA,Algeria,Algiers,Africa,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,1.0164,0.56
3,213,ASM,American Samoa,Pago Pago,Oceania,44273,46189,51368,54849,58230,47818,32886,27075,199,222.4774,0.9831,0.0
4,203,AND,Andorra,Andorra la Vella,Europe,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,1.01,0.0


We check for NaNs:

In [161]:
# Count the rows of world_pop that contain at least one missing value.
print(f"Number of rows with NaN values: {world_pop.isna().any(axis=1).sum()}")

Number of rows with NaN values: 0


### Average monthly temperatures

We start with some pre-processing:

In [140]:
# Load the temperature table and normalise its columns in one chained expression.
temperatures = (
    pd.read_csv(DATA_PATH + "average-monthly-surface-temperature.csv")
    # Rename columns based on their actual meaning explained on Kaggle
    .rename(columns={
        'Average surface temperature': 'Monthly average surface temperature',
        'Average surface temperature.1': 'Yearly average surface temperature',
        'Day': 'Month',
        'year': 'Year',
    })
    # Keep only the month of each date. Note that "%m" yields a zero-padded
    # two-character string (e.g. "01"), not an integer.
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']).dt.strftime("%m"))
)

In [139]:
# Report the number of rows, then preview the table.
print(f"{len(temperatures)} data points")
temperatures.head(5)

198900 data points


Unnamed: 0,Entity,Code,Year,Month,Monthly average surface temperature,Yearly average surface temperature
0,Afghanistan,AFG,1940,1,-2.032494,11.327695
1,Afghanistan,AFG,1940,2,-0.733503,11.327695
2,Afghanistan,AFG,1940,3,1.999134,11.327695
3,Afghanistan,AFG,1940,4,10.199754,11.327695
4,Afghanistan,AFG,1940,5,17.942135,11.327695


In [128]:
# Report how many distinct values the 'Code' column of temperatures holds.
print(f"Data available for {temperatures['Code'].unique().size} countries.")

Data available for 195 countries.


We check that each month of each year is present for every country:

In [132]:
# The row count must equal (distinct codes) x (12 months) x (number of years
# between the earliest and the latest year, both included).
assert temperatures.shape[0] == (
    len(temperatures['Code'].unique())
    * 12
    * (temperatures['Year'].max() - temperatures['Year'].min() + 1)
)

We check for NaNs:

In [137]:
# Count the rows of temperatures that contain at least one missing value.
print(f"Number of rows with NaN values: {temperatures.isna().any(axis=1).sum()}")

Number of rows with NaN values: 0


Here are the five warmest countries, ranked by their mean monthly surface temperature over roughly the last 25 years (all years after 2000):

In [206]:
# Mean of the monthly temperature per entity for years after 2000,
# highest first; the first five rows are displayed.
(
    temperatures.loc[temperatures["Year"] > 2000]
    .groupby("Entity")["Monthly average surface temperature"]
    .mean()
    .reset_index()
    .sort_values(by="Monthly average surface temperature", ascending=False)
    .head(5)
)

Unnamed: 0,Entity,Monthly average surface temperature
112,Mali,28.994181
27,Burkina Faso,28.896367
153,Senegal,28.892454
50,Djibouti,28.534796
182,United Arab Emirates,28.463493


And the five coldest:

In [207]:
# Mean of the monthly temperature per entity for years after 2000,
# lowest first; the first five rows are displayed.
(
    temperatures.loc[temperatures["Year"] > 2000]
    .groupby("Entity")["Monthly average surface temperature"]
    .mean()
    .reset_index()
    .sort_values(by="Monthly average surface temperature", ascending=True)
    .head(5)
)

Unnamed: 0,Entity,Monthly average surface temperature
73,Greenland,-17.909013
31,Canada,-3.334756
146,Russia,-3.127702
162,South Georgia and the South Sandwich Islands,-0.555846
172,Tajikistan,0.315281


### Number of UNESCO World Heritage sites

Pre-processing:

In [167]:
uwh = pd.read_csv(DATA_PATH + "uwh_by_country.csv")

# Add country code by looking up each state party's name in the world_pop
# table. If a name occurs several times in world_pop, its first row is used.
name_to_code = (
    world_pop.drop_duplicates(subset='Country/Territory')
    .set_index('Country/Territory')['Code']
)
# Names without a match in world_pop get an empty string, so that they can be
# listed and completed manually afterwards.
uwh['Code'] = uwh['States Parties'].map(name_to_code).fillna("")

We see below that some countries are either absent from the `world_pop` table or named differently there. This means that we have to add their country codes manually:

In [173]:
# Show how many rows still have an empty 'Code', then display those rows.
print(uwh.loc[uwh['Code'].eq('')].shape)
uwh.loc[uwh['Code'].eq('')]

(22, 3)


Unnamed: 0,States Parties,Properties inscribed,Code
24,Cabo Verde,1,
33,Congo,2,
35,Côte d'Ivoire,5,
39,Czechia,17,
40,Democratic People's Republic of Korea,2,
41,Democratic Republic of the Congo,5,
63,Holy See,2,
69,Iran (Islamic Republic of),28,
81,Lao People's Democratic Republic,3,
97,Micronesia (Federated States of),1,


In [178]:
# Manually assigned country codes, keyed by the exact 'States Parties' name.
manual_codes = {
    'Bolivia (Plurinational State of)': 'BOL',
    'Cabo Verde': 'CPV',
    'Congo': 'COG',
    "Côte d'Ivoire": 'CIV',
    'Czechia': 'CZE',
    "Democratic People's Republic of Korea": 'PRK',
    "Democratic Republic of the Congo": 'COD',
    "Holy See": 'VAT',
    "Iran (Islamic Republic of)": 'IRN',
    "Lao People's Democratic Republic": 'LAO',
    "Micronesia (Federated States of)": 'FSM',
    "Netherlands (Kingdom of the)": 'NLD',
    "Republic of Korea": 'KOR',
    "Republic of Moldova": 'MDA',
    "Russian Federation": 'RUS',
    "State of Palestine": 'PSE',
    "Syrian Arab Republic": 'SYR',
    "Türkiye": 'TUR',
    "United Kingdom of Great Britain and Northern Ireland": 'GBR',
    "United Republic of Tanzania": 'TZA',
    "United States of America": 'USA',
    "Venezuela (Bolivarian Republic of)": 'VEN',
    "Viet Nam": 'VNM',
}

# Write each code into the row(s) whose 'States Parties' equals the name.
for party_name, country_code in manual_codes.items():
    uwh.loc[uwh['States Parties'] == party_name, 'Code'] = country_code

In [181]:
# Every row must now carry a non-empty 'Code'.
assert not (uwh['Code'] == '').any()

We check for NaNs:

In [182]:
# Count the rows of uwh that contain at least one missing value.
print(f"Number of rows with NaN values: {uwh.isna().any(axis=1).sum()}")

Number of rows with NaN values: 0


We can see that there are 168 countries featuring at least one UNESCO World Heritage (UWH) site. The countries with the highest number of UWH sites are Italy, China, Germany, France and Spain:

In [142]:
# Report the number of rows (one per state party), then show the five rows
# with the largest 'Properties inscribed' values.
print(f"Number of countries: {len(uwh)}")
uwh.sort_values(by='Properties inscribed', ascending=False).head(5)

Number of countries: 168


Unnamed: 0,States Parties,Properties inscribed
73,Italy,60
31,China,59
57,Germany,54
53,France,53
140,Spain,50


### Inbound arrivals

In [215]:
# Load the inbound-arrivals table, print its (rows, columns) shape and
# preview the first rows.
inbound_arrivals = pd.read_csv(
    DATA_PATH + "23-international-tourist-trips-per-1000-people.csv"
)
print(inbound_arrivals.shape)
inbound_arrivals.head(5)

(4933, 4)


Unnamed: 0,Entity,Code,Year,Inbound arrivals (tourists) per 1000 people
0,Albania,ALB,2007,356.84418
1,Albania,ALB,2008,422.46985
2,Albania,ALB,2009,583.8489
3,Albania,ALB,2010,752.04175
4,Albania,ALB,2011,851.1856


Check for NaNs:

In [218]:
# Count the rows of inbound_arrivals that contain at least one missing value.
print(f"Number of rows with NaN values: {inbound_arrivals.isna().any(axis=1).sum()}")

Number of rows with NaN values: 0


Let's have a look at the most visited countries relative to their population (arrivals per 1000 people) over roughly the last 15 years (all years after 2010). We see that small countries dominate the top of the ranking; this will have to be adjusted using the total population from `world_population.csv` (the `world_pop` table).

In [217]:
# Mean arrivals per 1000 people per entity for years after 2010,
# highest first; the first five rows are displayed.
(
    inbound_arrivals.loc[inbound_arrivals["Year"] > 2010]
    .groupby("Entity")["Inbound arrivals (tourists) per 1000 people"]
    .mean()
    .reset_index()
    .sort_values(by="Inbound arrivals (tourists) per 1000 people", ascending=False)
    .head(5)
)

Unnamed: 0,Entity,Inbound arrivals (tourists) per 1000 people
2,Andorra,34111.430364
102,Macao,21405.513791
25,British Virgin Islands,9943.847382
179,Turks and Caicos Islands,9706.947755
160,Sint Maarten (Dutch part),9592.671236


### Number of foreign guests in hotels

In [190]:
# Load the foreign-hotel-guests table, print its (rows, columns) shape and
# preview the first rows.
hotel_guests = pd.read_csv(
    DATA_PATH + "15-foreign-guests-in-hotels-and-similar-establishments.csv"
)
print(hotel_guests.shape)
hotel_guests.head(5)

(2879, 4)


Unnamed: 0,Entity,Code,Year,Foreign guests in tourism accommodation (hotels and similar establishments)
0,Albania,ALB,1995,41000
1,Albania,ALB,1996,64000
2,Albania,ALB,1997,23000
3,Albania,ALB,1998,22000
4,Albania,ALB,1999,26000


Check for NaNs:

In [192]:
# Count the rows of hotel_guests that contain at least one missing value.
print(f"Number of rows with NaN values: {hotel_guests.isna().any(axis=1).sum()}")

Number of rows with NaN values: 0


Here are the countries with the highest number of foreign guests in their hotels:

In [222]:
# Mean number of foreign hotel guests per entity for years after 2010,
# highest first; the first five rows are displayed.
(
    hotel_guests.loc[hotel_guests["Year"] > 2010]
    .groupby("Entity")["Foreign guests in tourism accommodation (hotels and similar establishments)"]
    .mean()
    .reset_index()
    .sort_values(
        by="Foreign guests in tourism accommodation (hotels and similar establishments)",
        ascending=False,
    )
    .head(5)
)

Unnamed: 0,Entity,Foreign guests in tourism accommodation (hotels and similar establishments)
26,China,104247300.0
114,Thailand,45415100.0
109,Spain,41332730.0
58,Italy,37435730.0
60,Japan,31165090.0


### Average expenditures of international tourists

In [193]:
# Load the tourist-expenditure table, print its (rows, columns) shape and
# preview the first rows.
expenditures = pd.read_csv(
    DATA_PATH + "21-average-expenditures-of-international-tourists-domestically.csv"
)
print(expenditures.shape)
expenditures.head(5)

(1260, 4)


Unnamed: 0,Entity,Code,Year,Inbound Tourism Expenditure (adjusted for inflation and cost of living)
0,Australia,AUS,1995,12904206000
1,Australia,AUS,1996,13947016000
2,Australia,AUS,1997,14575643000
3,Australia,AUS,1998,14679026000
4,Australia,AUS,1999,16038053000


Check for NaNs:

In [194]:
# Count the rows of the expenditures table that contain at least one NaN.
# The check must target `expenditures` (the table loaded in this section),
# not `hotel_guests` from the previous section.
print(f"Number of rows with NaN values: {expenditures[expenditures.isna().any(axis=1)].shape[0]}")

Number of rows with NaN values: 0


Finally, let's look at the countries where tourists spend the most:

In [228]:
# Mean inbound tourism expenditure per entity for years after 2010,
# highest first; the first five rows are displayed.
(
    expenditures.loc[expenditures["Year"] > 2010]
    .groupby("Entity")["Inbound Tourism Expenditure (adjusted for inflation and cost of living)"]
    .mean()
    .reset_index()
    .sort_values(
        by="Inbound Tourism Expenditure (adjusted for inflation and cost of living)",
        ascending=False,
    )
    .head(5)
)

Unnamed: 0,Entity,Inbound Tourism Expenditure (adjusted for inflation and cost of living)
46,United States,180627000000.0
41,Spain,77076030000.0
14,France,58020660000.0
45,United Kingdom,48234190000.0
44,Turkey,47202590000.0
