In [2]:
import pandas as pd
# Directory prefix prepended to every CSV filename this notebook reads or writes.
DATA_PATH = "../datasets/"

Load the world population table; it is used later to convert per-1000-people tourist arrival rates into absolute numbers:

In [21]:
# Read the population table and expose its 'CCA3' column as 'Code',
# the key name used by the other tables in this notebook.
world_pop = (
    pd.read_csv(DATA_PATH + "world_population.csv")
    .rename(columns={'CCA3': 'Code'})
)

In [22]:
# Report how many distinct 'Code' values the table holds, then preview it.
print(f"Data available for {len(pd.unique(world_pop['Code']))} countries.")
world_pop.head()

Data available for 234 countries.


Unnamed: 0,Rank,Code,Country/Territory,Capital,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km²),Density (per km²),Growth Rate,World Population Percentage
0,36,AFG,Afghanistan,Kabul,Asia,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,1.0257,0.52
1,138,ALB,Albania,Tirana,Europe,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,0.9957,0.04
2,34,DZA,Algeria,Algiers,Africa,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,1.0164,0.56
3,213,ASM,American Samoa,Pago Pago,Oceania,44273,46189,51368,54849,58230,47818,32886,27075,199,222.4774,0.9831,0.0
4,203,AND,Andorra,Andorra la Vella,Europe,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,1.01,0.0


#### Temperatures

In [3]:
# Load the monthly surface temperature table and give its columns
# descriptive names (meanings as explained on the dataset's Kaggle page).
temperatures = (
    pd.read_csv(DATA_PATH + "average-monthly-surface-temperature.csv")
    .rename(columns={
        'Average surface temperature': 'Monthly average surface temperature',
        'Average surface temperature.1': 'Yearly average surface temperature',
        'Day': 'Month',
        'year': 'Year',
    })
)
# The former 'Day' column is parsed as a date and reduced to its month,
# stored as a two-digit string ("%m").
temperatures['Month'] = pd.to_datetime(temperatures['Month']).dt.strftime("%m")

In [4]:
# Print the number of rows, then preview the table.
print(f"{len(temperatures)} data points")
temperatures.head()

198900 data points


Unnamed: 0,Entity,Code,Year,Month,Monthly average surface temperature,Yearly average surface temperature
0,Afghanistan,AFG,1940,1,-2.032494,11.327695
1,Afghanistan,AFG,1940,2,-0.733503,11.327695
2,Afghanistan,AFG,1940,3,1.999134,11.327695
3,Afghanistan,AFG,1940,4,10.199754,11.327695
4,Afghanistan,AFG,1940,5,17.942135,11.327695


In [19]:
# Average the monthly temperature per (country code, calendar month),
# using only rows from 2005 onward (Year > 2004).
# NOTE(review): the effective window length depends on the last year present
# in the dataset -- confirm it matches the intended averaging period.
temperatures_processed = (
    temperatures.loc[temperatures['Year'] > 2004]
    .groupby(['Code', 'Month'])["Monthly average surface temperature"]
    .mean()
    .reset_index()
    .rename(columns={"Monthly average surface temperature": "Temperature"})
)
# Persist the aggregated table (the DataFrame index is written as well).
temperatures_processed.to_csv(DATA_PATH + 'processed/temperatures.csv')

In [20]:
# Show the (rows, columns) shape of the aggregated temperatures and preview the first rows.
print(temperatures_processed.shape)
temperatures_processed.head()

(2340, 3)


Unnamed: 0,Code,Month,Temperature
0,AFG,1,-1.049607
1,AFG,2,1.579556
2,AFG,3,7.712855
3,AFG,4,13.639239
4,AFG,5,19.318636


#### Inbound arrivals

In [23]:
# Load the table of inbound tourist arrivals per 1000 people,
# then show its (rows, columns) shape and the first rows.
inbound_arrivals = pd.read_csv(DATA_PATH + "23-international-tourist-trips-per-1000-people.csv")
print(inbound_arrivals.shape)
inbound_arrivals.head()

(4933, 4)


Unnamed: 0,Entity,Code,Year,Inbound arrivals (tourists) per 1000 people
0,Albania,ALB,2007,356.84418
1,Albania,ALB,2008,422.46985
2,Albania,ALB,2009,583.8489
3,Albania,ALB,2010,752.04175
4,Albania,ALB,2011,851.1856


In [78]:
# Step 1: per country code, average the arrivals rate over the 5 most recent
# rows with Year < 2020 (i.e. before COVID restrictions).
# Step 2: join with the population table and turn the per-1000 rate into an
# absolute number of arrivals using the '2022 Population' column.
inbound_arrivals_processed = (
    inbound_arrivals[inbound_arrivals['Year'] < 2020]
    .sort_values(by='Year', ascending=False)
    .groupby('Code')
    .head(5)
    .groupby('Code')['Inbound arrivals (tourists) per 1000 people']
    .mean()
    .reset_index()
    .rename(columns={"Inbound arrivals (tourists) per 1000 people": "Popularity"})
    .merge(world_pop, how='inner', on='Code')
    .assign(Popularity=lambda df: df['Popularity'] * df['2022 Population'] / 1000)
    .loc[:, ['Code', 'Popularity']]
)

# Keep the un-normalized absolute arrivals; the trip budget computation
# further down divides by them.
absolute_arrivals = inbound_arrivals_processed.copy()

# Step 3: standardize (subtract the mean, divide by the standard deviation)
# to obtain the country popularity score, and persist it.
inbound_arrivals_processed = absolute_arrivals.assign(
    Popularity=lambda df: (df['Popularity'] - df['Popularity'].mean()) / df['Popularity'].std()
)
inbound_arrivals_processed.to_csv(DATA_PATH + "processed/popularity.csv")

In [79]:
# Show the (rows, columns) shape of the popularity table and preview the first rows.
print(inbound_arrivals_processed.shape)
inbound_arrivals_processed.head()

(201, 2)


Unnamed: 0,Code,Popularity
0,ABW,-0.38248
1,AGO,-0.436288
2,AIA,-0.460594
3,ALB,-0.115209
4,AND,-0.232304


#### Trip budget

In [80]:
# Load the inbound tourism expenditure table,
# then show its (rows, columns) shape and the first rows.
expenditures = pd.read_csv(DATA_PATH + "21-average-expenditures-of-international-tourists-domestically.csv")
print(expenditures.shape)
expenditures.head()

(1260, 4)


Unnamed: 0,Entity,Code,Year,Inbound Tourism Expenditure (adjusted for inflation and cost of living)
0,Australia,AUS,1995,12904206000
1,Australia,AUS,1996,13947016000
2,Australia,AUS,1997,14575643000
3,Australia,AUS,1998,14679026000
4,Australia,AUS,1999,16038053000


In [95]:
# Per country code, average the expenditure over the 5 most recent rows with
# Year < 2020 (i.e. before COVID restrictions), then divide by the absolute
# number of inbound arrivals to get the average budget per trip.
# Note: the 'Popularity' column of `absolute_arrivals` holds absolute arrival
# counts (not the normalized popularity score).
expenditures_processed = (
    expenditures[expenditures['Year'] < 2020]
    .sort_values(by='Year', ascending=False)
    .groupby('Code')
    .head(5)
    .groupby('Code')['Inbound Tourism Expenditure (adjusted for inflation and cost of living)']
    .mean()
    .reset_index()
    .rename(columns={"Inbound Tourism Expenditure (adjusted for inflation and cost of living)": "Budget"})
    .merge(absolute_arrivals, how='inner', on='Code')
    .assign(Budget=lambda df: df['Budget'] / df['Popularity'])
    .loc[:, ['Code', 'Budget']]
)
expenditures_processed.to_csv(DATA_PATH + "processed/budget.csv")

In [96]:
# Show the (rows, columns) shape of the trip budget table and preview the first rows.
print(expenditures_processed.shape)
expenditures_processed.head()

(45, 2)


Unnamed: 0,Code,Budget
0,AUS,6106.63189
1,AUT,816.404918
2,BEL,1088.396149
3,BGR,1202.685054
4,BRA,1490.082091
