In [618]:
import pandas as pd

In [619]:
# Input the data
# The path lives in `input_path` rather than `input`, so the Python builtin
# `input()` is not shadowed for the rest of the notebook session.
input_path = 'Tourism Input.csv'
df = pd.read_csv(input_path)
df.head(5)

Unnamed: 0,id,Series-Measure,Hierarchy-Breakdown,Unit-Detail,Jan-10,Feb-10,Mar-10,Apr-10,May-10,Jun-10,...,Mar-20,Apr-20,May-20,Jun-20,Jul-20,Aug-20,Sep-20,Oct-20,Nov-20,Dec-20
0,1103,Total tourist arrivals,Real Sector / Tourism,Tourists,67478.0,77063.0,74975.0,60742.0,58324.0,44050.0,...,59630.0,13.0,41.0,1.0,1752.0,7636.0,9605.0,21515.0,35757.0,96412.0
1,1104,Tourist bednights,Real Sector / Tourism,Bednights,552287.0,578472.0,581848.0,503007.0,443824.0,327385.0,...,562302.2051,8844.0203,4776.6212,2325.8012,24673.4247,71370.6948,75367.8621,169709.0807,279030.282,623284.397
2,1105,Average stay,Real Sector / Tourism,Days,8.184697,7.506481,7.76056,8.281041,7.609628,7.432122,...,9.4298541854713,9.428593030082,86.847657368888,42.287293761914,14.083004941515,9.3485538100132,9.4824196160074,9.6159959503923,8.877098540146,9.1055876952922
3,1106,Operational bed capacity,Real Sector / Tourism,Beds,22825.0,23472.0,23934.0,24124.0,23885.0,23585.0,...,51001.0,7690.0,2978.0,3078.0,9821.0,19263.0,25328.0,32600.0,37378.0,42194.0
4,1107,Bednight capacity,Real Sector / Tourism,Beds,707575.0,657216.0,741954.0,723720.0,740435.0,707550.0,...,1581031.0,230700.0,92318.0,92340.0,304451.0,597153.0,759840.0,1010600.0,1121340.0,1308014.0


In [620]:
# Peek at the four leading (non-month) columns before reshaping
df.columns[0:4]

Index(['id', 'Series-Measure', 'Hierarchy-Breakdown', 'Unit-Detail'], dtype='object')

In [621]:
# Reshape wide -> long: keep the four leading columns as identifiers and stack
# every remaining (month) column into a 'Month' / 'Numbers of Tourists' pair.
# Leaving value_vars unset makes melt use all non-identifier columns.
df = df.melt(
    id_vars=list(df.columns[:4]),
    var_name='Month',
    value_name='Numbers of Tourists',
)
df



Unnamed: 0,id,Series-Measure,Hierarchy-Breakdown,Unit-Detail,Month,Numbers of Tourists
0,1103,Total tourist arrivals,Real Sector / Tourism,Tourists,Jan-10,67478
1,1104,Tourist bednights,Real Sector / Tourism,Bednights,Jan-10,552287
2,1105,Average stay,Real Sector / Tourism,Days,Jan-10,8.184697
3,1106,Operational bed capacity,Real Sector / Tourism,Beds,Jan-10,22825
4,1107,Bednight capacity,Real Sector / Tourism,Beds,Jan-10,707575
...,...,...,...,...,...,...
3691,1244,Total number of scheduled flights,Real Sector / Tourism / Total number of arriva...,Flights,Dec-20,647
3692,1245,Total number of general flights,Real Sector / Tourism / Total number of arriva...,Flights,Dec-20,163
3693,1252,Tourist arrivals from France,Real Sector / Tourism / Tourist arrivals / Europe,Tourists,Dec-20,3998
3694,1253,Tourist arrivals from Australia,Real Sector / Tourism / Tourist arrivals / Oce...,Tourists,Dec-20,607


In [622]:
# Filter out the nulls (help)
# Filter our dataset so our Values are referring to Number of Tourists
#
# Both filters are applied in one step and the result is copied once, so the
# column assignments below write to an independent frame instead of a slice
# (this avoids pandas' SettingWithCopyWarning). Filtering to the 'Tourists'
# unit *before* the numeric conversion also means values belonging to other
# measures never have to be parseable as integers.
df = df[(df['Unit-Detail'] == 'Tourists') & (df['Numbers of Tourists'] != 'na')].copy()

# Values arrive as text: strip any literal '%' sign, then go through float so
# strings such as '67478.0' convert cleanly to whole numbers.
df['Numbers of Tourists'] = (
    df['Numbers of Tourists']
    .str.replace('%', '', regex=False)
    .astype(float)
    .astype(int)
)

# Month labels look like 'Jan-10'; parse them to timestamps (first of the month).
df['Month'] = pd.to_datetime(df['Month'], format='%b-%y')

df

Unnamed: 0,id,Series-Measure,Hierarchy-Breakdown,Unit-Detail,Month,Numbers of Tourists
0,1103,Total tourist arrivals,Real Sector / Tourism,Tourists,2010-01-01,67478
8,1111,Tourist arrivals from Europe,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,51334
9,1112,Tourist arrivals from Asia,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,13243
10,1113,Tourist arrivals from Africa,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,350
11,1114,Tourist arrivals from Americas,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,1289
...,...,...,...,...,...,...
3687,1122,Tourist arrivals from China,Real Sector / Tourism / Tourist arrivals / Asia,Tourists,2020-12-01,171
3688,1241,Tourist arrivals from India,Real Sector / Tourism / Tourist arrivals / Asia,Tourists,2020-12-01,18637
3693,1252,Tourist arrivals from France,Real Sector / Tourism / Tourist arrivals / Europe,Tourists,2020-12-01,3998
3694,1253,Tourist arrivals from Australia,Real Sector / Tourism / Tourist arrivals / Oce...,Tourists,2020-12-01,607


In [623]:
# Our goal now is to remove all totals and subtotals from our dataset so that only 
# the lowest level of granularity remains. 
# Currently we have Total > Continents > Countries, but we don't have data for all 
# countries in a continent, so it's not as simple as just filtering out the totals 
# and subtotals. Plus in our Continents level of detail, we also have The Middle East 
# and UN passport holders as categories. If you feel confident in your 
# prep skills, this (plus the output) should be enough information to go on, 
# but otherwise read on for a breakdown of the steps we need to take:
# Filter out Total tourist arrivals
# Split our workflow into 2 streams: Continents and Countries
# Hint: the hierarchy field will be useful here
# Split out the Continent and Country names from the relevant fields (help)
# Aggregate our Country stream to the Continent level (help)
# Join the two streams together and work out how many tourist arrivals there are for which we don't know the country (help)
# Add in a Country field with the value "Unknown" (help)
# Union this back to where we had our Country breakdown

# Remove the grand-total series; every other row is kept.
df = df.loc[df['Series-Measure'] != 'Total tourist arrivals']
df.head(5)
# result 1826 rows x 6 cols

Unnamed: 0,id,Series-Measure,Hierarchy-Breakdown,Unit-Detail,Month,Numbers of Tourists
8,1111,Tourist arrivals from Europe,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,51334
9,1112,Tourist arrivals from Asia,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,13243
10,1113,Tourist arrivals from Africa,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,350
11,1114,Tourist arrivals from Americas,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,1289
12,1115,Tourist arrivals from Oceania,Real Sector / Tourism / Tourist arrivals,Tourists,2010-01-01,703


In [624]:
# Continent stream: rows sitting directly under 'Tourist arrivals' in the hierarchy.
# Any '-' in the series name is first rewritten to 'from' so that a single
# prefix removal yields the continent label. A leading "the " is not stripped
# here, so labels such as 'the Middle East' are kept as-is.
continents = (
    df.loc[df['Hierarchy-Breakdown'] == 'Real Sector / Tourism / Tourist arrivals']
    .assign(
        Continent=lambda frame: (
            frame['Series-Measure']
            .str.replace('-', 'from')
            .str.replace('Tourist arrivals from ', '')
        )
    )
    .drop(columns=['Series-Measure', 'id', 'Hierarchy-Breakdown', 'Unit-Detail'])
)
continents

Unnamed: 0,Month,Numbers of Tourists,Continent
8,2010-01-01,51334,Europe
9,2010-01-01,13243,Asia
10,2010-01-01,350,Africa
11,2010-01-01,1289,Americas
12,2010-01-01,703,Oceania
...,...,...,...
3678,2020-12-01,1996,Africa
3679,2020-12-01,4929,Americas
3680,2020-12-01,728,Oceania
3681,2020-12-01,4557,the Middle East


In [625]:
# Country stream: every row that is NOT at the continent level of the hierarchy.
countries = df[df['Hierarchy-Breakdown'] != 'Real Sector / Tourism / Tourist arrivals'].copy()

# Country name = series name without its "Tourist arrivals from " prefix and
# without a leading article. Both patterns are anchored to the start of the
# string so that a "the " occurring inside a name (e.g. "Saint Vincent and the
# Grenadines") is preserved rather than silently removed.
countries['Country'] = (
    countries['Series-Measure']
    .str.replace(r'^Tourist arrivals from ', '', regex=True)
    .str.replace(r'^the ', '', regex=True)
)

# The last element of the hierarchy path is the continent the country belongs to.
countries['Hierarchy-Breakdown'] = countries['Hierarchy-Breakdown'].str.replace(
    'Real Sector / Tourism / Tourist arrivals / ', '', regex=False
)

# Rename/drop without inplace=True so each step yields a fresh frame.
countries = (
    countries
    .rename(columns={'Hierarchy-Breakdown': 'Continent'})
    .drop(columns=['id', 'Series-Measure', 'Unit-Detail'])
)

countries

Unnamed: 0,Continent,Month,Numbers of Tourists,Country
15,Europe,2010-01-01,5890,Germany
16,Europe,2010-01-01,12276,Italy
17,Europe,2010-01-01,5873,Russia
18,Europe,2010-01-01,8405,United Kingdom
19,Asia,2010-01-01,6069,China
...,...,...,...,...
3687,Asia,2020-12-01,171,China
3688,Asia,2020-12-01,18637,India
3693,Europe,2020-12-01,3998,France
3694,Oceania,2020-12-01,607,Australia


In [626]:
# Roll the country rows up to one total per continent and month.
# The column name (including its spelling) is relied on by the join cell below.
countries_agg = (
    countries
    .groupby(['Continent', 'Month'], as_index=False)['Numbers of Tourists']
    .sum()
    .rename(columns={'Numbers of Tourists': 'numnber_of_tourist'})
)
countries_agg

Unnamed: 0,Continent,Month,numnber_of_tourist
0,Americas,2020-02-01,4543
1,Americas,2020-03-01,1820
2,Americas,2020-04-01,0
3,Americas,2020-05-01,0
4,Americas,2020-06-01,0
...,...,...,...
281,Oceania,2020-08-01,49
282,Oceania,2020-09-01,61
283,Oceania,2020-10-01,160
284,Oceania,2020-11-01,209


In [627]:
# For each continent/month, subtract the arrivals already attributed to known
# countries from the continent total; the remainder is labelled 'Unknown'.
# Continents with no country rows get 0 known arrivals from the left join.
join_continents = (
    continents
    .merge(countries_agg, how='left', on=['Continent', 'Month'])
    .assign(**{
        'Missing Values': lambda frame: (
            frame['Numbers of Tourists'] - frame['numnber_of_tourist'].fillna(0)
        ),
        'Country': 'Unknown',
    })
    .drop(columns=['Numbers of Tourists', 'numnber_of_tourist'])
    .rename(columns={'Missing Values': 'Numbers of Tourists'})
)

join_continents

Unnamed: 0,Month,Continent,Numbers of Tourists,Country
0,2010-01-01,Europe,11991.0,Unknown
1,2010-01-01,Asia,5432.0,Unknown
2,2010-01-01,Africa,350.0,Unknown
3,2010-01-01,Americas,1289.0,Unknown
4,2010-01-01,Oceania,703.0,Unknown
...,...,...,...,...
875,2020-12-01,Africa,1996.0,Unknown
876,2020-12-01,Americas,1924.0,Unknown
877,2020-12-01,Oceania,121.0,Unknown
878,2020-12-01,the Middle East,4557.0,Unknown


In [628]:
# Stack the known-country rows on top of the 'Unknown' remainder rows,
# restore whole-number counts (the remainder column is float after the join)
# and put the columns in their final order.
output = (
    pd.concat([countries, join_continents])
    .astype({'Numbers of Tourists': int})
    .loc[:, ['Numbers of Tourists', 'Month', 'Continent', 'Country']]
)
output

Unnamed: 0,Numbers of Tourists,Month,Continent,Country
15,5890,2010-01-01,Europe,Germany
16,12276,2010-01-01,Europe,Italy
17,5873,2010-01-01,Europe,Russia
18,8405,2010-01-01,Europe,United Kingdom
19,6069,2010-01-01,Asia,China
...,...,...,...,...
875,1996,2020-12-01,Africa,Unknown
876,1924,2020-12-01,Americas,Unknown
877,121,2020-12-01,Oceania,Unknown
878,4557,2020-12-01,the Middle East,Unknown
