In [284]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle

### UNESCO Data

In [92]:
# Relative path to the pickled UNESCO table loaded in the next cell.
file = '../pickle_files/unesco.p'

In [93]:
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle(file)
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(241, 16)

In [95]:
# Keep only the area, country and name fields, then index the table by site name.
cols_to_keep = ['area_hectares', 'country', 'name']
df = df[cols_to_keep].set_index('name')

#### Create "UNESCO" column for labels

In [96]:
# All rows here come from the UNESCO table, so each one gets the positive label 1.
df['UNESCO'] = 1

#### Rename one park

In [53]:
# Replace the index label 'Redwood National and State Parks' with 'Redwood National Park'
# (presumably so it matches the '<name> National Park' labels built for the US parks table).
# list.index raises ValueError if the label is not present.
as_list = df.index.tolist()
idx = as_list.index('Redwood National and State Parks')
as_list[idx] = 'Redwood National Park'
# NOTE(review): assigning a plain list replaces the whole index and appears to drop its name — confirm this is intended.
df.index = as_list

### US National Park Data

In [143]:
# Load the pickled US national parks table.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# NOTE(review): the name `us_parks` is reused further down for an array of park names.
us_parks = '../pickle_files/us_parks.p'
us_parks_df = pd.read_pickle(us_parks)
# Show (rows, columns) as a quick sanity check of the load.
us_parks_df.shape

(58, 11)

In [145]:
# Append ' National Park' to every park name and use the full name as the index.
us_parks_df = (
    us_parks_df
    .assign(name=us_parks_df['name'] + ' National Park')
    .set_index('name')
)

In [146]:
# Convert square kilometres to hectares (1 km^2 = 100 ha).
us_parks_df['area_hectares'] = us_parks_df['sq_km'] * 100
# Age is measured against the hard-coded reference year 2017.
us_parks_df['age'] = 2017 - us_parks_df['est_year']
# Every row of this table is assigned the same country label.
us_parks_df['country'] = 'United States of America'

In [147]:
# Narrow the US parks table to the three columns listed below.
cols_to_keep = ['area_hectares', 'age', 'country']
us_parks_df = us_parks_df.loc[:, cols_to_keep]

#### Create a UNESCO column of labels for US Parks

In [148]:
# Label each US park 1 if its name also appears among the UNESCO sites
# whose country is 'United States of America', else 0.
us_parks = us_parks_df.index.values
unesco_parks = df[df['country'] == 'United States of America'].index.values
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
mask = np.isin(us_parks, unesco_parks)
us_parks_df['UNESCO'] = mask.astype(int)

#### Merge UNESCO with US National parks

In [163]:
# Outer-join on the index (park name); overlapping column names receive
# pandas' default '_x' (left) and '_y' (right) suffixes.
df1 = pd.merge(df, us_parks_df, how='outer', left_index=True, right_index=True)

In [164]:
# Left-hand ('_x') columns to keep, and the matching right-hand ('_y') columns to fold into them.
cols_to = ['area_hectares_x', 'country_x', 'UNESCO_x']
cols_from = [left[:-2] + '_y' for left in cols_to]

In [271]:
def merge_columns(df, cols_to, cols_from):
    '''
    Coalesce pairs of duplicated columns in a dataframe (modified in place).

    For each (col_to, col_from) pair, null entries of col_to are filled from
    col_from and col_from is dropped. If col_to ends with the merge suffix
    '_x', that suffix is removed; any other name is left unchanged.

    Parameters
    ----------
    df : DataFrame holding both columns of every pair
    cols_to : column names to keep
    cols_from : column names supplying the fill values, same length as cols_to

    Returns
    -------
    The same dataframe object, after the in-place changes.

    Raises
    ------
    ValueError if cols_to and cols_from differ in length.
    '''
    cols_to, cols_from = list(cols_to), list(cols_from)
    if len(cols_to) != len(cols_from):
        # zip would silently ignore the unmatched names and leave stray columns behind.
        raise ValueError('cols_to and cols_from must have the same length')

    for col_to, col_from in zip(cols_to, cols_from):
        mask = df[col_to].isnull()
        df.loc[mask, col_to] = df.loc[mask, col_from]
        df.drop(col_from, axis = 1, inplace = True)
        # Strip only a real '_x' suffix; slicing off two characters
        # unconditionally would mangle names such as 'Age' or 'UNESCO'.
        if col_to.endswith('_x'):
            df.rename(columns = {col_to: col_to[:-2]}, inplace = True)
    return df

In [None]:
# Fold each '_y' column from the merge into its '_x' counterpart.
df1 = merge_columns(df1, cols_to, cols_from)

### Canadian National Park Data

In [251]:
# Load the pickled Canadian national parks table.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# NOTE(review): the name `canadian_parks` is reused further down for an array of park names.
canadian_parks = '../pickle_files/canadian_parks.p'
canadian_parks_df = pd.read_pickle(canadian_parks)
# Show (rows, columns) as a quick sanity check of the load.
canadian_parks_df.shape

(42, 6)

In [252]:
# Drop the last two space-separated tokens from every 'geographic_location' value.
canadian_parks_df['geographic_location'] = (
    canadian_parks_df['geographic_location']
    .str.split(' ')
    .map(lambda words: ' '.join(words[:-2]))
)

In [253]:
# Use the trimmed 'geographic_location' text as the row index.
canadian_parks_df = canadian_parks_df.set_index('geographic_location')

#### Rename One Park

In [254]:
# Replace the index label 'Nahanni National Park Reserve' with 'Nahanni National Park'.
# list.index raises ValueError if the label is not present.
as_list = canadian_parks_df.index.tolist()
idx = as_list.index('Nahanni National Park Reserve')
as_list[idx] = 'Nahanni National Park'
# NOTE(review): assigning a plain list replaces the whole index and appears to drop its name — confirm this is intended.
canadian_parks_df.index = as_list

In [255]:
# Strip the comma thousands separators so the area text can be parsed as a number.
canadian_parks_df['park_area_in_square_kilometres'] = canadian_parks_df['park_area_in_square_kilometres'].str.replace(',', '')
# Convert square kilometres to hectares (1 km^2 = 100 ha).
canadian_parks_df['area_hectares'] = pd.to_numeric(canadian_parks_df['park_area_in_square_kilometres']) * 100
# Age is measured against the hard-coded reference year 2017.
canadian_parks_df['age'] = 2017 - canadian_parks_df['year_established']

#### Create a UNESCO column of labels for Canadian Parks

In [259]:
# Label each Canadian park 1 if its name also appears among the UNESCO sites
# whose country is 'Canada', else 0.
canadian_parks = canadian_parks_df.index.values
unesco_parks = df[df['country'] == 'Canada'].index.values
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
mask = np.isin(canadian_parks, unesco_parks)
canadian_parks_df['UNESCO'] = mask.astype(int)

In [260]:
# Narrow the Canadian parks table to the four columns listed below.
cols_to_keep = ['year_established', 'area_hectares', 'UNESCO', 'age']
canadian_parks_df = canadian_parks_df.loc[:, cols_to_keep]

#### Merge UNESCO with Canadian National parks

In [272]:
# Outer-join on the index (park name); overlapping column names receive
# pandas' default '_x' (left) and '_y' (right) suffixes.
df2 = pd.merge(df1, canadian_parks_df, how='outer', left_index=True, right_index=True)

In [273]:
# Fold each right-hand ('_y') column into its left-hand ('_x') counterpart.
cols_to = ['area_hectares_x', 'UNESCO_x', 'age_x']
cols_from = [left[:-2] + '_y' for left in cols_to]
df2 = merge_columns(df2, cols_to, cols_from)

### TripAdvisor Data

In [350]:
# Load the pickled TripAdvisor table and index it by park name.
trip_advisor = '../pickle_files/trip_advisor.p'
tripadvisor_df = pd.read_pickle(trip_advisor).set_index('name')

In [351]:
tripadvisor_df.head()

Unnamed: 0_level_0,latitude,longitude,critiera7,criteria8,criteria9,criteria10,established,tareviews,ranking,rating,visitors,region
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Aulavik National Park,73.700278,-119.919444,0.0,0.0,0.0,0.0,1992,0,,,8,North America
Auyuittuq National Park,67.883333,-65.016667,0.0,0.0,0.0,0.0,2001,10,"(80, 10, 10, 0, 0)",4.5,400,North America
Bruce Peninsula National Park,45.238889,-81.614167,0.0,0.0,0.0,0.0,1987,559,"(73, 20, 4, 2, 1)",4.5,412623,North America
Cape Breton Highlands National Park,46.716667,-60.659722,0.0,0.0,0.0,0.0,1936,1202,"(84, 12, 2, 1, 1)",5.0,302827,North America
Elk Island National Park,53.614444,-112.866111,0.0,0.0,0.0,0.0,1913,434,"(55, 32, 10, 2, 1)",4.5,360678,North America


### A few of the parks in Canada have no TripAdvisor reviews. Let's look at how far they are from the closest parks with reviews.

In [352]:
tripadvisor_df.loc[tripadvisor_df['ranking'] == 'n/a']

Unnamed: 0_level_0,latitude,longitude,critiera7,criteria8,criteria9,criteria10,established,tareviews,ranking,rating,visitors,region
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Aulavik National Park,73.700278,-119.919444,0.0,0.0,0.0,0.0,1992,0,,,8,North America
Qausuittuq National Park,76.0,-100.0,0.0,0.0,0.0,0.0,2015,0,,,21,North America
Tuktut Nogait National Park,68.81869,-121.74925,0.0,0.0,0.0,0.0,1998,0,,,2,North America
Ukkusiksalik National Park,65.341667,-87.305556,0.0,0.0,0.0,0.0,2003,0,,,300,North America
Vuntut National Park,68.366667,-139.85,0.0,0.0,0.0,0.0,1995,0,,,400,North America


In [353]:
# Age is measured against the hard-coded reference year 2017.
# NOTE(review): this column is 'Age' (capitalised) while the park tables above use 'age'.
tripadvisor_df['Age'] = 2017 - tripadvisor_df['established']

In [354]:
# Partition the parks by whether their review count is zero, and keep the names of each group.
no_ratings = tripadvisor_df[tripadvisor_df['tareviews'].eq(0)]
ratings = tripadvisor_df[tripadvisor_df['tareviews'].ne(0)]
no_ratings_lst, ratings_lst = no_ratings.index.values, ratings.index.values

In [355]:
# Pull latitude / longitude out as arrays for both groups of parks.
no_ratings_lat, no_ratings_lon = (
    no_ratings[axis].values for axis in ('latitude', 'longitude')
)
ratings_lat, ratings_lon = (
    ratings[axis].values for axis in ('latitude', 'longitude')
)

In [356]:
# Empty containers; the following cell fills them with park name -> [latitude, longitude].
no_ratings_dict = {}
ratings_dict = {}

In [357]:
# Map every park name to its [latitude, longitude] pair, one dict per group.
no_ratings_dict.update(
    {name: [lat_deg, lon_deg]
     for name, lat_deg, lon_deg in zip(no_ratings_lst, no_ratings_lat, no_ratings_lon)}
)
ratings_dict.update(
    {name: [lat_deg, lon_deg]
     for name, lat_deg, lon_deg in zip(ratings_lst, ratings_lat, ratings_lon)}
)

In [358]:
# For every park without reviews, find the reviewed park with the smallest
# great-circle distance and record (nearest park name, distance in km).
closest_dict = {}
for site in no_ratings_dict.keys():
    lowest = None
    lowest_site = None

    for site1 in ratings_dict.keys():
        # Read kilometres from the distance object's .km attribute instead of
        # slicing the ' km' unit off its string form.
        dist = great_circle(no_ratings_dict[site][0:2], ratings_dict[site1]).km

        if lowest is None or dist < lowest:
            lowest = dist
            lowest_site = site1
    closest_dict[site] = (lowest_site, lowest)

In [359]:
closest_dict

{'Aulavik National Park': ('Sirmilik National Park', 1216.9442828418512),
 'Qausuittuq National Park': ('Sirmilik National Park', 648.9066794669081),
 'Tuktut Nogait National Park': ('Nahanni National Park Reserve',
  828.0115337095184),
 'Ukkusiksalik National Park': ('Sirmilik National Park', 883.9479127587751),
 'Vuntut National Park': ('Nahanni National Park Reserve', 1008.1595892075644)}

### The two closest parks to the ones with no ratings are Sirmilik and Nahanni

In [360]:
tripadvisor_df.loc[['Sirmilik National Park', 'Nahanni National Park Reserve']]

Unnamed: 0_level_0,latitude,longitude,critiera7,criteria8,criteria9,criteria10,established,tareviews,ranking,rating,visitors,region,Age
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Sirmilik National Park,72.9906,-81.13732,0.0,0.0,0.0,0.0,2001,4,"(50, 50, 0, 0, 0)",4.5,84,North America,16
Nahanni National Park Reserve,61.547222,-125.589444,1.0,1.0,0.0,0.0,1972,18,"(89, 11, 0, 0, 0)",5.0,1082,North America,45


### These seem similar enough. Let's give the ones with no ratings the same TripAdvisor ratings as the closest park with ratings.

In [361]:
# Copy the review fields of the nearest reviewed park onto each park that has no reviews.
cols = ['tareviews', 'ranking', 'rating']

for unrated_park, (nearest_park, _distance) in closest_dict.items():
    tripadvisor_df.loc[unrated_park, cols] = tripadvisor_df.loc[nearest_park, cols]

In [362]:
# Narrow the TripAdvisor table to the five columns listed below.
cols_to_keep = ['tareviews', 'ranking', 'rating', 'visitors', 'Age']
tripadvisor_df = tripadvisor_df.loc[:, cols_to_keep]

### Let's merge our Trip Advisor dataframe with our UNESCO dataframe.

In [374]:
# Outer-join the combined parks table with the TripAdvisor table on the index (park name).
df3 = pd.merge(df2, tripadvisor_df, how='outer', left_index=True, right_index=True)

In [375]:
df3.columns

Index(['area_hectares', 'country', 'UNESCO', 'age', 'year_established',
       'tareviews', 'ranking', 'rating', 'visitors', 'Age'],
      dtype='object')

#### Drop the rows with no tripadvisor reviews

In [376]:
# Collect the index labels whose 'tareviews' is null and drop those rows in place.
rows_to_drop = df3.index[df3['tareviews'].isnull().values].values
df3.drop(labels=rows_to_drop, axis=0, inplace=True)
df3.shape

(148, 10)

In [377]:
df3.columns

Index(['area_hectares', 'country', 'UNESCO', 'age', 'year_established',
       'tareviews', 'ranking', 'rating', 'visitors', 'Age'],
      dtype='object')

In [378]:
df3[df3['area_hectares'].isnull()]

Unnamed: 0,area_hectares,country,UNESCO,age,year_established,tareviews,ranking,rating,visitors,Age
El Vizcaíno Biosphere Reserve,,,,,,1.0,"(100, 0, 0, 0, 0)",5.0,13000.0,29.0
Gulf Islands National Park Reserve,,,,,,16.0,"(82, 12, 6, 0, 0)",5.0,140000.0,14.0
Nahanni National Park Reserve,,,,,,18.0,"(89, 11, 0, 0, 0)",5.0,1082.0,45.0
Nanda Devi and Valley of Flowers National Parks,,,,,,213.0,"(79, 17, 4, 0, 0)",4.5,15000.0,29.0
Pinnacles National Park,,,,,,411.0,"(63, 30, 5, 1, 1)",4.5,215555.0,4.0
Qausuittuq National Park,,,,,,4.0,"(50, 50, 0, 0, 0)",4.5,21.0,2.0
Simien Mountains National Park,,,,,,682.0,"(81, 14, 3, 1, 1)",4.5,250000.0,48.0
Thousand Island National Park,,,,,,36.0,"(52, 30, 8, 5, 5)",4.0,90961.0,113.0
Torngat Mountains National Park,,,,,,1.0,"(100, 0, 0, 0, 0)",5.0,700.0,9.0
Uluṟu-Kata Tjuṯa National Park,,,,,,3375.0,"(80, 15, 3, 1, 1)",4.5,279461.0,59.0


#### Load information for missing rows

In [380]:
# Load the pickled table that supplies the missing values and index it by site name.
missing_rows = '../pickle_files/missing_rows.p'
missing_rows_df = pd.read_pickle(missing_rows).set_index('site')

In [403]:
missing_rows_df

Unnamed: 0_level_0,area_hectares,country,unesco,age
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
El Vizcaíno Biosphere Reserve,369631,Mexico,1,29
Gulf Islands National Park Reserve,3600,Canada,0,14
Nahanni National Park Reserve,3005000,Canada,1,45
Nanda Devi and Valley of Flowers National Parks,71783,India,1,29
Pinnacles National Park,10767,United States of America,0,4
Qausuittuq National Park,1100000,Canada,0,2
Simien Mountains National Park,22000,Ethiopia,1,48
Torngat Mountains National Park,970000,Canada,0,9
Uluṟu-Kata Tjuṯa National Park,133372,Australia,1,59
Thousand Islands National Park,2440,Canada,0,113


In [381]:
# Outer-join with the missing-rows table on the index; overlapping column names
# receive pandas' default '_x' (left) and '_y' (right) suffixes.
df4 = pd.merge(df3, missing_rows_df, how='outer', left_index=True, right_index=True)

In [382]:
df4

Unnamed: 0,area_hectares_x,country_x,UNESCO,age_x,year_established,tareviews,ranking,rating,visitors,Age,area_hectares_y,country_y,unesco,age_y
Acadia National Park,12300.0,United States of America,0.0,98.0,,175.0,"(89, 9, 1, 1, 0)",5,3303393.0,101.0,,,,
American Samoa National Park,3600.0,United States of America,0.0,29.0,,46.0,"(58, 30, 10, 2, 0)",4.5,28892.0,29.0,,,,
Arches National Park,30900.0,United States of America,0.0,46.0,,1441.0,"(85, 11, 2, 1, 1)",5,1585718.0,88.0,,,,
Aulavik National Park,1220000.0,,0.0,25.0,1992.0,4.0,"(50, 50, 0, 0, 0)",4.5,8.0,25.0,,,,
Auyuittuq National Park,1970740.0,,0.0,41.0,1976.0,10.0,"(80, 10, 10, 0, 0)",4.5,400.0,16.0,,,,
Badlands National Park,98200.0,United States of America,0.0,39.0,,1524.0,"(87, 11, 1, 1, 0)",5,996263.0,78.0,,,,
Banc d'Arguin National Park,1200000.0,Mauritania,1.0,,,4.0,"(50, 25, 25, 0, 0)",4.5,100.0,39.0,,,,
Big Bend National Park,324200.0,United States of America,0.0,73.0,,451.0,"(77, 18, 3, 1, 1)",4.5,388290.0,73.0,,,,
Biscayne National Park,70000.0,United States of America,0.0,37.0,,41.0,"(39, 43, 14, 2, 2)",4,514709.0,37.0,,,,
Black Canyon of the Gunnison National Park,13300.0,United States of America,0.0,18.0,,747.0,"(80, 16, 2, 1, 1)",5,238018.0,18.0,,,,


In [383]:
# Fill the gaps from the columns contributed by missing_rows_df.
# 'Age' and 'UNESCO' carry no merge suffix; they are paired here with
# 'age_y' and the lower-case 'unesco' column respectively.
# NOTE(review): verify the column names after this call, since merge_columns derives them from the cols_to names.
cols_to = ['area_hectares_x','Age', 'country_x', 'UNESCO']
cols_from = ['area_hectares_y', 'age_y','country_y', 'unesco']
df4 = merge_columns(df4, cols_to, cols_from)

In [384]:
# The same park appears under two spellings: 'Thousand Island ...' carries the
# TripAdvisor fields and 'Thousand Islands ...' the rest. Copy the TripAdvisor
# fields onto the 'Islands' row, then discard the 'Island' row.
cols_to = ['tareviews', 'ranking', 'rating', 'visitors']
df4.loc['Thousand Islands National Park', cols_to] = (
    df4.loc['Thousand Island National Park', cols_to]
)
df4 = df4.drop('Thousand Island National Park', axis=0)

In [396]:
# Remove the three columns listed below from the final table.
df4 = df4.drop(['country', 'year_established', 'age_x'], axis=1)

In [391]:
# Map the truncated labels 'UNES' and 'A' back to 'UNESCO' and 'Age'.
# DataFrame.rename leaves the frame unchanged for labels that are not present.
df4 = df4.rename(columns = {'UNES': 'UNESCO', 'A': 'Age'})

In [407]:
# Remove the 'Talampaya National Park' row before saving.
# NOTE(review): the notebook gives no reason for excluding this site — document why.
df4.drop('Talampaya National Park', axis = 0, inplace = True)

In [408]:
# Persist the cleaned table as a pickle file.
df4.to_pickle('../pickle_files/cleaned_df.p')