## Trails Filtering and Scoring

### Import data and drop columns we don't need

In [129]:
import numpy as np
import pandas as pd

# Import data
trails = pd.read_csv("alltrailsexpanded.csv")
user = pd.read_csv("user input.csv")

# Remove columns
trails = trails.drop("Unnamed: 0", axis=1)
trails = trails.drop("_geoloc", axis=1)
trails = trails.drop("units", axis=1)
trails = trails.drop("country_name", axis=1)
trails = trails.drop("features_not_coded", axis=1)
trails = trails.drop("activities_not_coded", axis=1)
trails = trails.rename(columns={"latitiude": "latitude"})

### Categorize columns into numeric or character and ensure types

In [130]:
# Categorize columns
numeric_cols = ['Trail ID','latitude','longitude','popularity','length','elevation_gain','difficulty_rating',
                'visitor_usage','avg_rating','num_reviews','TF_dogs-no','TF_forest','TF_historic-site','TF_kids',
                'TF_dogs-leash','TF_ada','TF_beach','TF_cave','TF_views','TF_river','TF_city-walk','TF_lake',
                'TF_rails-trails','TF_partially-paved','TF_dogs','TF_wildlife','TF_paved','ACT_birding','ACT_hiking',
                'ACT_backpacking','ACT_walking','ACT_camping','ACT_nature-trips','ACT_off-road-driving','ACT_road-biking',
                'ACT_mountain-biking','ACT_scenic-driving','ACT_bike-touring','ACT_fishing','ACT_snowshoeing',
                'ACT_paddle-sports','ACT_trail-running','ACT_horseback-riding','ACT_rock-climbing',
                'ACT_cross-country-skiing','ACT_sea-kayaking','ACT_fly-fish','ACT_canoeing','ACT_whitewater-kayaking',
                'ACT_skiing','ACT_surfing','ACT_snowboarding','ACT_ice-climbing','ACT_rails-trails','Grade']
character_cols = ['name','area_name','city_name','state_name','route_type']

# Ensure types
for col in numeric_cols:
    trails[col] = pd.to_numeric(trails[col])
for col in character_cols:
    trails[col] = trails[col].astype(str)

### Find distance from user for each trail

In [131]:
# Create Trail Distance Variable
from math import radians, cos, sin, asin, sqrt

# Find user coords
user_lat = float(user.loc()[54][3])
user_long = float(user.loc()[55][3])

def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 3956 # Radius of earth in miles
    return c * r

distance_to_user = [0]
for index, row in trails.iterrows():
    trail_lat = row['latitude']
    trail_long = row['longitude']
    
    dist = round(haversine(trail_long, trail_lat, user_long, user_lat),2)
    
    distance_to_user.append(dist)
    
distance_to_user.pop(0)

trails['Distance'] = distance_to_user
numeric_cols.append('Distance')

print("User Location:",str(user_lat), str(user_long))

User Location: 35.567165 -82.543786


### Create general trail information dataframe

In [132]:
trail_info = trails[['Trail ID', 'name', 'length', 'difficulty_rating', 'Distance', 'area_name', 'latitude', 'longitude']]

### Apply required filters

In [133]:
# filter trails to meet required input
required_input = user[user['Class'] == 'Required']

valid_trails = trails
print("There are", str(len(valid_trails)), "valid trails.\n")
for index, row in required_input.iterrows():
    starting_len = len(valid_trails)
    
    currCol = row['Variable']
    currFilter = row['Filter']
    currValue = row['Value']
    
    print(currCol,currFilter,currValue)
    
    if currCol in numeric_cols:
        currValue = int(currValue)
    else:
        currValue = str(currValue)
    
    if currFilter == 'Equals':
        valid_trails = valid_trails[valid_trails[currCol] == currValue]
    elif currFilter == 'Greater Than':
        valid_trails = valid_trails[valid_trails[currCol] >= currValue]
    else:
        valid_trails = valid_trails[valid_trails[currCol] <= currValue]
        
    ending_len = len(valid_trails)
        
    print("Filtered out",str(starting_len - ending_len),"trails.\n")
print("After filtering, there are",str(len(valid_trails)),"valid trails.")

There are 3313 valid trails.

popularity Greater Than 5
Filtered out 1194 trails.

avg_rating Greater Than 3
Filtered out 0 trails.

TF_views Equals 1
Filtered out 177 trails.

ACT_hiking Equals 1
Filtered out 104 trails.

ACT_off-road-driving Equals 0
Filtered out 9 trails.

Distance Less Than 1000
Filtered out 1406 trails.

After filtering, there are 423 valid trails.


### Score for similarity based on optional criteria

In [134]:
# Find optional criteria
optional_input = user[user['Class'] == 'Optional']

# This function creates new the new similarity column based on column type
def simi_finder(trail, currCol, currValue):
    
    # all numeric
    if currCol in numeric_cols:
        currValue = int(currValue)
        
        # binary field
        if max(valid_trails[currCol] == 1):
            if trail[currCol] == currValue:
                return 1
            else:
                return 0
        
        # continuous field (like length)
        else:
            if trail[currCol] != 0:
                return max(0, round(1 - (abs(currValue - trail[currCol]) / trail[currCol]), 2))
            else:
                return 0
    
    # all character
    else:
        currValue = str(currValue) 
        
        if trail[currCol] == currValue:
            return 1
        else:
            return 0


for index, row in optional_input.iterrows():
    currCol = row['Variable']
    currValue = row['Value']
    
    print(currCol, ' ', currValue)
    
    newColName = currCol + '_similarity'
    
    valid_trails[newColName] = valid_trails.apply(lambda trail: simi_finder(trail, currCol, currValue), axis=1)

length   5000
route_type   out and back
num_reviews   50
TF_historic-site   1
TF_cave   1
TF_river   1
TF_lake   1
TF_dogs   1
TF_paved   1
ACT_camping   1
ACT_mountain-biking   1
ACT_scenic-driving   1
ACT_trail-running   1


### Calculate score and append to info

In [135]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

similarity_scores = valid_trails.filter(regex='similarity')
similarity_scores['score'] = round((similarity_scores.sum(axis=1) / len(optional_input))*100,2)
valid_trails['Score'] = similarity_scores['score']
trail_scores = valid_trails[['Trail ID', 'Score']]

trail_info = pd.merge(trail_info, trail_scores, on='Trail ID')
trail_info

Unnamed: 0,Trail ID,name,length,difficulty_rating,Distance,area_name,latitude,longitude,Score
0,10041922,Sunset Trail,20921.420,5,597.16,Hot Springs National Park,34.52560,-93.03637,21.00
1,10235890,West Mountain Trail,3057.746,3,598.55,Hot Springs National Park,34.51279,-93.05836,21.85
2,10264755,Goat Rock Trail,3862.416,3,597.33,Hot Springs National Park,34.52802,-93.03995,33.77
3,10264753,Gulpha Gorge Trail,1931.208,3,597.17,Hot Springs National Park,34.52556,-93.03645,20.54
4,10007938,Dead Chief Trail,4345.218,3,598.20,Hot Springs National Park,34.51359,-93.05238,36.85
...,...,...,...,...,...,...,...,...,...
418,10029295,Ivy Creek via Appalachian Trail,4345.218,1,284.89,Shenandoah National Park,38.28487,-78.65889,29.62
419,10013306,Thorton Gap and Buck Hollow Trail,19473.014,3,317.12,Shenandoah National Park,38.65839,-78.28212,9.69
420,10015003,The Peak Trail,14805.928,5,327.98,Shenandoah National Park,38.80698,-78.18120,25.69
421,10327754,Cave Falls via Hazel River and White Rocks Trail,10460.710,3,316.18,Shenandoah National Park,38.61445,-78.25682,26.77


### Map top trails

In [136]:
import plotly.express as px
import pandas as pd

color_scale = [(0, 'yellow'), (1,'darkgreen')]

fig = px.scatter_mapbox(trail_info, 
                        lat="latitude", 
                        lon="longitude", 
                        hover_name="name", 
                        hover_data=["Score", "area_name", "Distance"],
                        color="Score",
                        size="Score",
                        color_continuous_scale=color_scale,
                        zoom=3, 
                        height=800,
                        width=800)

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

### Print most optimal trails

In [137]:
trail_info[['name', 'Distance', 'length', 'difficulty_rating', 'Score']]

Unnamed: 0,name,Distance,length,difficulty_rating,Score
0,Sunset Trail,597.16,20921.420,5,21.00
1,West Mountain Trail,598.55,3057.746,3,21.85
2,Goat Rock Trail,597.33,3862.416,3,33.77
3,Gulpha Gorge Trail,597.17,1931.208,3,20.54
4,Dead Chief Trail,598.20,4345.218,3,36.85
...,...,...,...,...,...
418,Ivy Creek via Appalachian Trail,284.89,4345.218,1,29.62
419,Thorton Gap and Buck Hollow Trail,317.12,19473.014,3,9.69
420,The Peak Trail,327.98,14805.928,5,25.69
421,Cave Falls via Hazel River and White Rocks Trail,316.18,10460.710,3,26.77


### Recommend areas

In [142]:
areas = trail_info['area_name'].unique()

area_info = pd.DataFrame()
for area in areas:
    area_trails = trail_info[trail_info['area_name'] == area]
    area_lat = area_trails['latitude'].mean()
    area_long = area_trails['longitude'].mean()
    area_distance = round(haversine(area_long, area_lat, user_long, user_lat),2)
    
    area_max = max(area_trails['Score'])
    area_average = area_trails['Score'].mean()
    area_min = min(area_trails['Score'])
    area_count = len(area_trails)
    area_sum = sum(area_trails['Score'])
    
    area_output = pd.DataFrame(data = {'Area':[area],'Distance':[area_distance],
                                       'Min Score':[area_min],'Mean Score':[area_average],
                                      'Max Score':[area_max],'Sum Score':[area_sum],'Trail Count':[area_count]})
    
    area_info = pd.concat([area_info, area_output])
    
area_info.sort_values(by=['Mean Score'], ascending=False)

Unnamed: 0,Area,Distance,Min Score,Mean Score,Max Score,Sum Score,Trail Count
0,Fort Hunt National Park,370.5,43.0,43.0,43.0,43.0,1
0,Cuyahoga Valley National Park,395.56,14.54,30.117333,53.54,903.52,30
0,Mammoth Cave National Park,227.05,16.85,29.931,46.23,299.31,10
0,Congaree National Park,155.27,25.77,27.436667,30.23,82.31,3
0,Hot Springs National Park,598.07,14.85,24.51,36.85,245.1,10
0,Acadia National Park,966.37,3.23,23.765755,43.85,2519.17,106
0,Shenandoah National Park,305.61,0.77,22.942165,40.23,2225.39,97
0,Great Smoky Mountains National Park,53.18,0.0,21.199091,53.62,3264.66,154
0,Indiana Dunes National Park,486.89,7.46,19.591667,29.62,117.55,6
0,Everglades National Park,713.43,0.0,11.014,21.62,55.07,5
