In [137]:
import numpy as np
import pandas as pd

# Read the trail catalogue and the user's preference sheet, then strip the
# trail table of the columns this notebook never looks at.

trails = pd.read_csv("alltrailsexpanded.csv")
user = pd.read_csv("user input.csv")

trails = trails.drop(
    columns=[
        "Unnamed: 0",
        "_geoloc",
        "units",
        "country_name",
        "features_not_coded",
        "activities_not_coded",
    ]
).rename(columns={"latitiude": "latitude"})  # source file misspells "latitude"

In [138]:
# Coerce every column to the dtype the rest of the notebook expects:
# numeric_cols are parsed as numbers, character_cols are cast to str.

numeric_cols = [
    # general trail attributes
    'Trail ID', 'latitude', 'longitude', 'popularity', 'length', 'elevation_gain',
    'difficulty_rating', 'visitor_usage', 'avg_rating', 'num_reviews',
    # TF_* columns
    'TF_dogs-no', 'TF_forest', 'TF_historic-site', 'TF_kids', 'TF_dogs-leash', 'TF_ada',
    'TF_beach', 'TF_cave', 'TF_views', 'TF_river', 'TF_city-walk', 'TF_lake',
    'TF_rails-trails', 'TF_partially-paved', 'TF_dogs', 'TF_wildlife', 'TF_paved',
    # ACT_* columns
    'ACT_birding', 'ACT_hiking', 'ACT_backpacking', 'ACT_walking', 'ACT_camping',
    'ACT_nature-trips', 'ACT_off-road-driving', 'ACT_road-biking', 'ACT_mountain-biking',
    'ACT_scenic-driving', 'ACT_bike-touring', 'ACT_fishing', 'ACT_snowshoeing',
    'ACT_paddle-sports', 'ACT_trail-running', 'ACT_horseback-riding', 'ACT_rock-climbing',
    'ACT_cross-country-skiing', 'ACT_sea-kayaking', 'ACT_fly-fish', 'ACT_canoeing',
    'ACT_whitewater-kayaking', 'ACT_skiing', 'ACT_surfing', 'ACT_snowboarding',
    'ACT_ice-climbing', 'ACT_rails-trails',
    'Grade',
]
character_cols = ['name', 'area_name', 'city_name', 'state_name', 'route_type']

# Build the converted columns in one pass each instead of looping.
trails = trails.assign(
    **{name: pd.to_numeric(trails[name]) for name in numeric_cols}
).astype({name: str for name in character_cols})

In [139]:
# Keep only the trails that satisfy every "Required" row of the user input.
# Each required row names a column (Variable), a comparison (Filter) and a
# threshold (Value); the filters are applied one after another and the number
# of trails removed by each one is reported.
required_input = user[user['Class'] == 'Required']

valid_trails = trails
print("There are", str(len(valid_trails)), "valid trails.\n")
for index, row in required_input.iterrows():
    starting_len = len(valid_trails)

    currCol = row['Variable']
    currFilter = row['Filter']
    currValue = row['Value']

    print(currCol,currFilter,currValue)

    # Parse numeric thresholds as float rather than int: int() would reject a
    # string such as "3.5" and would truncate a float 3.5 down to 3.
    if currCol in numeric_cols:
        currValue = float(currValue)
    else:
        currValue = str(currValue)

    if currFilter == 'Equals':
        keep = valid_trails[currCol] == currValue
    elif currFilter == 'Greater Than':
        # Inclusive: trails exactly at the threshold are kept.
        keep = valid_trails[currCol] >= currValue
    else:
        # Any other filter label is treated as an inclusive upper bound.
        keep = valid_trails[currCol] <= currValue
    valid_trails = valid_trails[keep]

    ending_len = len(valid_trails)

    print("Filtered out",str(starting_len - ending_len),"trails.\n")
print("After filtering, there are",str(len(valid_trails)),"valid trails.")

# Detach the result from `trails`. Without this, valid_trails is either a
# slice of `trails` (columns added later raise SettingWithCopyWarning) or, when
# there are no required rows, `trails` itself (columns added later would leak
# into the raw table).
valid_trails = valid_trails.copy()

There are 3313 valid trails.

popularity Greater Than 5
Filtered out 1194 trails.

avg_rating Greater Than 3
Filtered out 0 trails.

TF_views Equals 1
Filtered out 177 trails.

TF_dogs Equals 1
Filtered out 1671 trails.

ACT_hiking Equals 1
Filtered out 16 trails.

ACT_off-road-driving Equals 0
Filtered out 1 trails.

ACT_road-biking Equals 0
Filtered out 10 trails.

ACT_mountain-biking Equals 0
Filtered out 11 trails.

After filtering, there are 233 valid trails.


In [140]:
# Rows classed as "Optional" are the preferences each trail is compared
# against when building its similarity score.
optional_input = user.loc[user['Class'] == 'Optional']

def simi_finder(trail, currCol, currValue):
    """Score how well one trail matches a single optional preference.

    Parameters
    ----------
    trail : row of ``valid_trails`` (as passed by ``DataFrame.apply(axis=1)``)
    currCol : name of the column the preference refers to
    currValue : the value the user asked for (string or number)

    Returns
    -------
    * Text column (not in ``numeric_cols``): 1 on exact string match, else 0.
    * Binary numeric column: 1 if the trail's flag equals the requested
      value, else 0.
    * Other numeric column: ``1 - |wanted - actual| / |actual|``, or 0 when
      the trail's value is 0 (division by zero is avoided).

    Reads the module-level ``numeric_cols`` and ``valid_trails``.
    """
    # Text columns: exact match only.
    if currCol not in numeric_cols:
        return 1 if trail[currCol] == str(currValue) else 0

    # float, not int: int() rejects "2.5" and truncates 2.5 to 2.
    currValue = float(currValue)
    actual = trail[currCol]

    # A column is a binary flag only when *every* non-null value is 0 or 1.
    # (Checking merely whether some value equals 1 misclassifies a continuous
    # column such as length as soon as one trail happens to have the value 1.)
    is_binary = valid_trails[currCol].dropna().isin([0, 1]).all()
    if is_binary:
        return 1 if actual == currValue else 0

    # Continuous field (like length): relative difference from the trail's value.
    if actual == 0:
        return 0
    # NOTE(review): this is unbounded below -- a request far larger than the
    # trail's value yields a large negative score that can outweigh all the
    # 0/1 criteria. Confirm whether clamping to [0, 1] is wanted.
    return 1 - abs((currValue - actual) / actual)


# Add one "<column>_similarity" column to valid_trails per optional preference.
for index, row in optional_input.iterrows():
    currCol = row['Variable']
    currValue = row['Value']

    print(currCol, currValue)

    newColName = currCol + '_similarity'

    similarity = valid_trails.apply(lambda trail: simi_finder(trail, currCol, currValue), axis=1)

    # assign() returns a fresh frame, so the new column is never written into
    # a slice of `trails` (which raises SettingWithCopyWarning) nor into
    # `trails` itself when no required filter was applied.
    valid_trails = valid_trails.assign(**{newColName: similarity})

length 1000
route_type out and back
num_reviews 50
TF_historic-site 1
TF_cave 1
TF_river 1
TF_paved 1
ACT_camping 1
ACT_scenic-driving 1


In [142]:
import warnings
# Total score per trail = sum of its "<column>_similarity" columns.
# The regex is anchored to the suffix so that only those columns are summed.
# No global warning filter is installed here: the chained-assignment warning
# it used to hide is avoided by working on explicit copies instead.
valid_trails_simi = valid_trails.filter(regex='_similarity$').copy()
valid_trails_simi['score'] = valid_trails_simi.sum(axis=1)

# assign() builds a new frame rather than writing into a possible slice.
valid_trails = valid_trails.assign(Score=valid_trails_simi['score'])

In [145]:
# Trails that fit the criteria the closest, best match first.
# sort_values returns a new frame, so the result is stored; otherwise
# final_trails would stay in its original, unsorted order for later cells.
final_trails = valid_trails[['Trail ID','Score']].sort_values(by='Score', ascending=False)
final_trails

Unnamed: 0,Trail ID,Score
1243,10268406,5.327292
1805,10300429,4.158776
2697,10034432,3.980755
1790,10257890,3.656685
1782,10235835,3.457764
...,...,...
1367,10000372,-3.928591
2559,10015003,-4.075317
2528,10263869,-4.138014
2550,10263938,-6.941380
