# ML Models

## Import Libraries & Load Dataframe from AWS DB

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from math import sin, cos, sqrt, atan2, radians

import query_helper
import get_new_route
import json


## Get input from user for recommendation

In [2]:
# Load the route table keyed by `id` and keep only the columns listed below,
# in this order.
df_numeric = pd.read_csv('df.csv', index_col='id').loc[:, [
    'name', 'rating', 'stars', 'starVotes', 'pitches',
    'location', 'region', 'area', 'sub_area', 'wall',
    'longitude', 'latitude', 'url',
    'Sport', 'Trad', 'Boulder', 'TR', 'Alpine', 'Aid', 'Ice', 'Snow', 'Mixed',
    'danger', 'rope_grade', 'boulder_grade', 'infos',
    'slab', 'traverse', 'roof', 'corner', 'crack', 'face', 'flake', 'fingers',
    'jug', 'exposed', 'dihedral', 'sustained', 'technical', 'run out',
    'well protected', 'chimney', 'offwidth', 'stem', 'arete', 'crimp',
    'vertical', 'powerful',
    'in_range',
]]
df_numeric.head()

Unnamed: 0_level_0,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,...,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
105714722,Central Yellow Wall,V3 R,4.4,22,0,South Dakota,Custer State Park,Sylvan Lake,Sylvan Lake Bouldering,Campground Boulder,...,0,0,0,0,0,0,0,0,0,1
105714728,Waves,5.8,4.6,217,2,South Dakota,The Needles Of Rushmore,Mount Rushmore National Memorial,South Seas,Shipyard Rock,...,0,0,0,0,0,0,0,1,0,1
105714731,East Chimney Variation,5.7,4.1,40,2,South Dakota,Custer State Park,Cathedral Spires,Station 13,0,...,0,0,0,0,0,0,0,0,0,1
105714734,Conn Diagonal,5.7,4.9,151,3,South Dakota,Custer State Park,Sylvan Lake,Outlets,Outer Outlet,...,0,0,0,0,1,0,0,0,0,1
105714737,Bolts for Bobs,5.8,3.5,125,1,South Dakota,The Needles Of Rushmore,Mount Rushmore National Memorial,South Seas,Borneo,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# --- Search parameters for one recommendation request ---
target_id = 106875741                       # id of the climb to find look-alikes for
target_lat, target_lon = 32.9127, -116.882  # initial coordinates; reassigned from get_coords() further down
target_state = target_city = None           # optional city/state inputs for the coordinate lookup
target_zipcode = '92008'                    # zip code input for the coordinate lookup
target_radius_range = 60                    # search radius handed to in_range()
star_limit = 0                              # minimum star rating handed to star_cutoff()
# TODO: other parameters to be added here later

### Get coordinates for zip or city

In [4]:
# Read the zip-code / coordinate records once; get_coords() iterates over them.
with open('us-zip-code-latitude-and-longitude.json') as coord_file:
    coord_dict = json.load(coord_file)

In [5]:
def get_coords(city=None, state=None, zipcode=None):
    """Look up the latitude/longitude for a zip code or a city/state pair.

    Scans the module-level ``coord_dict`` records in file order. Each record is
    expected to carry a ``'fields'`` mapping with ``'zip'``, ``'state'``,
    ``'city'``, ``'latitude'`` and ``'longitude'`` keys. For every record the zip
    code is checked first, then the city/state pair, and the first hit wins.

    Parameters
    ----------
    city : str, optional
        City name; only used together with ``state``.
    state : str, optional
        State value, compared verbatim with the record's ``'state'`` field.
    zipcode : str, optional
        Zip code, compared verbatim with the record's ``'zip'`` field.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the first matching record, or
        ``(None, None)`` when nothing matches.
    """
    # The loop variable must not be called `city`: that shadowed the parameter and
    # made the city comparison test a string against the record dict (never true).
    for record in coord_dict:
        fields = record['fields']
        if zipcode is not None and fields['zip'] == zipcode:
            return fields['latitude'], fields['longitude']
        if (city is not None and state is not None
                and fields['state'] == state and fields['city'] == city):
            return fields['latitude'], fields['longitude']
    # if nothing is found return none
    return None, None

In [6]:
# Resolve the search centre from the city/state/zip inputs. This overwrites the
# target_lat/target_lon values assigned in the parameters cell; both become None
# when get_coords() finds no matching record.
target_lat, target_lon = get_coords(target_city, target_state, target_zipcode)

In [7]:
print(target_lat, target_lon)

33.158137 -117.32646


### Create fxn to see if climb is in search range

In [14]:
#function takes search param range and assigns to original df if climb in_range
def in_range(df_fxn, lat, lon, radius_range=None):
    """Flag every climb in ``df_fxn`` as inside (1) or outside (0) the search radius.

    The frame is modified in place: its ``'in_range'`` column is (re)written.
    Distances are great-circle (haversine) distances between the target point and
    each row's ``'latitude'`` / ``'longitude'`` (degrees). The Earth radius used
    is 3958.8, i.e. miles, so ``radius_range`` is interpreted as miles.

    Parameters
    ----------
    df_fxn : pandas.DataFrame
        Climbs with ``'latitude'`` and ``'longitude'`` columns.
    lat, lon : float or None
        Target coordinates in degrees. If either is ``None`` every climb is
        flagged as in range.
    radius_range : float, optional
        Search radius. A falsy value (``None`` or ``0``) disables the distance
        filter and flags every climb as in range.

    Returns
    -------
    None
    """
    # No radius or no usable target coordinates: nothing to filter on.
    if not radius_range or lat is None or lon is None:
        df_fxn['in_range'] = 1
        return

    earth_radius = 3958.8

    # Target and climb coordinates in radians; the climbs are handled as whole
    # arrays instead of one Python-level iteration per row.
    lat1 = np.radians(lat)
    lon1 = np.radians(lon)
    lat2 = np.radians(df_fxn['latitude'].to_numpy(dtype=float))
    lon2 = np.radians(df_fxn['longitude'].to_numpy(dtype=float))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt(1 - a) invalid; clipping leaves NaN (missing coordinates) untouched.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    distance = earth_radius * c

    # Rows with missing coordinates yield NaN distances, which compare False -> 0.
    df_fxn['in_range'] = (distance < radius_range).astype(int)



In [13]:
def star_cutoff(df_fxn, star_limit=3.5):
    """Drop low-rated climbs from the candidate set by updating ``'in_range'`` in place.

    A climb keeps ``in_range == 1`` only if its ``'stars'`` value is at least
    ``star_limit`` and its current ``'in_range'`` value is not 0; every other
    row is set to 0.

    Parameters
    ----------
    df_fxn : pandas.DataFrame
        Climbs with ``'stars'`` and ``'in_range'`` columns.
    star_limit : float, optional
        Minimum star rating (inclusive). Defaults to 3.5.

    Returns
    -------
    None
    """
    # One vectorised mask instead of a Python-level loop over every row.
    keep = (df_fxn['stars'] >= star_limit) & (df_fxn['in_range'] != 0)
    df_fxn['in_range'] = keep.astype(int)

### Call function to assign if climb in range

In [15]:
# Flag each climb in df_numeric (in place, through its 'in_range' column) as inside
# or outside the search radius around the target coordinates. Rows flagged 1 are
# the ones later selected into df_in_range for comparison.
in_range(df_numeric, lat = target_lat, lon = target_lon, radius_range=target_radius_range)

### Star cutoff (i.e., only give results for routes with at least `star_limit` stars; the function default is 3.5)

In [17]:
# Set 'in_range' to 0 for climbs rated below star_limit (in place). star_limit is 0
# in the parameters cell, so only the stars >= 0 condition applies here.
star_cutoff(df_numeric, star_limit)

In [18]:
df_numeric.in_range.value_counts()

0    34441
1     2172
Name: in_range, dtype: int64

### To begin, see if the climb already exists in db

In [20]:
# Use the stored row when the target climb is already in df_numeric; otherwise
# fetch it through get_new_route and append it to the table.
if target_id not in df_numeric.index:
    print('Making API call and Scraping climb data')
    if not get_new_route.get_route_details(target_id):
        print("Something went wrong")
    else:
        # presumably get_route_details() writes 'target_climb.csv' on success —
        # verify in get_new_route
        df_target = (
            pd.read_csv('target_climb.csv', index_col='id')
            .drop(columns=['Unnamed: 0'])
            .assign(in_range=1)  # a freshly fetched target is always marked in range
            .loc[:, [
                'name', 'rating', 'stars', 'starVotes', 'pitches',
                'location', 'region', 'area', 'sub_area', 'wall',
                'longitude', 'latitude', 'url',
                'Sport', 'Trad', 'Boulder', 'TR', 'Alpine', 'Aid', 'Ice', 'Snow', 'Mixed',
                'danger', 'rope_grade', 'boulder_grade', 'infos',
                'slab', 'traverse', 'roof', 'corner', 'crack', 'face', 'flake', 'fingers',
                'jug', 'exposed', 'dihedral', 'sustained', 'technical', 'run out',
                'well protected', 'chimney', 'offwidth', 'stem', 'arete', 'crimp',
                'vertical', 'powerful',
                'in_range',
            ]]
        )

        df_numeric = pd.concat([df_numeric, df_target])
else:
    print('We have climb already')


We have climb already


In [21]:
df_numeric.tail()

Unnamed: 0_level_0,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,...,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
118210488,Unknown #1,V0-1,3.0,1,0,California,Lake Tahoe,I-80 Corridor,Auburn and Grass Valley,"Auburn SRA,Tall Green Bridge,Riverside Boulder...",...,0,0,0,0,0,0,0,0,0,0
118210496,Unknown #2,V1-2,3.0,1,0,California,Lake Tahoe,I-80 Corridor,Auburn and Grass Valley,"Auburn SRA,Tall Green Bridge,Riverside Boulder...",...,0,0,0,0,0,0,0,0,0,0
118210504,Unknown #3,V1-2,3.0,1,0,California,Lake Tahoe,I-80 Corridor,Auburn and Grass Valley,"Auburn SRA,Tall Green Bridge,Riverside Boulder...",...,0,0,0,0,0,1,0,0,0,0
118211517,Mertensia Pillar,WI5,4.0,1,1,Colorado,CO Ice & Mixed,RMNP - Mixed/Ice,Wild Basin,Mertensia Falls,...,0,0,0,0,0,0,0,0,0,0
106875741,Via Leni 6b,5.10b,5.0,4,6,International,Europe,Switzerland,Albigna Valley,Spazzacaldeira,...,0,0,0,0,0,0,0,0,0,1


## Recommender

#### Kernel Imports

In [22]:
# Import kernels
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import laplacian_kernel
from sklearn.metrics.pairwise import chi2_kernel



In [23]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

#### Create df_in_range to run recommender in subset

In [25]:
# Rows the recommender compares: every climb flagged in range, plus the target
# climb itself. The target has to be part of this subset even when it lies
# outside the search radius or below the star cutoff; otherwise the positional
# lookup below has nothing to find.
subset_mask = (df_numeric['in_range'] == 1) | (df_numeric.index == target_id)
df_in_range = df_numeric[subset_mask].reset_index()

# Row of the target inside df_in_range. reset_index() moved `id` into a column
# and left a 0..n-1 index, so this label is also the row position.
target_matches = df_in_range.index[df_in_range['id'] == target_id]
if len(target_matches) == 0:
    raise KeyError('target climb {} is not present in df_numeric'.format(target_id))
target_index = target_matches[0]
df_in_range.shape

(2173, 50)

In [61]:
df_in_range.iloc[target_index]

id                                                        106875741
name                                                    Via Leni 6b
rating                                                        5.10b
stars                                                             5
starVotes                                                         4
pitches                                                           6
location                                              International
region                                                       Europe
area                                                    Switzerland
sub_area                                             Albigna Valley
wall                                                 Spazzacaldeira
longitude                                                    9.6383
latitude                                                    46.3387
url               https://www.mountainproject.com/route/10687574...
Sport                                           

In [62]:
target_index

2172

In [63]:
df_in_range.tail()

Unnamed: 0,id,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,longitude,latitude,url,Sport,Trad,Boulder,TR,Alpine,Aid,Ice,Snow,Mixed,danger,rope_grade,boulder_grade,infos,slab,traverse,roof,corner,crack,face,flake,fingers,jug,exposed,dihedral,sustained,technical,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
2168,118189308,Golgotha,V8-9,5.0,1,0,California,San Diego and Environs,North San Diego County,Lake Ramona,Summit Area,-116.946,33.0121,https://www.mountainproject.com/route/11818930...,0,0,1,0,0,0,0,0,0,0,0,37,Start with your left hand in a waist high unde...,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1
2169,118192124,Bloody L,5.7,4.0,2,1,California,Inland Empire,Big Rock Area,L-Slab/Lakeview Slab,0,-117.151,33.8429,https://www.mountainproject.com/route/11819212...,0,1,0,0,0,0,0,0,0,0,11,0,Begin climbing below a wide crack that begins ...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2170,118202461,Pieces of Eight,5.10c/d PG13,4.0,2,1,California,Tahquitz & Suicide Rocks,Suicide Rock,(l) Buttress of Cracks,Buttress of Cracks - Left Side,-116.694,33.7699,https://www.mountainproject.com/route/11820246...,0,1,0,0,0,0,0,0,0,1,27,0,Starts at P1 bolts for Pirate or Walk the Plan...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2171,118208812,Coyote Traverse,5.8 V0+ PG13,3.5,2,1,California,San Diego and Environs,North San Diego County,Calavera lake,Lake View Boulder,-117.289,33.1699,https://www.mountainproject.com/route/11820881...,0,0,1,1,0,0,0,0,0,1,14,0,This route connects the Coyote Line crack to t...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2172,106875741,Via Leni 6b,5.10b,5.0,4,6,International,Europe,Switzerland,Albigna Valley,Spazzacaldeira,9.6383,46.3387,https://www.mountainproject.com/route/10687574...,1,0,0,0,1,0,0,0,0,0,22,0,"Via Leni was one, if not the first modern rout...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### Scale Features

#### Create Features DF

In [116]:
# Feature frame used for the similarity comparison: star rating, climb-type
# flags, danger and grade values, then the description keyword flags.
# 'longitude' and 'latitude' are currently excluded.
features = df_in_range[[
    'stars',
    'Sport', 'Trad', 'Boulder', 'TR', 'Alpine', 'Aid', 'Ice', 'Snow', 'Mixed',
    'danger', 'rope_grade', 'boulder_grade',
    'slab', 'traverse', 'roof', 'corner', 'crack', 'face', 'flake', 'fingers',
    'jug', 'exposed', 'dihedral', 'sustained', 'technical', 'run out',
    'well protected', 'chimney', 'offwidth', 'stem', 'arete', 'crimp',
    'vertical', 'powerful',
]]

In [117]:
features.shape

(2173, 35)

#### Pick scaling type (AND UPDATE WEIGHTS)

In [185]:
# Two candidate scalers; the scaling cell that follows picks which one is
# actually applied to `features`.
min_max_scaler = MinMaxScaler()
scalar = StandardScaler()

In [186]:
##### Pick a scaling option ###############################

# Alternative 1 (disabled): standardise every feature column.
# features_scaled = scalar.fit_transform(features)
# Alternative 2 (disabled): min-max scale everything except 'danger' and 'pitches'.
# NOTE(review): 'pitches' is not among the columns selected into `features`, so this
# drop would raise a KeyError unless 'pitches' is added to the feature list first.
# features_scaled = min_max_scaler.fit_transform(features.drop(columns=['danger','pitches']))

# Active choice: min-max scale every feature column.
features_scaled = min_max_scaler.fit_transform(features)

# Companion to alternative 2 (disabled): scale danger and pitches with the
# StandardScaler and append them to the scaled feature array.
# features_scaled = np.concatenate((features_scaled, scalar.fit_transform(features[['danger', 'pitches']])), axis=1)

##################################################################

# Optional manual up-weighting of the grade columns (disabled).
# NOTE(review): with the current column order of `features`, index 10 is 'danger',
# 11 is 'rope_grade' and 12 is 'boulder_grade', so the indices below look off by
# one relative to their comments — confirm before re-enabling.
# for i in range(features_scaled.shape[0]):
#     features_scaled[i][10]=features_scaled[i][10]*100  #weight rope_grade higher
#     features_scaled[i][11]=features_scaled[i][11]*100  #weight boulder_grade higher

### Now let's fit the similarity model

#### Rec function

In [197]:
def get_recommendations(idx, kernel_type, top_n=10):
    """Return the climbs in ``df_in_range`` that score highest against one target climb.

    Reads the module-level ``features_scaled`` array (one row per row of
    ``df_in_range``) and the module-level ``df_in_range`` frame.

    Parameters
    ----------
    idx : int
        Row position of the target climb in ``features_scaled`` / ``df_in_range``.
    kernel_type : callable
        Pairwise function called as ``kernel_type(X, Y)`` with 2-D inputs and
        returning an array of shape ``(len(X), len(Y))``, e.g. the functions in
        ``sklearn.metrics.pairwise``.
    top_n : int, optional
        Number of climbs to return. Defaults to 10, the size of the previous
        hard-coded slice.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df_in_range``, highest score first, never
        including the target row itself.

    Notes
    -----
    Scores are always ranked in descending order.
    NOTE(review): for a distance function such as ``euclidean_distances`` a larger
    value means less similar, so this ordering appears to return the most distant
    climbs — confirm whether distances should be ranked ascending instead.
    """
    # Score the target against every climb in a single kernel call.
    target_vector = features_scaled[idx].reshape(1, -1)
    scores = np.asarray(kernel_type(target_vector, features_scaled), dtype=float).ravel()

    # Stable descending order: equal scores keep their original row order.
    ranking = np.argsort(-scores, kind='stable')

    # Remove the target explicitly. Dropping "whatever sorted first" is wrong when
    # another row ties with the target or when the kernel's self-score is not the
    # maximum (linear, polynomial, sigmoid kernels).
    ranking = ranking[ranking != idx][:top_n]

    # Positions in features_scaled are row positions in df_in_range.
    return df_in_range.iloc[ranking, :]

#### Cells for comparison (delete later)

In [188]:
df_numeric.loc[target_id, :]

name                                                    Via Leni 6b
rating                                                        5.10b
stars                                                             5
starVotes                                                         4
pitches                                                           6
location                                              International
region                                                       Europe
area                                                    Switzerland
sub_area                                             Albigna Valley
wall                                                 Spazzacaldeira
longitude                                                    9.6383
latitude                                                    46.3387
url               https://www.mountainproject.com/route/10687574...
Sport                                                             1
Trad                                            

In [189]:
df_numeric.shape

(36614, 49)

In [190]:
target_index

2172

In [191]:
# df_numeric[(df_numeric['boulder_grade']>5)]

#### Call rec fxn

In [192]:
# pd.options.display.max_rows= 200

In [198]:
rec=get_recommendations(target_index, cosine_similarity)
rec

Unnamed: 0,id,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,longitude,latitude,url,Sport,Trad,Boulder,TR,Alpine,Aid,Ice,Snow,Mixed,danger,rope_grade,boulder_grade,infos,slab,traverse,roof,corner,crack,face,flake,fingers,jug,exposed,dihedral,sustained,technical,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
282,105835105,Mañana,5.10,4.1,42,2,California,San Diego and Environs,South San Diego County,El Cajon Mountain,"The Wedge,Left Wall",-116.818,32.9046,https://www.mountainproject.com/route/10583510...,1,0,0,0,0,0,0,0,0,0,23,0,"Great route, rivals long face climbing found i...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
940,107167402,Candid Chimera,5.10a,4.0,3,1,California,San Diego and Environs,South San Diego County,Corte Madera,Solar Slabs,-116.591,32.7555,https://www.mountainproject.com/route/10716740...,1,0,0,0,0,0,0,0,0,0,19,0,This exciting face route is exactly 100 feet l...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1142,107950501,Stranger Danger,5.10c,4.0,46,1,California,San Diego and Environs,North San Diego County,Cougar Crag,Cougar Crag East,-117.138,33.3417,https://www.mountainproject.com/route/10795050...,1,0,0,0,0,0,0,0,0,0,25,0,Start at the bottom of the central face pull u...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
846,106986452,Close to the Edge,5.10a/b,3.9,7,1,California,San Diego and Environs,South San Diego County,Corte Madera,El Nino Wall,-116.589,32.7568,https://www.mountainproject.com/route/10698645...,1,0,0,0,0,0,0,0,0,0,21,0,The crux section is actually a new progression...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1626,111294865,Urge to Merge,5.10a/b,3.9,15,1,California,San Diego and Environs,South San Diego County,El Cajon Mountain,"The Wedge,Left Wall",-116.818,32.9044,https://www.mountainproject.com/route/11129486...,1,0,0,0,0,0,0,0,0,0,21,0,"Follow the right slanting white grove, gain th...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
134,105794217,Lord of the Rings,5.10c,3.9,33,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Middle Earth,-117.051,32.8242,https://www.mountainproject.com/route/10579421...,1,0,0,0,0,0,0,0,0,0,25,0,"Climb past two bolts to a ledge, transition on...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
65,105791566,Suzie's Wild Ride,5.9-,3.6,201,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8267,https://www.mountainproject.com/route/10579156...,1,0,0,0,0,0,0,0,0,0,16,0,The climb starts out down and right from a sma...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
903,107021544,Im Panadilla,5.10a,3.5,6,1,California,San Diego and Environs,South San Diego County,Corte Madera,El Nino Wall,-116.589,32.7568,https://www.mountainproject.com/route/10702154...,1,0,0,0,0,0,0,0,0,0,19,0,Sport climb up the middle of High Rise face.-b...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
459,106096196,Kissed by the Sun,5.8,3.6,8,1,California,Inland Empire,Big Rock Area,Helios Boulder,0,-117.173,33.8366,https://www.mountainproject.com/route/10609619...,1,0,0,0,0,0,0,0,0,0,14,0,This is the bolt line on the left side of the ...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
414,106003829,Mickey Finn,5.10b,3.4,19,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8266,https://www.mountainproject.com/route/10600382...,1,0,0,0,0,0,0,0,0,0,22,0,Thin face climbing that will force you to work...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [199]:
rec=get_recommendations(target_index, rbf_kernel)
rec

Unnamed: 0,id,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,longitude,latitude,url,Sport,Trad,Boulder,TR,Alpine,Aid,Ice,Snow,Mixed,danger,rope_grade,boulder_grade,infos,slab,traverse,roof,corner,crack,face,flake,fingers,jug,exposed,dihedral,sustained,technical,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
282,105835105,Mañana,5.10,4.1,42,2,California,San Diego and Environs,South San Diego County,El Cajon Mountain,"The Wedge,Left Wall",-116.818,32.9046,https://www.mountainproject.com/route/10583510...,1,0,0,0,0,0,0,0,0,0,23,0,"Great route, rivals long face climbing found i...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
940,107167402,Candid Chimera,5.10a,4.0,3,1,California,San Diego and Environs,South San Diego County,Corte Madera,Solar Slabs,-116.591,32.7555,https://www.mountainproject.com/route/10716740...,1,0,0,0,0,0,0,0,0,0,19,0,This exciting face route is exactly 100 feet l...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1142,107950501,Stranger Danger,5.10c,4.0,46,1,California,San Diego and Environs,North San Diego County,Cougar Crag,Cougar Crag East,-117.138,33.3417,https://www.mountainproject.com/route/10795050...,1,0,0,0,0,0,0,0,0,0,25,0,Start at the bottom of the central face pull u...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
846,106986452,Close to the Edge,5.10a/b,3.9,7,1,California,San Diego and Environs,South San Diego County,Corte Madera,El Nino Wall,-116.589,32.7568,https://www.mountainproject.com/route/10698645...,1,0,0,0,0,0,0,0,0,0,21,0,The crux section is actually a new progression...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1626,111294865,Urge to Merge,5.10a/b,3.9,15,1,California,San Diego and Environs,South San Diego County,El Cajon Mountain,"The Wedge,Left Wall",-116.818,32.9044,https://www.mountainproject.com/route/11129486...,1,0,0,0,0,0,0,0,0,0,21,0,"Follow the right slanting white grove, gain th...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
134,105794217,Lord of the Rings,5.10c,3.9,33,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Middle Earth,-117.051,32.8242,https://www.mountainproject.com/route/10579421...,1,0,0,0,0,0,0,0,0,0,25,0,"Climb past two bolts to a ledge, transition on...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
65,105791566,Suzie's Wild Ride,5.9-,3.6,201,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8267,https://www.mountainproject.com/route/10579156...,1,0,0,0,0,0,0,0,0,0,16,0,The climb starts out down and right from a sma...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
903,107021544,Im Panadilla,5.10a,3.5,6,1,California,San Diego and Environs,South San Diego County,Corte Madera,El Nino Wall,-116.589,32.7568,https://www.mountainproject.com/route/10702154...,1,0,0,0,0,0,0,0,0,0,19,0,Sport climb up the middle of High Rise face.-b...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
459,106096196,Kissed by the Sun,5.8,3.6,8,1,California,Inland Empire,Big Rock Area,Helios Boulder,0,-117.173,33.8366,https://www.mountainproject.com/route/10609619...,1,0,0,0,0,0,0,0,0,0,14,0,This is the bolt line on the left side of the ...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
414,106003829,Mickey Finn,5.10b,3.4,19,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8266,https://www.mountainproject.com/route/10600382...,1,0,0,0,0,0,0,0,0,0,22,0,Thin face climbing that will force you to work...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [196]:
rec=get_recommendations(target_index, laplacian_kernel)
rec

Unnamed: 0,id,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,longitude,latitude,url,Sport,Trad,Boulder,TR,Alpine,Aid,Ice,Snow,Mixed,danger,rope_grade,boulder_grade,infos,slab,traverse,roof,corner,crack,face,flake,fingers,jug,exposed,dihedral,sustained,technical,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
282,105835105,Mañana,5.10,4.1,42,2,California,San Diego and Environs,South San Diego County,El Cajon Mountain,"The Wedge,Left Wall",-116.818,32.9046,https://www.mountainproject.com/route/10583510...,1,0,0,0,0,0,0,0,0,0,23,0,"Great route, rivals long face climbing found i...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
846,106986452,Close to the Edge,5.10a/b,3.9,7,1,California,San Diego and Environs,South San Diego County,Corte Madera,El Nino Wall,-116.589,32.7568,https://www.mountainproject.com/route/10698645...,1,0,0,0,0,0,0,0,0,0,21,0,The crux section is actually a new progression...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1626,111294865,Urge to Merge,5.10a/b,3.9,15,1,California,San Diego and Environs,South San Diego County,El Cajon Mountain,"The Wedge,Left Wall",-116.818,32.9044,https://www.mountainproject.com/route/11129486...,1,0,0,0,0,0,0,0,0,0,21,0,"Follow the right slanting white grove, gain th...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
940,107167402,Candid Chimera,5.10a,4.0,3,1,California,San Diego and Environs,South San Diego County,Corte Madera,Solar Slabs,-116.591,32.7555,https://www.mountainproject.com/route/10716740...,1,0,0,0,0,0,0,0,0,0,19,0,This exciting face route is exactly 100 feet l...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1142,107950501,Stranger Danger,5.10c,4.0,46,1,California,San Diego and Environs,North San Diego County,Cougar Crag,Cougar Crag East,-117.138,33.3417,https://www.mountainproject.com/route/10795050...,1,0,0,0,0,0,0,0,0,0,25,0,Start at the bottom of the central face pull u...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
134,105794217,Lord of the Rings,5.10c,3.9,33,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Middle Earth,-117.051,32.8242,https://www.mountainproject.com/route/10579421...,1,0,0,0,0,0,0,0,0,0,25,0,"Climb past two bolts to a ledge, transition on...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
414,106003829,Mickey Finn,5.10b,3.4,19,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8266,https://www.mountainproject.com/route/10600382...,1,0,0,0,0,0,0,0,0,0,22,0,Thin face climbing that will force you to work...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
900,107016048,Empathy,5.10b,3.3,58,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8265,https://www.mountainproject.com/route/10701604...,1,0,0,0,0,0,0,0,0,0,22,0,Interesting and technical face climb between T...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
903,107021544,Im Panadilla,5.10a,3.5,6,1,California,San Diego and Environs,South San Diego County,Corte Madera,El Nino Wall,-116.589,32.7568,https://www.mountainproject.com/route/10702154...,1,0,0,0,0,0,0,0,0,0,19,0,Sport climb up the middle of High Rise face.-b...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
65,105791566,Suzie's Wild Ride,5.9-,3.6,201,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8267,https://www.mountainproject.com/route/10579156...,1,0,0,0,0,0,0,0,0,0,16,0,The climb starts out down and right from a sma...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# NOTE(review): get_recommendations() ranks scores in descending order, while
# euclidean_distances returns distances (larger = farther apart), so this call
# appears to list the least similar climbs first — confirm before relying on it.
rec=get_recommendations(target_index, euclidean_distances)
rec

In [None]:
rec=get_recommendations(target_index, linear_kernel)
rec

In [None]:
rec=get_recommendations(target_index, polynomial_kernel)
rec

In [None]:
rec=get_recommendations(target_index, sigmoid_kernel)
rec

In [None]:
# rec=get_recommendations(climb_id, chi2_kernel)
# rec

## NLP (work in progress)

### Import NLP Data (redundant; delete later)

In [None]:
# Load the route_description table and key it by route id.
df_nlp = query_helper.query_to_df('SELECT * FROM route_description;').set_index('id')
df_nlp.head()

In [None]:
# Description keywords searched for in the route text, including spelling
# variants (e.g. 'finger'/'fingers', 'pump'/'pumpy').
# Fixed: the separator between 'bouldery' and 'powerful' was a period, which made
# this cell a SyntaxError.
key_words = ['slab', 'traverse', 'roof', 'corner', 'ledge', 'crack', 'face','flake', 'finger', 'fingers',
             'hand', 'hands', 'arch', 'balancy', 'balance', 'jug', 'squeeze', 'mantel', 'sustained',
             'ramp', 'overhung', 'dihedral', 'sporty', 'heady', 'pump', 'pumpy', 'technical',
             'run out', 'mental', 'well protected', 'chimney', 'offwidth', 'stem', 'arete', 'exposed', 'exposure',
             'crimp','crimpy', 'vertical', 'slabby', 'cave', 'steep', 'bouldery', 'powerful']

In [None]:
# One canonical keyword per feature column (no spelling variants).
col_key_words = [
    'slab', 'traverse', 'roof', 'corner', 'ledge', 'crack',
    'face', 'flake', 'finger', 'hand', 'arch', 'balancy',
    'jug', 'squeeze', 'mantel', 'exposed', 'ramp', 'overhung',
    'dihedral', 'sporty', 'sustained', 'pump', 'technical', 'run out',
    'mental', 'well protected', 'chimney', 'offwidth', 'stem', 'arete',
    'crimp', 'vertical', 'cave', 'steep', 'bouldery', 'powerful',
]

In [None]:
df_numeric

In [None]:
df_full = df_numeric.join(df_nlp)
df_full.head()

In [None]:
df_numeric.head()

### Break descriptions into rope and boulder

#### Rope

In [None]:
# Roped climbs only: not flagged as Boulder, Ice or Snow.
# The three conditions are combined into one mask built on df_numeric. Filtering
# in steps applied masks computed on df_numeric to the already-shortened df_sub,
# i.e. a boolean indexer whose index no longer matched the frame being filtered.
rope_mask = (
    (df_numeric['Boulder'] == 0)
    & (df_numeric['Ice'] == 0)
    & (df_numeric['Snow'] == 0)
)
df_sub = df_numeric[rope_mask]

df_sub.head()

In [None]:
df_sub.Boulder.value_counts()

In [None]:
len(df_sub.index)

In [None]:
len(df_nlp.loc[df_sub.index])

In [None]:
nlp_rope =df_nlp.loc[df_sub.index]

In [None]:
nlp_rope.head()

#### Boulder

#### Tokenizer

In [None]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Punctuation characters the tokenizer filters out (a single string).
punctuations = string.punctuation

# spaCy's English stop-word set. This is the library's own set object, so any
# word added to it later in the notebook is added to that shared set.
stop_words = STOP_WORDS

# Pipeline loaded from the 'en_core_web_sm' model; spacy_tokenizer() runs text through it.
nlp = spacy.load('en_core_web_sm')

# Plain English language object.
parser = English()

In [None]:
# Additional stop words for route descriptions (on top of spaCy's defaults).
extra_stop = [
    'climb', 'climbing', 'crux', 'rope', 'leave', 'use', 'start', 'end',
    'look', 'rock', 'tree', 'follow', 'continue', 'belay', 'photo', 'add',
    'climber', 'route', 'lot', 'anchor', '...', '1', '2', '3', '4', '--',
    'pitch', 'page', 'cold', 'hot', 'warm', 'belayer', 'fun', 'like',
    'unknown', 'rap', 'left', 'right', 'wide', 'leader',
]


In [None]:
# Add every extra stop word to the stop-word set in one call.
stop_words.update(extra_stop)

In [None]:
stop_words

In [None]:
# Creating our tokenizer function
def spacy_tokenizer(text):
    """Tokenise ``text`` with spaCy and return cleaned, lemmatised string tokens.

    Uses the module-level ``nlp`` pipeline, ``stop_words`` collection and
    ``punctuations`` string.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        Lower-cased, stripped lemmas with stop words and punctuation removed.
    """
    doc = nlp(text)

    # Lemmatise and lower-case each token. Where the lemma is the "-PRON-"
    # placeholder, keep the lower-cased surface form instead.
    # (The earlier extra pass comparing `pos_` with "-PRON-" was removed: "-PRON-"
    # is a lemma placeholder, not a part-of-speech tag, so that pass never changed
    # anything — and its else-branch would have produced a str that breaks the
    # `.lemma_` access here.)
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in doc
    ]

    # `punctuations` is a single string, so `not in` is a substring test: it drops
    # individual punctuation characters and also empty strings left by strip().
    return [tok for tok in lemmas if tok not in stop_words and tok not in punctuations]

### TF-IDF Vectorizer

#### Run the vectorizer

In [None]:
df_nlp.loc[105714722,:]

In [None]:
df_nlp[df_nlp.isna().values]

In [None]:
# TF-IDF vectoriser from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenisation is delegated to spacy_tokenizer (no `stop_words` argument is
# passed to the vectoriser itself). Terms found in fewer than 5 documents
# or in more than 70% of documents are ignored.
tfidf = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    min_df=5,
    max_df=0.7,
)

# Fit on the 'infos' text column and build the document-term TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df_nlp['infos'])

# (n_documents, n_terms)
tfidf_matrix.shape

#### Save it to load later

In [None]:
# Persist the TF-IDF matrix so it can be reloaded without re-running the vectorizer.
import pickle

# `sklearn.externals.joblib` is no longer shipped with current scikit-learn, so
# import the standalone joblib package (a scikit-learn dependency) and only fall
# back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# In-memory pickle (bytes) of the matrix.
tfidf_5_7 = pickle.dumps(tfidf_matrix)

# Save the matrix to a file.
joblib.dump(tfidf_matrix, 'tfidf_5_7.pkl')

# Load the matrix back from the file.
# Only load pickle/joblib files from a trusted source: loading can execute code.
# tfidf_loaded = joblib.load('tfidf_5_7.pkl')
  

### Get original index

In [None]:
# Row positions (within tfidf_matrix) of the climbs to compare against.
# NOTE(review): the df_in_range shown in this notebook's outputs has no 'orig_index'
# column (reset_index() produced 'id' plus the data columns), so this attribute
# access is expected to fail on a fresh kernel. The column presumably should hold
# each climb's row position in tfidf_matrix — confirm, and create it where
# df_in_range is built.
search_range = df_in_range.orig_index

# Position of the target climb in reference to the whole matrix.
# int(...) only succeeds when exactly one row of df_in_range matches target_id.
nlp_target_index = int(df_in_range.orig_index[df_in_range['id']==target_id].values)
print('target_index:',nlp_target_index)

In [None]:
df_nlp.loc[105793305]

In [None]:
# TODO: combine with get_recommendations so an average weighted score can be computed.
def get_recommendations_for_nlp(idx, kernel_type, top_n=20):
    """Return the in-range climbs whose TF-IDF rows score highest against a target.

    Reads the notebook globals ``search_range`` (row positions to compare
    against), ``tfidf_matrix`` (indexed by row position) and ``df_numeric``
    (the frame the result rows are taken from, by position).

    Parameters
    ----------
    idx : int
        Row position of the target climb in ``tfidf_matrix``.
    kernel_type : callable
        ``kernel_type(a, b)`` giving the similarity of two matrix rows, either
        as a scalar or as a one-element array (e.g. the ``cosine_similarity``
        passed by the notebook).
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df_numeric``, most similar first. The target
        row itself is never included.
    """
    # Exclude the target explicitly: dropping "the first result" after sorting is
    # wrong whenever another climb ties with the target's self-similarity.
    candidate_rows = [int(row) for row in search_range.values if int(row) != idx]

    # (row position, similarity) pairs; the kernel result is flattened so both
    # scalar and (1, 1)-array return values are handled.
    scored = []
    for row in candidate_rows:
        score = kernel_type(tfidf_matrix[idx], tfidf_matrix[row])
        scored.append((row, float(np.asarray(score).ravel()[0])))

    # Highest similarity first (stable sort keeps search_range order on ties).
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # TODO: add / return the similarity value alongside each recommendation.

    # Keep the top_n best candidates (the original slice [1:20] returned only 19).
    climb_indices = [row for row, _ in scored[:max(top_n, 0)]]

    # Return the most similar climbs, selected by position.
    return df_numeric.iloc[climb_indices, :]

In [None]:
# Recommend climbs for the target row, scoring candidates with cosine_similarity.
rec = get_recommendations_for_nlp(idx=nlp_target_index, kernel_type=cosine_similarity)
rec

In [None]:
# Show the NLP rows belonging to the recommended climbs.
df_nlp.loc[rec.index, :]

### LDA

#### CV

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Unigram count vectorizer for LDA, tokenised by spacy_tokenizer.
cv = CountVectorizer(
    tokenizer=spacy_tokenizer,
    stop_words='english',
    ngram_range=(1,1),
    min_df=5,
    max_df=0.90,
)

In [None]:
# Build the document-term matrix from a reproducible 1000-row sample of nlp_rope['info'].
dtm = cv.fit_transform(
    nlp_rope['info'].sample(random_state=1, n=1000)
)

#### LDA Model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Configure a 10-topic LDA model.
lda_model = LatentDirichletAllocation(
    n_components=10,           # number of topics
    learning_method='online',
    batch_size=32,             # n docs in each learning iter
    max_iter=20,               # max learning iterations
    evaluate_every=-1,         # do not compute perplexity during training
    random_state=100,          # fixed seed
    n_jobs=-1,                 # use all available CPUs
)

# Echo the estimator and its settings.
print(lda_model)

In [None]:
# Fit the LDA model on the document-term matrix and keep the transformed output.
# This can take a while when the corpus is large.

lda_output = lda_model.fit_transform(dtm)


#### Diagnose model performance with perplexity and log-likelihood
A model with a higher log-likelihood and a lower perplexity, where perplexity = exp(-1 × log-likelihood per word), is generally considered better. Let's check both for our model.



In [None]:
# Log-likelihood score of the document-term matrix under the fitted model: higher is better.
print("Log Likelihood: ", lda_model.score(dtm))

# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(dtm))

# Show the model's parameters.
print(lda_model.get_params())

In [None]:
# Number of rows in the topic-word matrix, i.e. the number of fitted topics.
lda_model.components_.shape[0]

In [None]:
# Word weights of the first topic.
single_topic = lda_model.components_[0, :]

In [None]:
# Vocabulary positions of the ten largest weights (ascending order of weight).
top_word_indices = np.argsort(single_topic)[-10:]

In [None]:
# Print the word behind each index in top_word_indices.
# The vocabulary is fetched once instead of being rebuilt on every iteration;
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
feature_names = list(cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                     else cv.get_feature_names())
for index in top_word_indices:
    print(feature_names[index])

#### Top words for all groups

In [None]:
# Print the highest-weighted words of every topic (ascending order of weight).
# n_top_words drives both the slice and the header, so the label can no longer
# disagree with the number of words actually printed (it used to say 15 but print 30).
n_top_words = 30

# Fetch the vocabulary once; get_feature_names() was removed in scikit-learn 1.2
# in favour of get_feature_names_out().
feature_names = list(cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                     else cv.get_feature_names())

for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
    print('\n')

In [None]:
# Additional domain-specific stop words, kept as a list of strings.
# Written as whitespace-separated text; no entry contains a space, so split() is exact.
extra_stop = (
    "climb climbing crux rope leave use start end "
    "look rock tree follow continue belay photo add climber route "
    "lot anchor ... 1 2 3 4 -- pitch page cold hot warm "
    "belayer fun like unknown rap left right"
).split()



### Try new reverse tokenizer

In [None]:
# Climbing-feature vocabulary that spacy_tokenizer_reverse keeps (everything else is dropped).
# A comma was missing after 'fingers', which silently concatenated it with 'cam' into the
# single entry 'fingerscam', so neither keyword could ever match.
# NOTE(review): the multi-word entries ('run out', 'well protected') can only match if the
# tokenizer emits them as a single token — confirm whether they are reachable.
key_words = [
    'slab', 'traverse', 'roof', 'corner', 'ledge', 'crack', 'face', 'flake', 'bolt',
    'finger', 'fingers',
    'cam', 'camalot', 'hand', 'hands', 'arch', 'balancy', 'jug', 'squeeze', 'mantel',
    'sustained', 'nut',
    'gear', 'ramp', 'overhung', 'balance', 'dihedral', 'sporty', 'heady', 'pump', 'pumpy',
    'technical',
    'run out', 'mental', 'well protected', 'chimney', 'offwidth', 'stem', 'arete',
    'exposed', 'exposure',
    'crimp', 'crimpy', 'vertical', 'slabby', 'cave', 'steep', 'sidepull', 'bouldery',
]

In [None]:
# Keyword-only tokenizer: keeps just the tokens listed in key_words.
def spacy_tokenizer_reverse(text):
    """Tokenize ``text`` and return the distinct lemmas that appear in ``key_words``.

    Relies on the notebook globals ``nlp`` (called on the text to produce
    tokens), ``stop_words``, ``punctuations`` and ``key_words``.

    Parameters
    ----------
    text : str
        Raw description to tokenize.

    Returns
    -------
    list of str
        Each matching keyword once, in order of first appearance. (The previous
        implementation iterated a ``set``, so its order was not deterministic.)
    """
    doc = nlp(text)

    # Lower-cased, stripped lemma for each token; tokens carrying the "-PRON-"
    # placeholder lemma (used by spaCy 2.x for pronouns) fall back to their
    # lower-cased text. The former `pos_ != "-PRON-"` pass was a no-op and is gone.
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in doc
    ]

    # Set membership is O(1) per token instead of scanning the key_words list.
    wanted = set(key_words)
    kept = (
        lemma for lemma in lemmas
        if lemma not in stop_words and lemma not in punctuations and lemma in wanted
    )

    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    return list(dict.fromkeys(kept))

#### CV

In [None]:
# Unigram count vectorizer restricted to the key_words vocabulary via spacy_tokenizer_reverse.
cv = CountVectorizer(
    tokenizer=spacy_tokenizer_reverse,
    ngram_range=(1,1),
    stop_words='english',
)

In [None]:
# Build the document-term matrix from a reproducible 1000-row sample of nlp_rope['info'].
dtm = cv.fit_transform(
    nlp_rope['info'].sample(random_state=1, n=1000)
)

#### LDA Model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Configure a 6-topic LDA model.
lda_model = LatentDirichletAllocation(
    n_components=6,            # number of topics
    learning_method='online',
    batch_size=32,             # n docs in each learning iter
    max_iter=20,               # max learning iterations
    evaluate_every=-1,         # do not compute perplexity during training
    random_state=100,          # fixed seed
    n_jobs=-1,                 # use all available CPUs
)

# Echo the estimator and its settings.
print(lda_model)

In [None]:
# Fit the LDA model on the document-term matrix and keep the transformed output.
# This can take a while when the corpus is large.

lda_output = lda_model.fit_transform(dtm)


#### Diagnose model performance with perplexity and log-likelihood
A model with a higher log-likelihood and a lower perplexity, where perplexity = exp(-1 × log-likelihood per word), is generally considered better. Let's check both for our model.



In [None]:
# Log-likelihood score of the document-term matrix under the fitted model: higher is better.
print("Log Likelihood: ", lda_model.score(dtm))

# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(dtm))

# Show the model's parameters.
print(lda_model.get_params())

#### Top words for all groups

In [None]:
# Print the highest-weighted words of every topic (ascending order of weight).
# n_top_words drives both the slice and the header, so the label can no longer
# disagree with the number of words actually printed (it used to say 15 but print 10).
n_top_words = 10

# Fetch the vocabulary once; get_feature_names() was removed in scikit-learn 1.2
# in favour of get_feature_names_out().
feature_names = list(cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                     else cv.get_feature_names())

for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
    print('\n')