# ML Models

## Import Libraries & Load Dataframe from AWS DB

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from math import sin, cos, sqrt, atan2, radians

import query_helper
import get_new_route
import json


In [2]:
df_numeric = pd.read_csv('data/df.csv', index_col='id')
#reorder columns
df_numeric =df_numeric[['name', 'rating', 'stars', 'starVotes', 'pitches', 'location', 'region',
                               'area', 'sub_area', 'wall', 'longitude', 'latitude', 'url', 'Sport',
                               'Trad', 'Boulder', 'TR', 'Alpine', 'Aid', 'Ice', 'Snow', 'Mixed',
                               'danger', 'rope_grade', 'boulder_grade', 'infos', 'slab', 'traverse',
                               'roof', 'corner', 'crack', 'hand', 'face', 'flake', 'fingers', 'jug', 'exposed',
                               'dihedral', 'sustained', 'technical', 'run out', 'well protected',
                               'chimney', 'offwidth', 'stem', 'arete', 'crimp', 'vertical', 'powerful',
                               'in_range']]
df_numeric.head()

Unnamed: 0_level_0,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,...,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
105714722,Central Yellow Wall,V3 R,4.4,22,0,South Dakota,Custer State Park,Sylvan Lake,Sylvan Lake Bouldering,Campground Boulder,...,0,0,0,0,0,0,0,0,0,1
105714728,Waves,5.8,4.6,217,2,South Dakota,The Needles Of Rushmore,Mount Rushmore National Memorial,South Seas,Shipyard Rock,...,0,0,0,0,0,0,0,1,0,1
105714731,East Chimney Variation,5.7,4.1,40,2,South Dakota,Custer State Park,Cathedral Spires,Station 13,0,...,0,0,0,0,0,0,0,0,0,1
105714734,Conn Diagonal,5.7,4.9,151,3,South Dakota,Custer State Park,Sylvan Lake,Outlets,Outer Outlet,...,0,0,0,0,1,0,0,0,0,1
105714737,Bolts for Bobs,5.8,3.5,125,1,South Dakota,The Needles Of Rushmore,Mount Rushmore National Memorial,South Seas,Borneo,...,0,0,0,0,0,0,0,0,0,1


## Get input from user for recommendation

In [3]:
target_id = 105894693
# target_lat = 32.9127 
# target_lon = -116.882
target_state =''
target_city =''
target_zipcode = '92008'
target_radius_range=60
star_limit = 3.5
###other parameters to be added here later

### Get coordinates for zip or city

In [4]:
with open('data/us-zip-code-latitude-and-longitude.json') as f:
  coord_dict = json.load(f)

In [5]:
def get_coords(target_city=None, target_state=None, zipcode=None):
    #find the coordinates for city or zip code
    for city in coord_dict:
        if city['fields']['zip']==zipcode:
            return city['fields']['latitude'],city['fields']['longitude']
        if (city['fields']['state']==target_state)&(city['fields']['city']==target_city):
            return city['fields']['latitude'],city['fields']['longitude']
    #if nothing is found return none
    return None, None

In [6]:
target_lat, target_lon = get_coords(target_city, target_state, target_zipcode)

In [7]:
print(target_lat, target_lon)

33.158137 -117.32646


### Create fxn to see if climb is in search range

In [8]:
#function takes search param range and assigns to original df if climb in_range
def in_range(df_fxn, lat, lon, radius_range=None):
    if radius_range:
        R= 3958.8 
        if (lat == None)|(lon==None):
            df_fxn['in_range'] = 1
        else:
            #assign target coords and set to radians for calc
            lat1 = radians(lat)
            lon1 = radians(lon)
            for index, row in df_fxn.iterrows():
                #assign the lat and lon for each climb
                lat2 = radians(row['latitude'])
                lon2 = radians(row['longitude'])

                dlon = lon2 - lon1
                dlat = lat2 - lat1

                a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
                c = 2 * atan2(sqrt(a), sqrt(1 - a))

                distance = R * c

                #assign in_range col to 1 if the climb is in range
                if distance < radius_range:
                    df_fxn.at[index,'in_range']=1
                else:
                    df_fxn.at[index,'in_range']=0   
    else:
        df_fxn['in_range'] =1



In [9]:
def star_cutoff(df_fxn, star_limit=3.5):
    for index, row in df_fxn.iterrows():
        #assign in_range col to 1 if the climb is in range
        if (df_fxn.at[index, 'stars'] >= star_limit)&(df_fxn.at[index, 'in_range']!=0):
            df_fxn.at[index,'in_range']=1
        else:
            df_fxn.at[index,'in_range']=0   

In [10]:
# ##difficulity cutoff, function not run until climb is in df
# def diff_cutoff(df_fxn, delta=6, target_grade):
#     if df_fxn.loc[target_id,'Boulder']==0:
#         target_grade = df_fxn.loc[target_id,'rope_grade']
#         for index, row in df_fxn.iterrows():
#             #assign in_range col to 1 if the climb is in range
#             if (df_fxn.at[index, 'rope_grade'] <= target_grade+delta)&(df_fxn.at[index, 'rope_grade'] >= target_grade-delta)
#                 &(df_fxn.at[index, 'in_range']!=0):
#                 df_fxn.at[index,'in_range']=1
#             else:
#                 df_fxn.at[index,'in_range']=0                

### Call function to assign if climb in range

In [11]:
## used to get list of climbs allowed for comparison
in_range(df_numeric, lat = target_lat, lon = target_lon, radius_range=target_radius_range)

### Star cutoff (ie only give results for routes with above 3.5 stars)

In [None]:
star_cutoff(df_numeric, star_limit)

In [13]:
df_numeric.in_range.value_counts()

0    35255
1     1372
Name: in_range, dtype: int64

### To begin, see if if the climb already exists in db

In [14]:
if target_id in df_numeric.index:
    print('We have climb already')
    #make sure reference climb is assigned in_range
    df_numeric.loc[target_id,'in_range']=1
else:
    print('Making API call and Scraping climb data')
    if(get_new_route.get_route_details(target_id)):
        #the function in the if statement saves target climb to target_climb.csv and returns 1
        df_target= pd.read_csv('data/target_climb.csv', index_col= 'id')
        df_target.drop(columns=['Unnamed: 0'], inplace=True)
        df_target['in_range'] = 1
        #order the same as df_numeric columns
        df_target = df_target[['name', 'rating', 'stars', 'starVotes', 'pitches', 'location', 'region',
                               'area', 'sub_area', 'wall', 'longitude', 'latitude', 'url', 'Sport',
                               'Trad', 'Boulder', 'TR', 'Alpine', 'Aid', 'Ice', 'Snow', 'Mixed',
                               'danger', 'rope_grade', 'boulder_grade', 'infos', 'slab', 'traverse',
                               'roof', 'corner', 'crack', 'hand', 'face', 'flake', 'fingers', 'jug', 'exposed',
                               'dihedral', 'sustained', 'technical', 'run out', 'well protected',
                               'chimney', 'offwidth', 'stem', 'arete', 'crimp', 'vertical', 'powerful',
                               'in_range']]
        
        df_numeric = pd.concat([df_numeric, df_target])
#         save new climb into 
#         df_numeric.to_csv('data/df.csv', index_col= 'id')
    else:
        print("Something went wrong")


Making API call and Scraping climb data


In [9]:
df_numeric.loc[105722593,:]    

name                                              White Rastafarian
rating                                                         V2 R
stars                                                           4.9
starVotes                                                       159
pitches                                                           0
location                                                 California
region                                    Joshua Tree National Park
area                                       *Joshua Tree Bouldering*
sub_area                                         Outback Bouldering
wall                                      White Rastafarian Boulder
longitude                                                  -116.381
latitude                                                     33.719
url               https://www.mountainproject.com/route/10572259...
Sport                                                             0
Trad                                            

In [15]:
df_numeric.loc[target_id,:]

name                                                      Lion King
rating                                                        5.11c
stars                                                           4.4
starVotes                                                        81
pitches                                                           1
location                                              International
region                                                         Asia
area                                                       Thailand
sub_area                                  South - Islands & Beaches
wall              Laem Phra Nang (Railay & Tonsai),Tonsai Bay,Du...
longitude                                                   98.9921
latitude                                                    18.7689
url               https://www.mountainproject.com/route/10589469...
Sport                                                             1
Trad                                            

In [16]:
df_numeric.tail()

Unnamed: 0_level_0,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,...,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
118210488,Unknown #1,V0-1,3.0,1,0,California,Lake Tahoe,I-80 Corridor,Auburn and Grass Valley,"Auburn SRA,Tall Green Bridge,Riverside Boulder...",...,0,0,0,0,0,0,0,0,0,0
118210496,Unknown #2,V1-2,3.0,1,0,California,Lake Tahoe,I-80 Corridor,Auburn and Grass Valley,"Auburn SRA,Tall Green Bridge,Riverside Boulder...",...,0,0,0,0,0,0,0,0,0,0
118210504,Unknown #3,V1-2,3.0,1,0,California,Lake Tahoe,I-80 Corridor,Auburn and Grass Valley,"Auburn SRA,Tall Green Bridge,Riverside Boulder...",...,0,0,0,0,0,1,0,0,0,0
118211517,Mertensia Pillar,WI5,4.0,1,1,Colorado,CO Ice & Mixed,RMNP - Mixed/Ice,Wild Basin,Mertensia Falls,...,0,0,0,0,0,0,0,0,0,0
105894693,Lion King,5.11c,4.4,81,1,International,Asia,Thailand,South - Islands & Beaches,"Laem Phra Nang (Railay & Tonsai),Tonsai Bay,Du...",...,0,0,0,0,0,0,0,0,0,1


### Diff_grade cutoff WIP

In [17]:
###WIP call grade range function
# pass in deets for target climb

## Reccomender

#### Kernel Imports

In [18]:
# Import kernels
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import laplacian_kernel
from sklearn.metrics.pairwise import chi2_kernel



In [19]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

#### Create df_in_range to run recommender in subset

In [20]:
df_in_range = df_numeric[df_numeric['in_range']==1].reset_index()      
target_index =df_in_range.index[df_in_range['id']==target_id][0] #store target climb index in subset that will be compared
df_in_range.shape

(1373, 51)

In [21]:
target_index

1372

In [22]:
df_in_range.tail()

Unnamed: 0,id,name,rating,stars,starVotes,pitches,location,region,area,sub_area,...,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
1368,118189308,Golgotha,V8-9,5.0,1,0,California,San Diego and Environs,North San Diego County,Lake Ramona,...,0,0,0,0,0,1,1,0,0,1
1369,118192124,Bloody L,5.7,4.0,2,1,California,Inland Empire,Big Rock Area,L-Slab/Lakeview Slab,...,0,0,0,0,0,0,0,0,0,1
1370,118202461,Pieces of Eight,5.10c/d PG13,4.0,2,1,California,Tahquitz & Suicide Rocks,Suicide Rock,(l) Buttress of Cracks,...,0,0,0,0,0,0,0,0,0,1
1371,118208812,Coyote Traverse,5.8 V0+ PG13,3.5,2,1,California,San Diego and Environs,North San Diego County,Calavera lake,...,0,0,0,0,0,0,0,0,0,1
1372,105894693,Lion King,5.11c,4.4,81,1,International,Asia,Thailand,South - Islands & Beaches,...,0,0,0,0,0,0,0,0,0,1


### Scale Features

#### Create Features DF

In [23]:
pd.options.display.max_columns = 500 # this will set limit of columns to 500


In [24]:
df_in_range.head()

Unnamed: 0,id,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,longitude,latitude,url,Sport,Trad,Boulder,TR,Alpine,Aid,Ice,Snow,Mixed,danger,rope_grade,boulder_grade,infos,slab,traverse,roof,corner,crack,hand,face,flake,fingers,jug,exposed,dihedral,sustained,technical,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
0,105722227,Overseer,5.9,4.1,329,1,California,Joshua Tree National Park,Lost Horse Area,Hemingway Buttress,Hemingway Buttress (East Face Left),-117.759,33.9106,https://www.mountainproject.com/route/10572222...,0,1,0,0,0,0,0,0,0,0,17,0,Start up the hollow-sounding expando flake to ...,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
1,105737333,Only Way To Fly,5.10c,4.2,61,1,California,Central Coast,San Luis Obispo,Bishop Peak,Cracked Wall,-117.66,33.6846,https://www.mountainproject.com/route/10573733...,1,0,0,1,0,0,0,0,0,0,25,0,The route starts on the left side of the dark ...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,105788042,Surprise,5.8 R,3.9,53,3,California,Tahquitz & Suicide Rocks,Suicide Rock,(m) The Weeping Wall,0,-116.695,33.7703,https://www.mountainproject.com/route/10578804...,0,1,0,0,0,0,0,0,0,2,14,0,This was the first climb established on the We...,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,105788045,Surprise Direct,5.9 R,3.7,15,3,California,Tahquitz & Suicide Rocks,Suicide Rock,(m) The Weeping Wall,0,-116.695,33.7703,https://www.mountainproject.com/route/10578804...,0,1,0,0,0,0,0,0,0,2,17,0,This is a third pitch variation to Surprise th...,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
4,105788065,Duck Soup,5.10c PG13,3.6,13,3,California,Tahquitz & Suicide Rocks,Suicide Rock,(m) The Weeping Wall,0,-116.695,33.7703,https://www.mountainproject.com/route/10578806...,0,1,0,0,0,0,0,0,0,1,25,0,This route takes the plumline up the face and ...,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [26]:
#creates features from df_in_range used for comparison
features = df_in_range.loc[:,['stars', 'pitches', 'Sport', 'Trad', 'Boulder', 'TR', 'Alpine', 'Aid',
       'Ice', 'Snow', 'Mixed', 'danger', 'rope_grade', 'boulder_grade', 'slab', 'traverse', 'roof', 
                'corner', 'crack', 'hand', 'face','flake', 'fingers',
                 'jug', 'exposed', 'dihedral', 'sustained', 'technical','run out', 'well protected',
                 'chimney', 'offwidth', 'stem', 'arete','crimp', 'vertical', 'powerful']] #,'longitude','latitude',

In [27]:
features.shape

(1373, 37)

#### Pick scaling type (AND UPDATE WEIGHTS)

In [28]:
min_max_scaler = MinMaxScaler()
scalar = StandardScaler()

In [29]:
##### Pick a scaling option ###############################

# features_scaled = scalar.fit_transform(features)
# features_scaled = min_max_scaler.fit_transform(features.drop(columns=['danger','pitches']))

features_scaled = min_max_scaler.fit_transform(features)

# scale danger and pitches using ss and add into features scaled df
# features_scaled = np.concatenate((features_scaled, scalar.fit_transform(features[['danger', 'pitches']])), axis=1)

##################################################################

for i in range(features_scaled.shape[0]):
    features_scaled[i][10]=features_scaled[i][12]*10  #weight rope_grade higher
    features_scaled[i][11]=features_scaled[i][13]*10 #weight boulder_grade higher

### Now lets fit the similarity model

#### Rec function

In [30]:
def get_recommendations(idx, kernel_type):

    #value to store scores and indicies
    score_matrix = np.ndarray(shape=(len(df_in_range),2), dtype=float)

    #go through the target climb vs all onthers in our db and populate score mtx with index and similarity
    for i in range(df_in_range.shape[0]):
        score = kernel_type(features_scaled[idx].reshape(1,-1),features_scaled[i].reshape(1,-1))
        score_matrix[i][0] =  i        ##the index comparison corresponding to the score
        score_matrix[i][1] = score     ##the score for the current index

    # Sort the climbs based on the similarity scores
    score_matrix = sorted(score_matrix, key=lambda x: x[1], reverse=True)
    

#########################  WIP ADD/calculate SIMilarity VALUE from tf-idf infos comparison #######################

    # # Get the scores of the 20 most similar climbs
    score_matrix = score_matrix[1:11]

    # # Get the climb indices (& cast to ints)
    climb_indices = [int(i[0]) for i in score_matrix]
    
    # Return the top 20 most similar climbs
    return df_in_range.loc[climb_indices,:]

#### Cells for comparison (delete later)

In [31]:
df_numeric.loc[target_id, :]

name                                                      Lion King
rating                                                        5.11c
stars                                                           4.4
starVotes                                                        81
pitches                                                           1
location                                              International
region                                                         Asia
area                                                       Thailand
sub_area                                  South - Islands & Beaches
wall              Laem Phra Nang (Railay & Tonsai),Tonsai Bay,Du...
longitude                                                   98.9921
latitude                                                    18.7689
url               https://www.mountainproject.com/route/10589469...
Sport                                                             1
Trad                                            

In [32]:
target_index

1372

#### Call rec fxn

In [33]:
# pd.set_option('max_colwidth', 100)

In [34]:
rec=get_recommendations(target_index, cosine_similarity)
rec

Unnamed: 0,id,name,rating,stars,starVotes,pitches,location,region,area,sub_area,wall,longitude,latitude,url,Sport,Trad,Boulder,TR,Alpine,Aid,Ice,Snow,Mixed,danger,rope_grade,boulder_grade,infos,slab,traverse,roof,corner,crack,hand,face,flake,fingers,jug,exposed,dihedral,sustained,technical,run out,well protected,chimney,offwidth,stem,arete,crimp,vertical,powerful,in_range
522,106988149,Technicolor Tango,5.12c,4.3,8,1,California,Inland Empire,Riverside Quarry,(f) Torture Machine Area,0,-117.416,34.0176,https://www.mountainproject.com/route/10698814...,1,0,0,0,0,0,0,0,0,0,45,0,The route starts on a technical slab that requ...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
382,106364007,Depends,5.10c,3.5,2,1,California,San Diego and Environs,South San Diego County,Corte Madera,Coulter Grove/Rest Home,-116.591,32.7555,https://www.mountainproject.com/route/10636400...,1,0,0,0,0,0,0,0,0,0,25,0,Great route up to the flake and out left. Inte...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
568,107096154,Natural Selection,5.12c,4.3,4,1,California,Inland Empire,Riverside Quarry,(e) Right of the Roof,0,-117.417,34.0169,https://www.mountainproject.com/route/10709615...,1,0,0,0,0,0,0,0,0,0,45,0,The first crux starts around the third bolt an...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
677,107861689,The World Below,5.12a,4.5,4,1,California,Inland Empire,Riverside Quarry,(j) The Tall Wall,0,-117.417,34.0169,https://www.mountainproject.com/route/10786168...,1,0,0,0,0,0,0,0,0,0,39,0,"Route begin 10ft right of Sliver Streaker, jus...",0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
579,107174569,Rocking the Dalai Lama,5.11+,4.4,12,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Middle Earth,-117.051,32.8243,https://www.mountainproject.com/route/10717456...,1,0,0,0,0,0,0,0,0,0,36,0,Will edit with more pertinent info after leadi...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
617,107432346,Quantum Leap,5.11b/c,4.4,8,1,California,San Diego and Environs,South San Diego County,Mission Gorge,Main Wall,-117.051,32.8266,https://www.mountainproject.com/route/10743234...,1,0,0,0,0,0,0,0,0,0,34,0,"Hard to onsite unless you are amazing, the cru...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
359,106300807,Nostalgia,5.11c,4.5,46,1,California,Inland Empire,Riverside Quarry,(d) Roof Area,0,-117.417,34.0169,https://www.mountainproject.com/route/10630080...,1,0,0,0,0,0,0,0,0,0,35,0,A hard stand up move at the 2nd bolt starts of...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
491,106873390,Raging Raptor,5.12a,4.6,40,1,California,Inland Empire,Riverside Quarry,(c) Left of the Roof,0,-117.417,34.0169,https://www.mountainproject.com/route/10687339...,1,0,0,0,0,0,0,0,0,0,39,0,Fun crimping from the start leads to easier cl...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
410,106550154,Survival of the Fittest,5.12b,4.4,19,1,California,Inland Empire,Riverside Quarry,(e) Right of the Roof,0,-117.417,34.0169,https://www.mountainproject.com/route/10655015...,1,0,0,0,0,0,0,0,0,0,42,0,Great long route.-Left of Natural Selection (5...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
887,109150884,Lucky,5.12a,4.8,4,1,California,Inland Empire,Riverside Quarry,(j) The Tall Wall,0,-117.417,34.0169,https://www.mountainproject.com/route/10915088...,1,0,0,0,0,0,0,0,0,0,39,0,Right most line on the Tall Wall. Begin at the...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
rec=get_recommendations(target_index, rbf_kernel)
rec

In [None]:
rec=get_recommendations(target_index, laplacian_kernel)
rec

In [None]:
# rec=get_recommendations(target_index, euclidean_distances)
# rec

In [None]:
# rec=get_recommendations(target_index, linear_kernel)
# rec

In [None]:
# rec=get_recommendations(target_index, polynomial_kernel)
# rec

In [None]:
# rec=get_recommendations(target_index, sigmoid_kernel)
# rec

In [None]:
# rec=get_recommendations(climb_id, chi2_kernel)
# rec

## NLP (work in progress)

### Import NLP Data (redundent delete later)

In [None]:
df_nlp = query_helper.query_to_df('SELECT * FROM route_description;')
df_nlp.set_index('id', inplace=True)
df_nlp.head()

In [None]:
key_words = ['slab', 'traverse', 'roof', 'corner', 'ledge', 'crack', 'face','flake', 'finger', 'fingers',
             'hand', 'hands', 'arch', 'balancy', 'balance', 'jug', 'squeeze', 'mantel', 'sustained',  
             'ramp', 'overhung', 'dihedral', 'sporty', 'heady', 'pump', 'pumpy', 'technical',
             'run out', 'mental', 'well protected', 'chimney', 'offwidth', 'stem', 'arete', 'exposed', 'exposure',
             'crimp','crimpy', 'vertical', 'slabby', 'cave', 'steep', 'bouldery'. 'powerful']

In [None]:
col_key_words = ['slab', 'traverse', 'roof', 'corner', 'ledge', 'crack', 'face','flake', 'finger',
             'hand', 'arch', 'balancy', 'jug', 'squeeze', 'mantel', 'exposed', 
             'ramp', 'overhung', 'dihedral', 'sporty', 'sustained','pump', 'technical',
             'run out', 'mental', 'well protected', 'chimney', 'offwidth', 'stem', 'arete',
             'crimp', 'vertical', 'cave', 'steep', 'bouldery', 'powerful']

In [None]:
df_numeric

In [None]:
df_full = df_numeric.join(df_nlp)
df_full.head()

In [None]:
df_numeric.head()

### Break descriptions into rope and boulder

#### Rope

In [None]:
df_sub = df_numeric[(df_numeric['Boulder']==0)]
df_sub = df_sub[(df_numeric['Ice']==0)]
df_sub = df_sub[(df_numeric['Snow']==0)]

df_sub.head()

In [None]:
df_sub.Boulder.value_counts()

In [None]:
len(df_sub.index)

In [None]:
len(df_nlp.loc[df_sub.index])

In [None]:
nlp_rope =df_nlp.loc[df_sub.index]

In [None]:
nlp_rope.head()

#### Boulder

#### Tokenizer

In [None]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

In [None]:
extra_stop = ['climb','climbing', 'crux', 'rope', 'leave', 'use', 'start', 'end',
              'look', 'rock', 'tree', 'follow', 'continue', 'belay', 'photo', 'add', 'climber', 'route', 
              'lot', 'anchor', '...', '1', '2', '3', '4', '--', 'pitch', 'page', 'cold', 'hot', 'warm',
              'belayer', 'fun', 'like', 'unknown', 'rap', 'left', 'right', 'wide', 'leader']


In [None]:
for word in extra_stop:
    stop_words.add(word)

In [None]:
stop_words

In [None]:
# Creating our tokenizer function
def spacy_tokenizer(text):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(text)
    
#     mytokens = [word for word in mytokens if word.pos_ != "PROPN"]
    
    mytokens = [ word if word.pos_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

### TF-IDF Vectorizer

#### Run the vectorizer

In [None]:
df_nlp.loc[105714722,:]

In [None]:
df_nlp[df_nlp.isna().values]

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(tokenizer=spacy_tokenizer, min_df=5, max_df=.7)


#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df_nlp['infos'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

#### Save it to load later

In [36]:
# Save the trained model as a pickle string. 
from sklearn.externals import joblib 
import pickle
tfidf_5_7 = pickle.dumps(tfidf_matrix) 

  
# Save the model as a pickle in a file 
joblib.dump(tfidf_matrix, 'tfidf_5_7.pkl') 
  
# Load the model from the file 
# tfidf_loaded = joblib.load('tfidf_5_7.pkl')  
  

### Get original index

In [None]:
#get index to iterate over
search_range = df_in_range.orig_index

# get index for target in reference to whole matrix
nlp_target_index = int(df_in_range.orig_index[df_in_range['id']==target_id].values)
print('target_index:',nlp_target_index)

In [None]:
df_nlp.loc[105793305]

In [None]:
######NEED TO COMBINE WITH get_recommendations so we can get average weighted score#########################
def get_recommendations_for_nlp(idx, kernel_type):

    #value to store scores and indicies
    score_matrix = np.ndarray(shape=(len(df_in_range),2), dtype=float)
    
    for iter_,i in enumerate(search_range.values):
        score = kernel_type(tfidf_matrix[idx],tfidf_matrix[i])
        score_matrix[iter_][0] =  i        ##the index comparison corresponding to the score
        score_matrix[iter_][1] = score     ##the score for the current index

    # Sort the climbs based on the similarity scores
    score_matrix = sorted(score_matrix, key=lambda x: x[1], reverse=True)
    
    # score_matrix
#########################  ADD/calculate SIMilarity VALUE   ###########################

    # # Get the scores of the 20 most similar climbs
    score_matrix = score_matrix[1:20]

    # # Get the climb indices (& cast to ints)
    climb_indices = [int(i[0]) for i in score_matrix]
    
    # Return the top 20 most similar climbs
    return df_numeric.iloc[climb_indices,:]

In [None]:
rec = get_recommendations_for_nlp(nlp_target_index, cosine_similarity)
rec

In [None]:
df_nlp.loc[rec.index]

### LDA

#### CV

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, max_df=0.90, min_df=5, stop_words='english', ngram_range=(1,1))

In [None]:
dtm = cv.fit_transform(nlp_rope['info'].sample(n=1000, random_state=1))

#### LDA Model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=10,               # Number of topics
                                      max_iter=20,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=32,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )

print(lda_model)  # Model attributes

In [None]:
# This can take awhile, we're dealing with a large amount of documents!

lda_output = lda_model.fit_transform(dtm)


#### Diagnose model performance with perplexity and log-likelihood
A model with higher log-likelihood and lower perplexity (exp(-1. * log-likelihood per word)) is considered to be good. Let’s check for our model.



In [None]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(dtm))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(dtm))

# See model parameters
print(lda_model.get_params())

In [None]:
len(lda_model.components_)

In [None]:
single_topic = lda_model.components_[0]

In [None]:
top_word_indices = single_topic.argsort()[-10:]

In [None]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

#### Top words for all groups

In [None]:
for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-30:]])
    print('\n')

In [None]:
extra_stop = ['climb','climbing', 'crux', 'rope', 'leave', 'use', 'start', 'end',
              'look', 'rock', 'tree', 'follow', 'continue', 'belay', 'photo', 'add', 'climber', 'route', 
              'lot', 'anchor', '...', '1', '2', '3', '4', '--', 'pitch', 'page', 'cold', 'hot', 'warm',
              'belayer', 'fun', 'like', 'unknown', 'rap', 'left', 'right']



### Try new reverse tokenizer

In [None]:
key_words = ['slab', 'traverse', 'roof', 'corner', 'ledge', 'crack', 'face','flake', 'bolt', 'finger', 'fingers'
             'cam', 'camalot', 'hand', 'hands', 'arch', 'balancy', 'jug', 'squeeze', 'mantel', 'sustained', 'nut', 
             'gear', 'ramp', 'overhung', 'balance', 'dihedral', 'sporty', 'heady', 'pump', 'pumpy', 'technical',
             'run out', 'mental', 'well protected', 'chimney', 'offwidth', 'stem', 'arete', 'exposed', 'exposure',
             'crimp','crimpy', 'vertical', 'slabby', 'cave', 'steep', 'sidepull','bouldery']

In [None]:
# Creating our tokenizer function
def spacy_tokenizer_reverse(text):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(text)
    
#     mytokens = [word for word in mytokens if word.pos_ != "PROPN"]
    
    mytokens = [ word if word.pos_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    #grab only my key words
    my_sub_tokens =[]
    for word in set(mytokens):
        if word in key_words:
            my_sub_tokens.append(word)

    # return preprocessed list of tokens
    return my_sub_tokens

#### CV

In [None]:
cv = CountVectorizer(tokenizer=spacy_tokenizer_reverse,  stop_words='english', ngram_range=(1,1))

In [None]:
dtm = cv.fit_transform(nlp_rope['info'].sample(n=1000, random_state=1))

#### LDA Model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=6,               # Number of topics
                                      max_iter=20,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=32,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )

print(lda_model)  # Model attributes

In [None]:
# This can take awhile, we're dealing with a large amount of documents!

lda_output = lda_model.fit_transform(dtm)


#### Diagnose model performance with perplexity and log-likelihood
A model with higher log-likelihood and lower perplexity (exp(-1. * log-likelihood per word)) is considered to be good. Let’s check for our model.



In [None]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(dtm))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(dtm))

# See model parameters
print(lda_model.get_params())

#### Top words for all groups

In [None]:
for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')