# Explore content-based filtering

In [100]:
import pickle
import pandas as pd

route_id_dict = pickle.load( open( "route_table.p", "rb" ) )
user_id_dict = pickle.load( open( "user_table.p", "rb" ) )

## load data to Pandas DF

In [101]:
route_ids = []
frames = []

for route_id, d in route_id_dict.items():
    route_ids.append(route_id)
    frames.append(pd.DataFrame.from_dict(d, orient='index'))

df = pd.concat(frames, keys=route_ids)

df2 = df.unstack(level=-1)

df3 = df2[0]

#df3 = df3.reset_index()
#df3.rename(columns={'index':'route_id'}, inplace=True)
df3.index.name = 'route_id'
df_routes = df3

In [102]:
df_routes.sample(10)

Unnamed: 0_level_0,desc,grade,route_avg_stars,route_lat,route_location,route_long,route_n_star_votes,route_name,route_pitches,route_rating,route_type,url
route_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
107422942,After a bouldery start you will find a few nic...,,3.0,37.2284,"[California, San Francisco Bay Area, Castle Ro...",-122.107,6,Right Arete,1.0,5.9,TR,https://www.mountainproject.com/route/10742294...
114078572,,,2.0,34.3019,"[California, San Bernardino Mountains, Lake Ar...",-117.214,1,12-Gauge,1.0,5.11a,TR,https://www.mountainproject.com/route/11407857...
108006224,,,3.0,34.3124,"[California, San Bernardino Mountains, Big Bea...",-116.88,1,Right Bandana Crack,1.0,5.10a,Trad,https://www.mountainproject.com/route/10800622...
106126708,Sit start underneath the overhang. Make...,,3.2,37.4182,"[California, Sierra Eastside, Bishop Area, Vol...",-118.452,16,East Easy Rider,,V5,Boulder,https://www.mountainproject.com/route/10612670...
107062895,This climb is rich with variety. The first 4 b...,,4.5,37.3836,"[California, Sierra Eastside, Bishop Area, Pin...",-118.677,33,The Remington Electric,1.0,5.11c,Sport,https://www.mountainproject.com/route/10706289...
106691763,This is a great warm up. Beta is very cut and ...,,3.4,34.4977,"[California, Central Coast, Santa Barbara, * S...",-119.862,34,Deep Forest Arete,,V0,Boulder,https://www.mountainproject.com/route/10669176...
106324296,This route is not listed in Bishop Area Rock C...,,2.8,36.5971,"[California, Sierra Eastside, Lone Pine Area, ...",-118.129,12,UnNamed,,5.9,Sport,https://www.mountainproject.com/route/10632429...
113796640,,,0.0,34.3035,"[California, San Bernardino Mountains, Lake Ar...",-117.223,0,Bone Dry,1.0,5.7,Trad,https://www.mountainproject.com/route/11379664...
114495265,The crux is near the bottom. It starts vertic...,,3.7,35.3499,"[California, Central Coast, San Luis Obispo, C...",-120.815,3,Unknown,1.0,5.8+,TR,https://www.mountainproject.com/route/11449526...
112431859,,,3.0,33.2209,"[California, San Diego, North SD, Culp Valley,...",-116.45,1,Left face,,V1,Boulder,https://www.mountainproject.com/route/11243185...


In [103]:
' '.join(df_routes[['route_location']].values[0][0])

'California San Francisco Bay Area Castle Rock Area Shady Rock'

## Concatenate data to make a document for each route

In [119]:
import string
import nltk
nltk.download('stopwords')

stopwords = set(nltk.corpus.stopwords.words('english') + ['california','climb','rock'])
stemmer = nltk.stem.PorterStemmer()

route_text = {}
for route_id, route_details in df_routes.iterrows():
    #join the relevent features of the climbing route to create a document
    if route_details['grade'] == None:
        grade = ''
    else:
        grade = 'grade'+str(route_details['grade'])
        
    if pd.isnull(route_details['desc']):
        route_details['desc'] = ''
    
    content = route_details['desc'] + ' ' + route_details['route_rating'] + ' ' + \
                route_details['route_type'] + ' ' + ' '.join(route_details['route_location']) + ' ' +\
                grade
    
    #print(content)
    #print(' ')
    #print('-----------------------------')
    #print(' ')
    #pre-processing, make all words lowercase and remove punctuation
    content = content.lower()
    table = str.maketrans('', '', string.punctuation)
    content = content.translate(table)
    #print(content)
    #print(' ')
    #print('-----------------------------')
    #print(' ')
    
    # Create stopwords list, convert to a set for speed
    content = [word for word in content.split() if word not in stopwords]
    #print(content)
    
    #print(' ')
    #print('-----------------------------')
    #print(' ')
    
    content = " ".join([stemmer.stem(word) for word in content])
    #print(content)
    
    #print(' ')
    #print('***************************************************************************************')
    #print(' ')

    route_text[route_id] = content
    
route_text_df = pd.DataFrame.from_dict(route_text,orient='index',columns=['route_text'])
route_text_df.index.name = 'route_id'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [126]:
route_text_list = []
for key in route_text:
    route_text_list.append(route_text[key])

In [135]:
import sklearn
# Generate tf-idf object with maximum vocab size of 1000
tf_counter = sklearn.feature_extraction.text.TfidfVectorizer(max_features = 100)
# Get tf-idf matrix as sparse matrix
tfidf = tf_counter.fit_transform(route_text_list)
# Get the words corresponding to the vocab index
tf_counter.get_feature_names()

['58',
 '59',
 'anchor',
 'angel',
 'area',
 'aret',
 'around',
 'back',
 'basin',
 'belay',
 'big',
 'bishop',
 'bolt',
 'boulder',
 'cam',
 'canyon',
 'central',
 'climb',
 'coast',
 'continu',
 'corner',
 'corridor',
 'crack',
 'crag',
 'crux',
 'easi',
 'east',
 'eastsid',
 'end',
 'face',
 'feet',
 'finish',
 'first',
 'flake',
 'follow',
 'fun',
 'gear',
 'get',
 'go',
 'good',
 'gradenan',
 'hand',
 'head',
 'high',
 'hold',
 'lake',
 'larg',
 'lead',
 'ledg',
 'left',
 'line',
 'lo',
 'lower',
 'main',
 'make',
 'mountain',
 'move',
 'nation',
 'north',
 'obviou',
 'one',
 'park',
 'past',
 'pinnacl',
 'pitch',
 'protect',
 'pull',
 'reach',
 'right',
 'roof',
 'rope',
 'rout',
 'san',
 'santa',
 'section',
 'short',
 'side',
 'sierra',
 'slab',
 'small',
 'south',
 'sport',
 'start',
 'steep',
 'straight',
 'summit',
 'taho',
 'thin',
 'top',
 'tr',
 'trad',
 'travers',
 'tree',
 'two',
 'use',
 'valley',
 'wall',
 'way',
 'west',
 'yosemit']

In [136]:
from sklearn.metrics.pairwise import linear_kernel

cosine_similarities = linear_kernel(tfidf, tfidf)
print(cosine_similarities.shape)

for idx, row in route_text_df.iterrows(): #iterates through all the rows

# the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity#
    similar_indices = cosine_similarities[idx].argsort()[:-5:-1] 

    #stores 5 most similar routes, you can change it as per your needs
    similar_items = [(cosine_similarities[idx][i], ds['route_id'][i]) for i in similar_indices]
    results[row['route_id']] = similar_items[1:]

MemoryError: 

In [137]:
%debug

> [1;32mc:\users\georg\appdata\local\continuum\anaconda3\lib\site-packages\scipy\sparse\base.py[0m(1187)[0;36m_process_toarray_args[1;34m()[0m
[1;32m   1185 [1;33m            [1;32mreturn[0m [0mout[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m   1186 [1;33m        [1;32melse[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m-> 1187 [1;33m            [1;32mreturn[0m [0mnp[0m[1;33m.[0m[0mzeros[0m[1;33m([0m[0mself[0m[1;33m.[0m[0mshape[0m[1;33m,[0m [0mdtype[0m[1;33m=[0m[0mself[0m[1;33m.[0m[0mdtype[0m[1;33m,[0m [0morder[0m[1;33m=[0m[0morder[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m   1188 [1;33m[1;33m[0m[0m
[0m[1;32m   1189 [1;33m[1;33m[0m[0m
[0m
ipdb> self.shape
(20319, 20319)
ipdb> exit()
