In [1]:
import requests
from bs4 import BeautifulSoup
import time
from urllib import urlencode
import pandas as pd
from pymongo import MongoClient
from parse_clean_store import (parse_route_page, parse_user_page)

In [3]:
# Connect to the local MongoDB server and pick the collection that holds the
# raw scraped route pages.
client = MongoClient('mongodb://localhost:27017/')
db = client['routes_collection_updated']
routes_collection = db['routes_collection']

# Drain the cursor into a DataFrame (one row per stored document) and preview it.
raw_data = routes_collection.find()
route_df = pd.DataFrame(list(raw_data))
route_df.head(n=5)

Unnamed: 0,_id,html,route,url
0,58d29f9540b4412f7b79585d,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",30(1) Feet of Pleasure,/v/301-feet-of-pleasure/106862592
1,58d29f9540b4412f7b79585f,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",41 Feet of Pain,/v/41-feet-of-pain/106862549
2,58d29f9540b4412f7b795861,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",5.5 Crack,/v/55-crack/108318876
3,58d29f9540b4412f7b795863,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",5.6 crack,/v/56-crack/106630563
4,58d29f9540b4412f7b795865,"<!DOCTYPE html><html xmlns=""http://www.w3.org/...",5.6 Dihedral,/v/56-dihedral/106165717


### parse and clean route info

In [11]:
# Parse every scraped route page into a cleaned record, tagging each one with a
# sequential id (its position in route_df) and the URL it was scraped from.
# NOTE(review): parse_route_page is called as (_id, html) while parse_user_page
# is called as (html, _id) further down -- confirm both argument orders against
# parse_clean_store.
dict_list = []
for _id, (html, url) in enumerate(zip(route_df['html'], route_df['url'])):
    route_dict = parse_route_page(_id, html)
    route_dict['route_url'] = url  # create_utility_matrix joins ratings to routes on this
    dict_list.append(route_dict)

# To eyeball the parsed records: pd.DataFrame(dict_list).head(50).T

### store route info

In [12]:
# Persist the parsed route records in the `routes` collection of the
# `routes_updated` database.
# NOTE(review): presumably re-running this cell inserts the same records a
# second time -- confirm before re-executing.
client = MongoClient('mongodb://localhost:27017/')
db = client['routes_updated']
routes = db['routes']
routes.insert_many(dict_list)

<pymongo.results.InsertManyResult at 0x7fb0211e3960>

### clean and store user info

In [None]:
# Read the stored user pages from the `users_info` collection into a DataFrame
# and preview the first rows.
client = MongoClient('mongodb://localhost:27017/')
db = client['users_info']
users_info = db['users_info']
raw_data = users_info.find()
df_user_html = pd.DataFrame(list(raw_data))
df_user_html.head(n=5)

In [None]:
# Parse each stored user page and write the cleaned record straight into the
# `users` collection, tagging it with a sequential id (its position in
# df_user_html).
client = MongoClient('mongodb://localhost:27017/')
db = client.users
users = db.users
for _id, html in enumerate(df_user_html['html']):
    # Bound to its own name so the global `dict_list` (the list of parsed route
    # records built earlier) is not overwritten by a single user record.
    # NOTE(review): argument order here is (html, _id) but parse_route_page is
    # called as (_id, html) -- confirm against parse_clean_store.
    user_dict = parse_user_page(html, _id)
    users.insert_one(user_dict)

### create utility matrix

In [3]:
def dataframe_from_collection(collection):
    '''Read every document of a MongoDB collection into a pandas DataFrame.

    collection: any object whose ``find()`` returns an iterable of documents.
    Returns a DataFrame built from the full list of those documents.
    '''
    documents = [document for document in collection.find()]
    return pd.DataFrame(documents)

In [11]:
def _ids_by_key(frame, key_column):
    '''Map each value of frame[key_column] to the list of frame['id'] values found on its rows.'''
    lookup = {}
    for key, value in zip(frame[key_column], frame['id']):
        lookup.setdefault(key, []).append(value)
    return lookup


def create_utility_matrix(df_ratings, df_users, df_routes):
    '''Build a long-format ratings table with one (route_id, user_id, rating) row per rating.

    df_ratings: one row per route; reads 'route_url' plus 'username' and
        'rating', which hold parallel sequences (one entry per rating).
    df_users:   reads 'name' and 'id'.
    df_routes:  reads 'route_url' and 'id'.

    A route is resolved by matching 'route_url'; a user by matching 'name'
    after removing non-breaking spaces (U+00A0) from the scraped username.
    Routes or users that match no id, or more than one id, cannot be resolved
    unambiguously: they are printed and skipped instead of raising or leaving
    a half-filled row behind.

    Returns a DataFrame with columns ['route_id', 'user_id', 'rating'].  The
    columns are object dtype so the ids stay plain Python ints.
    '''
    # Build both lookups once instead of scanning the whole frame per rating.
    route_ids = _ids_by_key(df_routes, 'route_url')
    user_ids = _ids_by_key(df_users, 'name')

    records = []
    for usernames, ratings, route_url in zip(df_ratings['username'],
                                             df_ratings['rating'],
                                             df_ratings['route_url']):
        matches = route_ids.get(route_url, [])
        # skip routes that are unknown or share a url until given other info
        if len(matches) != 1:
            print('route_url:  %s' % (route_url,))
            continue
        route_id = int(matches[0])
        for username, rating in zip(usernames, ratings):
            # clean username to match: drop non-breaking spaces
            username = username.replace(u'\xa0', u'')
            matches = user_ids.get(username, [])
            if len(matches) != 1:
                print('user:  %s' % (username,))
                continue
            # A record is only created once both ids are resolved, so a skipped
            # user can never leave a partially filled row in the result.
            records.append({'route_id': route_id,
                            'user_id': int(matches[0]),
                            'rating': rating})
    # Assemble the frame in one go; growing it row by row with .loc is quadratic.
    return pd.DataFrame(records, columns=['route_id', 'user_id', 'rating'],
                        dtype=object)

In [4]:
client = MongoClient('mongodb://localhost:27017/')

# Ratings, loaded with every stored field.
db = client['ratings_collection_updated']
ratings_df = dataframe_from_collection(db['ratings_collection'])

# Users, narrowed to the name -> id columns.
db = client['users']
df_users = dataframe_from_collection(db['users'])
users_df = df_users[['name', 'id']]

# Routes, narrowed to name, id and the url the ratings are matched on.
db = client['routes_updated']
df_routes = dataframe_from_collection(db['routes'])
routes_df = df_routes[['name', 'id', 'route_url']]

In [8]:
# Several distinct routes can share one name, so a name alone cannot identify a route.
routes_df.loc[routes_df['name'] == 'A']

Unnamed: 0,name,id,route_url
7,A,7,/v/a/108164257
659,A,659,/v/a/111824585
660,A,660,/v/a/111824768


In [10]:
# Look the id up by route_url instead of by name.
routes_df.loc[routes_df['route_url'] == '/v/a/108164257', 'id'].values

array([7])

### check for route duplicates and drop

In [13]:
# Shape of the rows whose route_url repeats an earlier row's route_url.
routes_df.loc[routes_df.duplicated(subset=['route_url'])].shape

(37, 3)

In [17]:
# Keep only the first row for each route_url.
routes_df_dedup = routes_df.drop_duplicates(subset=['route_url'], keep='first')

In [18]:
# Build the (route_id, user_id, rating) table from the de-duplicated routes.
df = create_utility_matrix(df_ratings=ratings_df,
                           df_users=users_df,
                           df_routes=routes_df_dedup)

In [19]:
# Preview the first rows of the utility matrix.
df.head(n=5)

Unnamed: 0,route_id,user_id,rating
0,0,36,1
1,1,36,1
2,2,394,1
3,2,1972,1
4,2,1625,1


In [20]:
# (rows, columns) of the utility matrix returned by create_utility_matrix.
df.shape

(29892, 3)

### save to database

In [21]:
# Save the utility matrix to the `utility_matrix` collection, one document per row.
db = client.utility_matrix
# 'records' is the documented orient value; the abbreviated 'record' is
# rejected by pandas >= 2.0.
for d in df.to_dict(orient='records'):
    db.utility_matrix.insert_one(d)