In [1]:
from pymongo import MongoClient
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
%matplotlib inline

### get route info preped

In [2]:
def dataframe_from_collection(collection):
    '''covnert mongodb dataframe to pandas dataframe'''
    raw_data = collection.find()
    return pd.DataFrame(list(raw_data))

In [3]:
client = MongoClient('mongodb://localhost:27017/')
db = client.routes_updated
route_df = dataframe_from_collection(db.routes)
route_df.head().T

Unnamed: 0,0,1,2,3,4
Aid,0,0,0,0,0
Alpine,0,0,0,0,0
Boulder,0,0,0,0,0
FA,"Adam Winslow, Bill Coe, Kyle Silverman 3/20/09","Adam Winslow, Ujahn Davisson, Bill Coe, Jim Op...",?,,Unknown
Ice,0,0,0,0,0
Mixed,0,0,0,0,0
Sport,0,0,0,0,0
TR,0,0,1,1,1
Trad,1,1,1,1,1
_id,58d2afc740b4412d88f5a7ff,58d2afc740b4412d88f5a800,58d2afc740b4412d88f5a801,58d2afc740b4412d88f5a802,58d2afc740b4412d88f5a803


### make id dataframe

In [4]:
route_df_id = route_df[['name', 'id','average_rating']]

### drop duplicates

In [5]:
# might not need this 
# route_df[route_df.duplicated(subset=['route_url'])].shape

# route_df = route_df.drop_duplicates(subset=['route_url'])

In [6]:
# drop columns
route_df_new = route_df.drop(['uiaa', 'submitted_on',
                            'submitted_by', 'season', 
                            'name', 'id',
                            'average_rating', '_id',
                            'FA', 'route_url'], axis=1)

In [7]:
route_df_new.head()

Unnamed: 0,Aid,Alpine,Boulder,Ice,Mixed,Sport,TR,Trad,grade,height,original_grade,page_views,pitches
0,0,0,0,0,0,0,0,1,5.6,41.0,5.6,588,1.0
1,0,0,0,0,0,0,0,1,5.5,41.0,5.5,642,1.0
2,0,0,0,0,0,0,1,1,5.5,35.0,5.5,176,
3,0,0,0,0,0,0,1,1,5.6,85.0,5.6,1263,1.0
4,0,0,0,0,0,0,1,1,5.6,30.0,5.6,678,1.0


In [8]:
route_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3727 entries, 0 to 3726
Data columns (total 13 columns):
Aid               3727 non-null int64
Alpine            3727 non-null int64
Boulder           3727 non-null int64
Ice               3727 non-null int64
Mixed             3727 non-null int64
Sport             3727 non-null int64
TR                3727 non-null int64
Trad              3727 non-null int64
grade             3727 non-null object
height            2992 non-null float64
original_grade    3727 non-null object
page_views        3727 non-null int64
pitches           2127 non-null float64
dtypes: float64(2), int64(9), object(2)
memory usage: 378.6+ KB


### fill nulls

In [9]:
route_df_new['height'] = route_df_new['height'].fillna(route_df_new['height'].mean())
route_df_new['pitches'] = route_df_new['pitches'].fillna(route_df_new['pitches'].mean())

### make dummies

In [10]:
df_grades = route_df_new[['grade','original_grade']]
df_dummies = pd.get_dummies(df_grades).drop('grade_3rd', axis=1)

In [11]:
concat_list = [route_df_new.drop(['grade','original_grade'], axis=1), df_dummies]
x_routes = pd.concat(concat_list, axis=1).fillna(route_df_new.mean())

In [12]:
x_routes.isnull().values.any()

False

In [13]:
x_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3727 entries, 0 to 3726
Columns: 129 entries, Aid to original_grade_Easy 5th  
dtypes: float64(2), int64(9), uint8(118)
memory usage: 749.8 KB


### prep user info

In [14]:
db = client.users
users_df = dataframe_from_collection(db.users)
users_df.head().T

Unnamed: 0,0,1,2,3,4
_id,58d1912840b4417fa4b16db4,58d1912840b4417fa4b16db5,58d1912840b4417fa4b16db6,58d1912840b4417fa4b16db7,58d1912840b4417fa4b16db8
age,,,,33,
aid_follows,,,C5,,
aid_leads,,,C3,,
city,,Seattle,Hawaii,Corvallis,
compliments,0,1,10,0,0
favorite_climbs,leading trad is rad,"high mountain woody, fridge left, easy in an ...",none specified,outer space,none specified
female,,0,0,0,
ice_follows,,,,,
ice_leads,,,,,


### make id dataframe

In [15]:
users_df_id = users_df[['name', 'id']]

In [16]:
users_df_new = users_df.drop(['name', '_id', 'other_interests', 'id', 'last_vist', 'favorite_climbs'], axis=1)
users_df_new.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
age,,,,33,,44,,25,26,
aid_follows,,,C5,,,,,,,
aid_leads,,,C3,,,,,,,
city,,Seattle,Hawaii,Corvallis,,Fort Thomas,San Diego,albuquerque,Seattle,Redmond
compliments,0,1,10,0,0,0,2,0,0,0
female,,0,0,0,,0,1,1,0,0
ice_follows,,,,,,,,,WI4,
ice_leads,,,,,,,,,WI3,
likes_gym,0,1,1,1,0,1,0,1,0,1
likes_sport,0,1,1,1,0,1,0,1,0,1


In [17]:
users_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2069 entries, 0 to 2068
Data columns (total 20 columns):
age              903 non-null float64
aid_follows      217 non-null object
aid_leads        217 non-null object
city             1155 non-null object
compliments      2069 non-null int64
female           1260 non-null float64
ice_follows      314 non-null object
ice_leads        314 non-null object
likes_gym        2069 non-null int64
likes_sport      2069 non-null int64
likes_tr         2069 non-null int64
likes_trad       2069 non-null int64
member_since     2069 non-null object
point_rank       1156 non-null float64
sport_follows    1036 non-null object
sport_lead       1036 non-null object
state            1133 non-null object
total_points     2069 non-null int64
trad_follows     998 non-null object
trad_lead        998 non-null object
dtypes: float64(3), int64(6), object(11)
memory usage: 323.4+ KB


### fill nulls

In [18]:
# fill age with mean age
users_df_new['age'] = users_df_new['age'].fillna(users_df_new['age'].mean())

In [19]:
# fix point rank
users_df_new['point_rank'] = users_df_new['point_rank'].fillna(users_df_new['point_rank'].mean())

In [20]:
# fix datetime
users_df_new['member_since'] = pd.to_datetime(users_df_new['member_since'])
users_df_new.head().T

Unnamed: 0,0,1,2,3,4
age,33.2824,33.2824,33.2824,33,33.2824
aid_follows,,,C5,,
aid_leads,,,C3,,
city,,Seattle,Hawaii,Corvallis,
compliments,0,1,10,0,0
female,,0,0,0,
ice_follows,,,,,
ice_leads,,,,,
likes_gym,0,1,1,1,0
likes_sport,0,1,1,1,0


### time to make dummies

In [21]:
dummy_list1 = [u'aid_follows',
             u'aid_leads',
             u'ice_follows',
             u'ice_leads',
             u'sport_follows',
             u'sport_lead',
             u'trad_follows',
             u'trad_lead']
dummy_list2 = [u'city',u'state',]

In [22]:
df_subsets = users_df_new[dummy_list1]
dummy_df1 = pd.get_dummies(df_subsets)

In [23]:
df_subsets = users_df_new[dummy_list2]
dummy_df2 = pd.get_dummies(df_subsets)

In [24]:
dummy_df3 = pd.get_dummies(users_df_new['female']).rename( columns={0.0: 'male', 1.0: 'female'})

In [25]:
users_df_new.head().T
concat_list = [users_df_new.drop(dummy_list1+dummy_list2+['female'], axis=1), dummy_df1, dummy_df2, dummy_df3]
x_users = pd.concat(concat_list, axis=1)

In [26]:
x_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2069 entries, 0 to 2068
Columns: 599 entries, age to female
dtypes: datetime64[ns](1), float64(2), int64(6), uint8(590)
memory usage: 1.3 MB


In [27]:
x_users.isnull().values.any()

False

### Combine x_users and x_routes based on the utility matrix

In [28]:
db = client.utility_matrix
ratings_df = dataframe_from_collection(db.utility_matrix).drop('_id', axis=1)
ratings_df.head()

Unnamed: 0,rating,route_id,user_id
0,1,0,36
1,1,1,36
2,1,2,394
3,1,2,1972
4,1,2,1625


In [29]:
# get list of indices
route_ind_list = ratings_df['route_id'].tolist()
user_ind_list = ratings_df['user_id'].tolist()

In [30]:
left = x_users.iloc[user_ind_list, :].reset_index().drop('index', axis=1)
right = x_routes.iloc[route_ind_list, :].reset_index().drop('index', axis=1)
data_df = left.join(right)

In [31]:
data_df.isnull().values.any()

False

In [33]:
model_df = data_df.join(ratings_df)
model_df.head()

Unnamed: 0,age,compliments,likes_gym,likes_sport,likes_tr,likes_trad,member_since,point_rank,total_points,aid_follows_ C0,...,original_grade_5.8,original_grade_5.8+,original_grade_5.8-,original_grade_5.9,original_grade_5.9+,original_grade_5.9-,original_grade_Easy 5th,rating,route_id,user_id
0,35.0,40,1,1,1,1,2008-06-29,611.0,1100,1,...,0,0,0,0,0,0,0,1,0,36
1,35.0,40,1,1,1,1,2008-06-29,611.0,1100,1,...,0,0,0,0,0,0,0,1,1,36
2,29.0,4,1,1,0,1,2011-05-28,172.0,2945,0,...,0,0,0,0,0,0,0,1,2,394
3,27.0,0,1,1,1,1,2012-04-16,1053.0,656,0,...,0,0,0,0,0,0,0,1,2,1972
4,33.282392,0,0,0,0,0,2015-04-20,6388.894464,0,0,...,0,0,0,0,0,0,0,1,2,1625


In [34]:
model_df.isnull().values.any()

False

### save as csv because mongodb doesn't like '.'  in dict key

In [36]:
model_df.to_csv("model_df.csv", sep='\t')

In [38]:
pd.read_csv("model_df.csv", sep='\t')

Unnamed: 0.1,Unnamed: 0,age,compliments,likes_gym,likes_sport,likes_tr,likes_trad,member_since,point_rank,total_points,...,original_grade_5.8,original_grade_5.8+,original_grade_5.8-,original_grade_5.9,original_grade_5.9+,original_grade_5.9-,original_grade_Easy 5th,rating,route_id,user_id
0,0,35.000000,40,1,1,1,1,2008-06-29,611.000000,1100,...,0,0,0,0,0,0,0,1,0,36
1,1,35.000000,40,1,1,1,1,2008-06-29,611.000000,1100,...,0,0,0,0,0,0,0,1,1,36
2,2,29.000000,4,1,1,0,1,2011-05-28,172.000000,2945,...,0,0,0,0,0,0,0,1,2,394
3,3,27.000000,0,1,1,1,1,2012-04-16,1053.000000,656,...,0,0,0,0,0,0,0,1,2,1972
4,4,33.282392,0,0,0,0,0,2015-04-20,6388.894464,0,...,0,0,0,0,0,0,0,1,2,1625
5,5,14.000000,0,0,1,1,1,2016-06-21,7902.000000,31,...,0,0,0,0,0,0,0,1,2,1522
6,6,33.282392,0,0,0,0,0,2016-03-09,6388.894464,0,...,0,0,0,0,0,0,0,3,3,233
7,7,30.000000,1,1,1,1,1,2014-09-22,2899.000000,186,...,0,0,0,0,0,0,0,3,3,1716
8,8,34.000000,20,1,1,1,1,2014-07-14,1217.000000,551,...,0,0,0,0,0,0,0,3,3,1399
9,9,33.282392,8,1,1,1,1,2009-09-15,238.000000,2385,...,0,0,0,0,0,0,0,2,3,1935
