In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor

Take a peek at first few rows, since df has 2M rows

In [2]:
df = pd.read_csv('comboframe.csv', nrows=3, parse_dates=['date'], infer_datetime_format=True)
df

Unnamed: 0,checkin_id,beer_id,user_id,rating_user,brewery_name,beer_name,beer_style,brewery_id,brewery_type,brewery_country,...,venue_city,venue_country,venue_state,venue_cat,venue_id,checkin_comment,venue_type,rating_global,abv,date
0,821797539,2095023,3340203,3.75,Stone Brewing,Stone Scorpion Bowl IPA,IPA - American,1204,Regional Brewery,United States,...,,,,,,"Easy-drinking, not too hoppy. Solid.",[],3.73789,7.5,2019-10-26 03:52:50+00:00
1,818949121,1709568,3340203,3.5,Ritual Brewing Company,Pale Ale,Pale Ale - American,39329,Micro Brewery,United States,...,Redlands,United States,CA,Nightlife Spot,376422.0,Solid.,['brewery'],3.43165,5.2,2019-10-19 03:07:43+00:00
2,818856642,2734572,3340203,4.25,Ritual Brewing Company,Oil Rig,IPA - Imperial / Double,39329,Micro Brewery,United States,...,Redlands,United States,CA,Nightlife Spot,376422.0,Gooood stuff. Cool place too.,['brewery'],3.87873,9.0,2019-10-19 00:37:37+00:00


In [3]:
df.dtypes

checkin_id                       int64
beer_id                          int64
user_id                          int64
rating_user                    float64
brewery_name                    object
beer_name                       object
beer_style                      object
brewery_id                       int64
brewery_type                    object
brewery_country                 object
brewery_city                    object
brewery_state                   object
brewery_lat                    float64
brewery_lon                    float64
venue_lat                      float64
venue_lon                      float64
venue_city                      object
venue_country                   object
venue_state                     object
venue_cat                       object
venue_id                       float64
checkin_comment                 object
venue_type                      object
rating_global                  float64
abv                            float64
date               dateti

Just focus on some columns that are more easily interpretable in decision trees.  
Not using datetimes at this point, considering the difficulty in parsing out local  
timezones from UTC info based on states, countries, unspecified venues, meaning of  
days of the week, etc.

In [2]:
keep_cols = ['checkin_id','beer_id','user_id','rating_user',
             'beer_style','brewery_type','brewery_state','rating_global','abv']
df = pd.read_csv('comboframe.csv', usecols=keep_cols)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2061965 entries, 0 to 2061964
Data columns (total 9 columns):
checkin_id       int64
beer_id          int64
user_id          int64
rating_user      float64
beer_style       object
brewery_type     object
brewery_state    object
rating_global    float64
abv              float64
dtypes: float64(3), int64(3), object(3)
memory usage: 141.6+ MB


In [32]:
df.head()

Unnamed: 0,checkin_id,beer_id,user_id,rating_user,beer_style,brewery_type,brewery_state,rating_global,abv
0,821797539,2095023,3340203,3.75,IPA - American,Regional Brewery,CA,3.73789,7.5
1,818949121,1709568,3340203,3.5,Pale Ale - American,Micro Brewery,CA,3.43165,5.2
2,818856642,2734572,3340203,4.25,IPA - Imperial / Double,Micro Brewery,CA,3.87873,9.0
3,815159720,1044097,3340203,4.25,IPA - Imperial / Double,Regional Brewery,CA,4.0129,8.5
4,814916483,1070,3340203,3.75,Stout - Russian Imperial,Macro Brewery,CA,3.9142,9.9


In [33]:
min(df.rating_global)

0.0

In [34]:
df.rating_global.isna().sum()

212805

In [35]:
sum(df.rating_global == 0)

7127

In [3]:
df.dropna(subset=['rating_global'], axis=0, inplace=True)

In [4]:
df = df[df.rating_global > 0]
df.shape

(1842033, 9)

In [38]:
min(df.rating_global)

1.00423

In [39]:
df.brewery_type.unique()

array(['Regional Brewery', 'Micro Brewery', 'Macro Brewery', 'Cidery',
       'Nano Brewery', 'Brew Pub', 'Contract Brewery',
       'Bar / Restaurant / Store', 'Home Brewery', 'Meadery'], dtype=object)

In [40]:
sampler = df.sample(5).copy()

In [41]:
sampler

Unnamed: 0,checkin_id,beer_id,user_id,rating_user,beer_style,brewery_type,brewery_state,rating_global,abv
1125827,731131231,2028176,2859099,3.25,IPA - American,Micro Brewery,WA,3.57797,6.2
215238,805198741,1102518,175522,3.5,Red Ale - American Amber / Red,Micro Brewery,WA,3.62687,5.3
1796577,813348609,3386463,2199731,1.0,IPA - Session / India Session Ale,Contract Brewery,Leinster,3.49013,4.3
121494,736365778,3830,2380419,3.75,Stout - American Imperial / Double,Regional Brewery,PA,3.72988,9.1
1489104,796872426,3314314,1743528,3.5,IPA - American,Nano Brewery,VA,3.68357,7.1


Make a mapping for user-specific rating tendency (how user rates vs. mean rating for beers user rates).  
But only make the map for the training set, to avoid "peeking" at test data.  Then apply map to test set.

In [5]:
# make the split before calculating biases, to see how the results fare
trainers, testers = tts(df, test_size=0.02, random_state=0)

In [43]:
testers.shape

(36841, 9)

In [6]:
trainers['user_bias'] = trainers.rating_user - trainers.rating_global
userbiasdict = dict(trainers.groupby('user_id').user_bias.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [45]:
userbiasdict[1397554]   # random user

0.22726999999999986

In [7]:
# now convert the bias column to the mean for the user
trainers['user_bias'] = trainers.user_id.map(userbiasdict)
testers['user_bias'] = testers.user_id.map(userbiasdict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [47]:
trainers.user_bias.isna().sum() # should be 0, since map was built from it

0

In [48]:
testers.user_bias.isna().sum() # probably more than a few test set users weren't in train set

3427

Could just drop those unseen test users and decide not to predict ratings for first-time users,  
but realistically, a recommender/predicter has to deal with cold-starts (first-time users whose bias is unknown).

In [8]:
testers.user_bias.fillna(0.0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [50]:
testers.columns

Index(['checkin_id', 'beer_id', 'user_id', 'rating_user', 'beer_style',
       'brewery_type', 'brewery_state', 'rating_global', 'abv', 'user_bias'],
      dtype='object')

In [9]:
# split the trainers into features and target, and the testers as well
feats = ['checkin_id', 'rating_global', 'abv', 'beer_style',
       'brewery_type', 'brewery_state','user_bias']
target = 'rating_user'
trainX = trainers[feats]
trainY = trainers[target]
testX = testers[feats]
testY = testers[target]

Attempt boosted regression on these features

In [10]:
# need to one-hot encode the string features first
# but sklearn doesn't want to encode NaN's, so convert to empty strings
trainX.fillna('', inplace=True)
testX.fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [11]:
enc = OneHotEncoder(handle_unknown='ignore')  # ignore = convert unseen categories to zeros

In [12]:
onehots = enc.fit(pd.concat([trainX, testX])[['brewery_type','brewery_state', 'beer_style']])

In [13]:
train_cats = onehots.transform(trainX[['brewery_type','brewery_state', 'beer_style']])

In [56]:
train_cats.shape

(1805192, 1018)

In [57]:
trainX.shape

(1805192, 7)

In [31]:
type(train_cats)

scipy.sparse.csr.csr_matrix

In [14]:
test_cats = onehots.transform(testX[['brewery_type','brewery_state', 'beer_style']])

In [33]:
test_cats.shape

(36841, 1018)

In [36]:
type(test_cats)

scipy.sparse.csr.csr_matrix

In [15]:
train_X = np.concatenate([trainX[['checkin_id', 'rating_global', 'user_bias', 'abv']].values, train_cats.toarray()], axis=1)

In [16]:
test_X = np.concatenate([testX[['checkin_id', 'rating_global', 'user_bias', 'abv']].values, test_cats.toarray()], axis=1)

In [38]:
from scipy.sparse import hstack, csr_matrix

In [39]:
train_X = hstack([csr_matrix(trainX[['checkin_id', 'rating_global', 'user_bias', 'abv']].values), train_cats])



[A[A

In [41]:
train_X.shape

(1805192, 1022)

In [42]:
test_X = hstack([csr_matrix(testX[['checkin_id', 'rating_global', 'user_bias', 'abv']].values), test_cats])

In [43]:
from tqdm import tqdm

In [71]:
gbr = GradientBoostingRegressor(verbose=1, n_estimators=20)

In [72]:
train_X.shape

(1805192, 1022)

In [73]:
trainY.shape

(1805192,)

In [74]:
type(train_X)

scipy.sparse.csr.csr_matrix

In [75]:
train_X = csr_matrix(train_X)
test_X = csr_matrix(test_X)

In [76]:
gbr.fit(train_X, trainY.values)

      Iter       Train Loss   Remaining Time 
         1           0.3220            2.69m
         2           0.2998            2.45m
         3           0.2808            2.29m
         4           0.2651            2.14m
         5           0.2515            2.00m
         6           0.2398            1.86m
         7           0.2296            1.73m
         8           0.2209            1.60m
         9           0.2129            1.46m
        10           0.2063            1.33m
        20           0.1705            0.00s


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=20, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=1, warm_start=False)

In [77]:
preds = gbr.predict(test_X)

In [78]:
print(min(preds), max(preds))

1.59887669547 4.63276393538


In [79]:
preds = np.clip(preds,0.25, 5.0)

In [80]:
def rmse(arr1, arr2):
    assert(len(arr1) == len(arr2)), print('Inputs must be same length.')
    diffs = arr1 - arr2
    mse = np.dot(diffs, diffs) / len(diffs)
    return np.sqrt(mse)

In [81]:
rmse(preds, testY)

0.47862522038952604

How about testing just on users whose bias has been shown?

In [69]:
biased_preds = gbr.predict(test_X[testX.user_bias != 0, :])
biased_targets = testY[testX.user_bias != 0]
rmse(biased_preds, biased_targets)

Inputs must be same length.


AssertionError: None

... vs just a baseline:

In [70]:
biased = testX[testX.user_bias != 0]
biasedY = testY[testX.user_bias != 0]
rmse(biasedY, biased.rating_global + biased.user_bias)

0.4619086937490075

In [82]:
sorted(list(zip(gbr.feature_importances_, trainX.columns)), reverse=True)[:10]

[(0.64955881174236763, 'abv'),
 (0.35044118825763237, 'rating_global'),
 (0.0, 'user_bias'),
 (0.0, 'checkin_id'),
 (0.0, 'brewery_type'),
 (0.0, 'brewery_state'),
 (0.0, 'beer_style')]

In [83]:
trainX.shape

(1805192, 7)