In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import cross_validate, train_test_split, KFold


In [4]:
df = pd.read_csv('bigframe.csv')

#### Remove all but the most recent one User/Beer combo for multiple such checkins

In [6]:
df.drop_duplicates(subset=['user_id', 'beer_id'], inplace=True)  # keep_first (default) means keep most recent checkin

In [7]:
df.shape  # remaining from 1.42M prev

(1296064, 27)

In [8]:
reader = Reader(rating_scale=(0.25, 5.0))

### Put this into surprise's format (user, beer, rating)

In [10]:
# The columns must correspond to user_id, beer_id and ratings (in that order).
data = Dataset.load_from_df(df[['user_id', 'beer_id', 'rating_user']], reader)

In [12]:
# define a cross-validation iterator
kf = KFold(n_splits=5)

algo = SVD()

In [13]:
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)


# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.4737


0.47365809752496973

In [17]:
predictions[:11]

[Prediction(uid=171496, iid=3034980, r_ui=4.25, est=4.4778698277391804, details={'was_impossible': False}),
 Prediction(uid=1803223, iid=1828774, r_ui=3.25, est=3.5844314206255463, details={'was_impossible': False}),
 Prediction(uid=267320, iid=399609, r_ui=4.0, est=3.981760322915016, details={'was_impossible': False}),
 Prediction(uid=2924460, iid=1187911, r_ui=4.0, est=3.6026735719768013, details={'was_impossible': False}),
 Prediction(uid=1749282, iid=2686615, r_ui=3.5, est=3.6181331817534472, details={'was_impossible': False}),
 Prediction(uid=3292172, iid=3970, r_ui=3.25, est=3.9172385217788266, details={'was_impossible': False}),
 Prediction(uid=4396115, iid=4057, r_ui=4.0, est=3.2826608617101911, details={'was_impossible': False}),
 Prediction(uid=959498, iid=1473393, r_ui=3.5, est=3.6003136089757715, details={'was_impossible': False}),
 Prediction(uid=267320, iid=9681, r_ui=4.0, est=4.3961044757993522, details={'was_impossible': False}),
 Prediction(uid=1279351, iid=647032, r_u

How does that compare to just guessing the global mean rating for every beer?  
First we have to remove the beers that don't have global ratings.  We could  
theoretically use the mean of all user ratings in place of the missing global  
ratings, but for beers with few checkins, that would cheat  
by skewing toward the rating of the user being predicted.

In [21]:
global_rated = df[['rating_user', 'rating_global']]

In [22]:
global_rated.shape

(1296064, 2)

In [98]:
global_rated.max()

rating_user      5.00000
rating_global    4.90672
dtype: float64

In [23]:
global_rated.min()

rating_user      0.25
rating_global    0.00
dtype: float64

Whoops, went to the trouble of setting NaN global ratings to 0.0

In [24]:
global_rated = global_rated[global_rated.rating_global > 0]

In [25]:
global_rated.shape

(1089604, 2)

In [26]:
global_rated.min()

rating_user      0.25000
rating_global    1.00423
dtype: float64

In [29]:
# calculate the rmse using global rating as prediction
gr = global_rated
diffs = gr.rating_global.values - gr.rating_user.values
sum_errs_sq = np.dot(diffs, diffs)
rmse = np.sqrt(sum_errs_sq / len(diffs))
rmse

0.51243794703341272

So at least the Surprise rmse was lower than that of (the `global_rated` subset of) the whole population.
  
Quick check of what pct of ratings had globals: 

In [69]:
print(f'{round(len(global_rated) * 100 / len(df), 3)} pct of checkins had global ratings.')

84.07 pct of checkins had global ratings.


In [99]:
# might as well keep track of evaluation results
methods = ['naively predict global_mean_rating']
rmse = [0.512]
results = pd.DataFrame({'method': methods,
                        'rmse': rmse})

def add_results_row(meth_name, rmse_result):
    methods.append(meth_name)
    rmse.append(rmse_result)
    results = pd.DataFrame({'method': methods,
                            'rmse': rmse})
    return results

# make sure function works
results = add_results_row('Surprise SVD with default params', 0.474)
results.tail()

Unnamed: 0,method,rmse
0,naively predict global_mean_rating,0.512
1,Surprise SVD with default params,0.474


In [None]:
# append result rows here and check tail to make sure all good
results = add_results_row(___)
results.tail()

===========================================================================================================

In [32]:
np.random.seed(42)
size=10000
cities=['paris','barca','london','berlin']
deef = pd.DataFrame({'city': np.random.choice(cities, size=size), 'booked_perc':np.random.rand(size)})
deef['id'] = deef.index.map(str) + '-' + deef.city
deef = deef[['id','city','booked_perc']]
deef.head()

Unnamed: 0,id,city,booked_perc
0,0-london,london,0.393636
1,1-berlin,berlin,0.473436
2,2-paris,paris,0.854547
3,3-london,london,0.340004
4,4-london,london,0.86965


fastest way to sum a column

In [53]:
%%timeit
deef.booked_perc.sum()

The slowest run took 6.70 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 198 µs per loop


fastest way to sum a column and filter it first

In [54]:
%%timeit
np.dot(deef.booked_perc.values, np.logical_and((deef.booked_perc < 0.5), deef.city=='berlin'))

100 loops, best of 3: 2.6 ms per loop


or with one filter only:

In [55]:
%%timeit
deef[deef.booked_perc < 0.5].booked_perc.sum()

100 loops, best of 3: 2 ms per loop


or with 2 filters:

In [56]:
%%timeit
deef[(deef.booked_perc < 0.5) & (deef.city == 'berlin')].booked_perc.sum()

100 loops, best of 3: 3.39 ms per loop


make a column of differences from same column, one row to next, for rows 1 to N-1

In [61]:
delters = deef.booked_perc.values - deef.booked_perc.values[:-1]


10000

In [67]:
%%timeit
delters = deef.booked_perc.values - np.append(np.array([0]), deef.booked_perc.values[:-1])

The slowest run took 6.01 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 52 µs per loop


In [68]:
%%timeit
delters = deef.booked_perc.pct_change()

100 loops, best of 3: 1.34 ms per loop
