## **NAME REDACTED**: Recommender System Case Study

**Imports and configuration**

In [1]:
import os
os.chdir('..')
from src.util import *
from src.recommender_jacob import MovieRecommender
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from src.compute_similarities import find_similarities
import pickle

In [2]:
os.getcwd()

'/Users/kylecaron/Desktop/dsi-recommender-case-study'

### Generate random sample and save train-test .csv files

In [3]:
generate_file_split(random_subset(10000), 0.8)

In [4]:
train_data = pd.read_csv('data/ctrain.csv')
test_data = pd.read_csv('data/ctest.csv')
request_data = pd.read_csv('data/crequests.csv')

In [5]:
train_data.head()

Unnamed: 0,user,movie,rating
0,6039,2791,4
1,6039,2083,3
2,6038,2716,3
3,6038,1223,5
4,6036,3005,2


In [6]:
test_data.head()

Unnamed: 0,user,movie,actualrating
0,1542,1268,3
1,1587,1101,2
2,1545,2916,2
3,1544,2690,3
4,1545,1262,3


In [7]:
request_data.head()

Unnamed: 0,user,movie
0,1542,1268
1,1587,1101
2,1545,2916
3,1544,2690
4,1545,1262


### Fit recommender model

In [8]:
reco_instance = MovieRecommender()
fit_model = reco_instance.fit(train_data)

## Find Similarities

In [233]:
os.getcwd()

'/Users/kylecaron/Desktop/dsi-recommender-case-study'

In [258]:
similarity_df = find_similarities('data/training.csv', 
            'data/the-movies-dataset/movies_metadata.csv', 
            'id', 'genres', 20)
similarity_df.to_pickle('data/similarities')

  if (yield from self.run_code(code, result)):


In [235]:
simdf = pd.read_pickle('data/similarities')

### Generate rating predictions

In [259]:
train_data.head()

Unnamed: 0,user,movie,rating
0,6039,2791,4
1,6039,2083,3
2,6038,2716,3
3,6038,1223,5
4,6036,3005,2


In [260]:
simdf.head()

Unnamed: 0,movie,similar
0,862,"[179, 525, 743, 1411, 1446, 107, 871, 1260, 13..."
1,949,"[75, 82, 630, 681, 730, 947, 957, 1145, 1278, ..."
2,710,"[100, 293, 299, 314, 416, 505, 506, 507, 594, ..."
3,1408,"[356, 1265, 1696, 591, 106, 1627, 1658, 190, 2..."
4,524,"[51, 77, 109, 116, 203, 209, 214, 386, 702, 74..."


In [267]:
def fill_ratings(training_set, n_similar_movies):
    find_similarities('data/training.csv', 
            'data/the-movies-dataset/movies_metadata.csv', 
            'id', 'genres', 20)

In [272]:
train_adjust = pd.merge(train_data, simdf, on='movie', how='left')
train_adjust = train_adjust.set_index(['user', 'movie', 'rating'])
train_adjust = (pd.melt(train_adjust.similar.apply(pd.Series).reset_index(), 
             id_vars=['user', 'movie', 'rating'],
             value_name='similar')
     .set_index(['user', 'movie', 'rating'])
     .drop('variable', axis=1)
     .dropna()
     .sort_index()).reset_index()

In [238]:
#null values in simdf
print(sum(simdf.similar.apply(pd.Series).isnull()))
explode = simdf.similar.apply(pd.Series)


4950


In [274]:
reco_instance = MovieRecommender()
fit_model = reco_instance.fit(train_adjust)

In [275]:
predictions = reco_instance.transform(test_data, simdf)

In [278]:
predictions.head(20)

Unnamed: 0,user,movie,actualrating,prediction
0,877,471,4,
1,1853,471,4,
2,1457,1580,3,
3,1166,1580,4,
4,829,1580,4,
5,1066,1580,4,
6,1230,1580,5,
7,820,1580,2,
8,785,1580,3,
9,2015,1591,5,3.644878


In [241]:
result = pd.merge(predictions, simdf, how='outer', on='movie')

In [242]:
result.head(16)

Unnamed: 0,user,movie,prediction,similar
0,877.0,471,,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."
1,1853.0,471,,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."
2,1457.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
3,1166.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
4,829.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
5,1066.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
6,1230.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
7,820.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
8,785.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
9,2015.0,1591,3.393097,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."


In [243]:
nulldf = result[(result['prediction'].isna()) & 
                (result['similar'].notnull())].reset_index()

fulldf = result[(result['prediction'].notnull()) | 
                (result['similar'].isna())].reset_index()

In [244]:
fulldf.prediction[fulldf.similar.isnull()] = train_data['rating'].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [245]:
test = nulldf.copy()

In [246]:
test.head(16)

Unnamed: 0,index,user,movie,prediction,similar
0,0,877.0,471,,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."
1,1,1853.0,471,,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."
2,2,1457.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
3,3,1166.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
4,4,829.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
5,5,1066.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
6,6,1230.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
7,7,820.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
8,8,785.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
9,10,710.0,1591,,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."


In [247]:
fulldf.head(5)

Unnamed: 0,index,user,movie,prediction,similar
0,9,2015.0,1591,3.393097,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."
1,15,931.0,2366,3.608625,
2,22,1100.0,1025,3.608625,
3,23,1074.0,1084,3.608625,
4,29,889.0,1507,3.608625,


In [248]:
train_data.head()

Unnamed: 0,user,movie,rating
0,6039,2791,4
1,6039,2083,3
2,6038,2716,3
3,6038,1223,5
4,6036,3005,2


In [250]:
testing = test.loc[0, 'similar']

In [251]:
testset = []
for item in testing:

    user_rating = train_data[(train_data['user'] == 877) & 
                         (train_data['movie'] == item)]
    
    if len(user_rating) == 0:
        pass
    else:
        testset.append(user_rating)


In [252]:
len(testset)

0

In [219]:
test.head()

Unnamed: 0,index,user,movie,prediction,similar
0,0,877.0,471,,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."
1,1,1853.0,471,,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."
2,2,1457.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
3,3,1166.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."
4,4,829.0,1580,,"[161, 226, 248, 301, 366, 376, 377, 379, 457, ..."


In [217]:
def get_avg(row):
    ratings = []
    for mov in row['similar']:
        print(mov,user)
    
    
        print(user_rating)
        ratings.append(user_rating)
    #print(ratings)
    arr = np.array(ratings)
    mu = np.mean(arr)
    return mu

        
        

In [218]:
test.apply(get_avg, axis=1)

16 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
22 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
42 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
63 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
101 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
104 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
133 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
136 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
144 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
173 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
174 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
185 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
187 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
194 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []
223 877.0
Empty DataFrame
Columns: [user, movie, rating]
Index: []

ValueError: ('cannot copy sequence with size 0 to array axis with dimension 3', 'occurred at index 0')

### Evaluate model

In [11]:
RMSE(predictions, test_data)

  return this.join(other, how=how, return_indexers=return_indexers)


nan

In [12]:
violin_plot(predictions, test_data)

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match