In [153]:
import pandas as pd
pd.set_option('float_format', '{:f}'.format)

import numpy as np

import warnings
warnings.filterwarnings('ignore')

import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso

In [154]:
jo = pd.DataFrame([[3.5, 1, 0, 0], [4.5, 0, 0, 1], [5.0, 0, 1, 0], [4.0, 0, 0, 1], [1.0, 0, 1, 1]], columns = ['rating', 'comedy', 'thriller', 'action'])
jo

Unnamed: 0,rating,comedy,thriller,action
0,3.5,1,0,0
1,4.5,0,0,1
2,5.0,0,1,0
3,4.0,0,0,1
4,1.0,0,1,1


In [155]:
jo['predict'] = 3.5 + (0.3 * jo['comedy']) + (0.2 * jo['thriller']) + (0.1 * jo['action'])
jo

Unnamed: 0,rating,comedy,thriller,action,predict
0,3.5,1,0,0,3.8
1,4.5,0,0,1,3.6
2,5.0,0,1,0,3.7
3,4.0,0,0,1,3.6
4,1.0,0,1,1,3.8


In [156]:
mean_squared_error(jo['rating'], jo['predict'])

2.118

In [157]:
jo['predict'] = 4.0 + (-0.1 * jo['comedy']) + (0.1 * jo['thriller']) + (-0.2 * jo['action'])
jo

Unnamed: 0,rating,comedy,thriller,action,predict
0,3.5,1,0,0,3.9
1,4.5,0,0,1,3.8
2,5.0,0,1,0,4.1
3,4.0,0,0,1,3.8
4,1.0,0,1,1,3.9


In [158]:
mean_squared_error(jo['rating'], jo['predict'])

1.9819999999999993

In [159]:
reg = LinearRegression()
reg.fit(jo[['comedy', 'thriller', 'action']], jo['rating'])
print(reg.coef_)
print(reg.intercept_)

[-4.75 -3.25 -4.  ]
8.25


In [160]:
jo['predict'] = reg.predict(jo[['comedy', 'thriller', 'action']])
jo

Unnamed: 0,rating,comedy,thriller,action,predict
0,3.5,1,0,0,3.5
1,4.5,0,0,1,4.25
2,5.0,0,1,0,5.0
3,4.0,0,0,1,4.25
4,1.0,0,1,1,1.0


In [161]:
mean_squared_error(jo['rating'], jo['predict'])

0.025

In [162]:
ex = pd.DataFrame([[1, np.nan, 4.5, np.nan, np.nan, np.nan, 1, 1, 0],
                   [2, 3.5, np.nan, 4.0, 3.5, np.nan, 1, 0, 0],
                   [3, 4.5, 5.0, 2.0, np.nan, np.nan, 0, 0, 1],
                   [4, 5.0, 5.0, 1.0, np.nan, 5.0, 0, 1, 0],
                   [5, 4.0, np.nan, np.nan, 3.0, 4.5, 0, 0, 1],
                   [6, np.nan, 4.0, 4.5, 4.0, np.nan, 0, 0, 0],
                   [7, 1.0, 0.5, np.nan, 2.0, np.nan, 0, 1, 1]],
                   columns = ['movieid', 'jo', 'amy', 'beth', 'meg', 'mommy', 'comedy', 'thriller', 'action'])

In [163]:
ex

Unnamed: 0,movieid,jo,amy,beth,meg,mommy,comedy,thriller,action
0,1,,4.5,,,,1,1,0
1,2,3.5,,4.0,3.5,,1,0,0
2,3,4.5,5.0,2.0,,,0,0,1
3,4,5.0,5.0,1.0,,5.0,0,1,0
4,5,4.0,,,3.0,4.5,0,0,1
5,6,,4.0,4.5,4.0,,0,0,0
6,7,1.0,0.5,,2.0,,0,1,1


In [164]:
def user_profile(user):
    ex2 = ex[ex[user].notnull()]
    reg = LinearRegression()
    reg.fit(ex2[['comedy', 'thriller', 'action']], ex2[user])
    predict = reg.predict(ex2[['comedy', 'thriller', 'action']])
    return reg.coef_, reg.intercept_, mean_squared_error(ex2[user], predict)

In [165]:
user_profile('jo')

(array([-4.75, -3.25, -4.  ]), 8.25, 0.025)

In [166]:
user_profile('amy')

(array([ 0.875, -1.75 , -1.75 ]), 5.375, 1.5125)

In [167]:
user_profile('beth')

(array([-0.5, -3.5, -2.5]), 4.5, 9.860761315262648e-31)

In [168]:
user_profile('meg')

(array([-0.5, -1. , -1. ]), 3.999999999999999, 2.465190328815662e-31)

In [169]:
user_profile('mommy')

(array([-5.55111512e-17,  2.50000000e-01, -2.50000000e-01]), 4.75, 0.0)

In [170]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [174]:
def user_profile_lasso(user):
    ex2 = ex[ex[user].notnull()]
    lasso = RandomizedSearchCV(estimator = Lasso(), 
                               param_distributions = {'alpha': uniform(0,1)},
                               n_iter=100,
                               cv=len(ex2))
    lasso.fit(ex2[['comedy', 'thriller', 'action']], ex2[user])
    predict = lasso.predict(ex2[['comedy', 'thriller', 'action']])
    return lasso.best_estimator_.alpha, lasso.best_estimator_.intercept_, lasso.best_estimator_.coef_, mean_squared_error(ex2[user], predict)

In [175]:
user_profile_lasso('jo')

(0.9874638591270191, 3.6, array([-0., -0., -0.]), 1.94)

In [176]:
user_profile_lasso('amy')

(0.9224880870684697, 3.8, array([ 0., -0., -0.]), 2.8600000000000003)

In [177]:
user_profile_lasso('beth')

(0.1253615609712928,
 3.748552295986677,
 array([ 0.        , -2.24710313, -1.24710605]),
 0.28269979986129923)

In [178]:
user_profile_lasso('meg')

(0.39538324412641324, 3.125, array([ 0., -0., -0.]), 0.546875)

In [179]:
user_profile_lasso('mommy')

(0.406380722217611, 4.75, array([ 0.,  0., -0.]), 0.0625)

In [180]:
def user_profile_lasso2(user):
    ex2 = ex[ex[user].notnull()]
    reg = Lasso(alpha=0.15)
    reg.fit(ex2[['comedy', 'thriller', 'action']], ex2[user])
    predict = reg.predict(ex2[['comedy', 'thriller', 'action']])
    return reg.coef_, reg.intercept_, mean_squared_error(ex2[user], predict)

In [181]:
user_profile_lasso2('jo')

(array([-0.        , -0.46428566, -0.53571428]),
 4.107142830563653,
 1.5392857328911582)

In [182]:
user_profile_lasso2('amy')

(array([ 0.        , -0.74999988, -1.24999998]),
 4.749999917309141,
 1.8250000434127045)

In [183]:
user_profile_lasso2('beth')

(array([ 0.        , -2.04997751, -1.0499925 ]),
 3.649992504655581,
 0.3912589944975732)

In [184]:
user_profile_lasso2('meg')

(array([ 0.        , -0.40001524, -0.44999238]), 3.45, 0.16624771379218384)

In [185]:
user_profile_lasso2('mommy')

(array([ 0.,  0., -0.]), 4.75, 0.0625)

In [186]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
genres = pd.read_pickle('genres.p')

In [187]:
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp
51956,337,762,5.0,860255866


In [188]:
genres.sample()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
129011,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [189]:
ratings = ratings.merge(genres, left_on='movieId', right_index=True)
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
56443,373,500,2.0,846830016,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [190]:
genres_cols = genres.columns
genres_cols

Index(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [191]:
train, test = train_test_split(ratings, test_size=.1, random_state=42)

In [None]:
user_profile_list = []

for userId in train['userId'].unique():
    user = train[train['userId'] == userId]
    X_train = user[genres_cols]
    y_train = user['rating']
    
    lasso = RandomizedSearchCV(estimator = Lasso(), 
                               param_distributions = {'alpha': uniform(0,1)},
                               n_iter=20,
                               cv=10,
                               random_state=42)
    
    lasso.fit(X_train, y_train)

    user_profile_list.append([lasso.best_estimator_.alpha, lasso.best_estimator_.intercept_, *lasso.best_estimator_.coef_])

In [None]:
user_profile = pd.DataFrame(user_profile_list, index=train['userId'].unique(), columns=['alpha','intercept', *genres_cols])
user_profile

In [None]:
predict = []

for idx, row in test.iterrows():
    user = row['userId']
    predict.append(sum(user_profile.loc[user, genres_cols] * row[genres_cols]) + user_profile.loc[user, 'intercept'])

rmse = np.sqrt(mean_squared_error(test['rating'], predict))

In [None]:
# n_iter=10, cv=10 : 0.9147408936491921
# n_iter=20, cv=10 : 0.9086938386047155
rmse

In [None]:
user_profile['alpha'].unique()