In [1]:
import pandas as pd
import numpy as np
import fnmatch
import os

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, r2_score, mean_squared_error

In [2]:
def repl(string):
    string1 = string.replace(']','').replace('[', '').replace('\'','')
    return string1

def str_to_list(string):
    lst = list(string[1:-1].split(", "))
    lst1 = []
    for el in lst:
        lst1.append(el[1:-1])
    
    return ' '.join(lst1)

In [3]:
def one_decimal(rating):
    return round(rating, 1)

def categorize_ratings(df, col):
    df[col] = df[col].apply(one_decimal)
    return df
def lower(text):
    return text.lower()

def lower_cols(df, cols):
    for col in cols:
        df[col] = df[col].apply(lower)
    return df

def remove_spaces(text):
    text = text.replace(' ', '', 10)
    return text

def join_names(df, col):
    df[col] = df[col].apply(remove_spaces)
    return df

def read_books_info(filespath='../data/big_data_temp/'):
    pattern = 'gr_books_df_*.csv'
    print('Pattern :', pattern )

    files = os.listdir(filespath) 
    dfs_files = []
    for name in files: 
        if fnmatch.fnmatch(name, pattern):
            dfs_files.append(name)
    dfs_files
    dfs_paths = []
    for file in dfs_files:
        dfs_paths.append(filespath + file)

    revs_lst = []
    for file in dfs_paths:
        try:
            revs_lst.append(pd.read_csv(file))
        except:
            print(f'file {file} failed')
    print(len(revs_lst))
    df = pd.concat(revs_lst,axis=0)
    return df

In [536]:
books_df = read_books_info()
books_df['author_name'] = books_df['author_name'].apply(repl)
books_df = categorize_ratings(books_df,'avg_rating')
books_df['genres'] = books_df['genres'].apply(str_to_list)
books_df = books_df[['isbn', 'book_title', 'avg_rating', 'author_name', 'book_desc', 'genres']]
books_df = lower_cols(books_df, ['book_title', 'author_name', 'book_desc', 'genres'])
books_df = join_names(books_df, 'author_name')

Pattern : gr_books_df_*.csv
6


In [538]:
books_df.head(2)

Unnamed: 0,isbn,book_title,avg_rating,author_name,book_desc,genres
0,1594488843,drive: the surprising truth about what motivat...,4.0,danielh.pink,the new york times bestseller that gives reade...,nonfiction business psychology leadership self...
1,033050889X,the mind's eye,3.9,oliversacks,"in musicophilia, oliver sacks explored music a...",nonfiction science psychology biology neurosci...


### TfIdf for books_df

In [300]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english', max_features=200)
tfidf_matrix = tf.fit_transform(books_df['genres'])

# tf.stop_words_
# tf.vocabulary_
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tf.get_feature_names())
books_df = pd.concat([books_df, tfidf_df], axis=1)
features = tf.get_feature_names()

In [308]:
tfidf_df.head(3)

Unnamed: 0,19th,20th,21st,academic,adult,adventure,africa,american,animals,anthologies,...,travel,trek,trivia,true,tv,unfinished,war,womens,world,writing
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.294656
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.409444,0.0,0.0,0.0,0.0


In [585]:
reviews_df = pd.read_csv('../data/big_data_temp/reviews.csv')

to_pivot = reviews_df[['UserID', 'ISBN','Rating']].drop_duplicates(subset=None, keep='first')

to_pivot[to_pivot[['UserID', 'ISBN']].duplicated()]
to_pivot.drop(index=[344,66423],inplace=True)

to_pivot

In [507]:
sparse_df = to_pivot.pivot(index='UserID', columns='ISBN', values='Rating')

In [510]:
sparse_df.fillna(0.0, inplace=True)

### Light FM

In [329]:
from lightfm import LightFM
from lightfm.data import Dataset

In [331]:
dataset = Dataset()
dataset.fit(reviews_df['UserID'],reviews_df['ISBN'])

In [339]:
dataset.fit_partial(items=books_df['isbn'], users=reviews_df['UserID'], item_features=features, user_features=None)

In [348]:
user_isbn = zip(reviews_df['UserID'], reviews_df['ISBN'])

In [349]:
(interactions, weights) = dataset.build_interactions(user_isbn)

In [335]:
np.array(books_df.loc[298,['isbn']])[0]

'0525954821'

In [343]:
list(tf_m_arr[0])

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3768485485053178,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.24610936727437915,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.2249084177636906,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.40275026303800254,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.2984450681050103,
 0.0,
 0.0,
 0.0,
 0.3617456892004505,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.06540415309031819,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.24610936727437915,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3431793038603882,
 0.0,
 0.12530726027986935,
 0.0,

In [345]:
tfidf_dct = tfidf_df.to_dict('index')

In [350]:
tf_m_arr = tfidf_matrix.toarray()

features_tuple = [(np.array(books_df.loc[i,['isbn']])[0], list(tfidf_dct[i])) for i in range(tf_m_arr.shape[0])]

In [351]:
item_features = dataset.build_item_features(features_tuple)

In [352]:
model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)

<lightfm.lightfm.LightFM at 0x7fa3aaae7210>

In [373]:
model.predict([143], [234])

array([-3.58480263])

In [360]:
import numpy as np
from lightfm.datasets import fetch_movielens
movielens = fetch_movielens()

In [361]:
for key, value in movielens.items():
    print(key, type(value), value.shape)

train <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
test <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
item_features <class 'scipy.sparse.csr.csr_matrix'> (1682, 1682)
item_feature_labels <class 'numpy.ndarray'> (1682,)
item_labels <class 'numpy.ndarray'> (1682,)


In [362]:
train = movielens['train']
test = movielens['test']

In [369]:
train.toarray()

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]], dtype=int32)

In [371]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM(learning_rate=0.05, loss='bpr')
model.fit(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.60, test 0.10.
AUC: train 0.90, test 0.86.


In [366]:
from scipy.sparse import coo_matrix

In [None]:
coo_matrix(D)

In [368]:
coo_matrix (tfidf_matrix.toarray())

<299x186 sparse matrix of type '<class 'numpy.float64'>'
	with 3246 stored elements in COOrdinate format>

### ratings predition with random forest for the train set.

In [5]:
books_df = read_books_info()
books_df['author_name'] = books_df['author_name'].apply(repl)
books_df = categorize_ratings(books_df,'avg_rating')
books_df['genres'] = books_df['genres'].apply(str_to_list)
books_df = books_df[['isbn', 'book_title', 'avg_rating', 'author_name', 'book_desc', 'genres']]
books_df = lower_cols(books_df, ['book_title', 'author_name', 'book_desc', 'genres'])
books_df = join_names(books_df, 'author_name')
books_df.reset_index(inplace=True)

Pattern : gr_books_df_*.csv
6


In [49]:
reviews_df = pd.read_csv('../data/big_data_temp/reviews.csv')
# Keep some data for testing
reviews_df_train, reviews_df_test, ys_tr, ys_te = train_test_split(reviews_df, reviews_df['Rating'], test_size=0.2, stratify = reviews_df['Rating'])
reviews_df_test.to_csv('../data/big_data_temp/test_reviews.csv')
reviews_df = reviews_df_train

In [50]:
to_pivot = reviews_df[['UserID', 'ISBN','Rating']].drop_duplicates(subset=None, keep='first')
print(to_pivot[to_pivot[['UserID', 'ISBN']].duplicated()])

     UserID       ISBN  Rating
325   10631  802714625       5


In [54]:
print(f'Number of users: {len(reviews_df.UserID.unique())}')
print(f'Max UserID: {max(reviews_df.UserID.unique())}')

Number of users: 11408
Max UserID: 12392


In [None]:
# Without SVD:

In [475]:
rfc=RandomForestRegressor(n_estimators=1000,max_depth=5, max_features='auto', n_jobs=-1)
rfc.fit(X_train,y_train)
y_pred=rfc.predict(X_test)

In [476]:
print(f'MSE =  {mean_squared_error(y_pred,y_test)}')
print(f'R2 = {r2_score(y_pred, y_test)}')

MSE =  1.063862485527479
R2 = -13.256476603236434


In [477]:
# gbc=GradientBoostingRegressor()
# gbc.fit(X_train,y_train)
# y_pred=gbc.predict(X_test)
# accuracy_score(y_pred,y_test)

In [479]:
df_pred = pd.DataFrame(y_pred)
df_test = pd.DataFrame(y_test)

In [480]:
df_test.reset_index()
df_test['Pred'] = y_pred[0]

In [481]:
df_test.describe()

Unnamed: 0,Rating,Pred
count,4054.0,4054.0
mean,3.914899,4.185964
std,1.069003,3.97953e-13
min,1.0,4.185964
25%,3.0,4.185964
50%,4.0,4.185964
75%,5.0,4.185964
max,5.0,4.185964


In [15]:
to_pivot.drop(index=[325, 66423],inplace=True)

sparse_df = to_pivot.pivot(index='UserID', columns='ISBN', values='Rating')
sparse_df.fillna(0.0, inplace=True)

In [25]:
sparse_df

ISBN,006145205X,014027541X,014311526X,033050889X,038535066X,038553082X,039330700X,039333810X,039335038X,039335279X,...,805095152,805390456,812967887,812975219,865478007,902543522X,91886570,91888050,91898242,965900584
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12387,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12388,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
books_df.head()

Unnamed: 0,index,isbn,book_title,avg_rating,author_name,book_desc,genres
0,0,1594488843,drive: the surprising truth about what motivat...,4.0,danielh.pink,the new york times bestseller that gives reade...,nonfiction business psychology leadership self...
1,1,033050889X,the mind's eye,3.9,oliversacks,"in musicophilia, oliver sacks explored music a...",nonfiction science psychology biology neurosci...
2,2,0465090796,quirkology: how we discover the big truths in ...,3.8,richardwiseman,"for over twenty years, psychologist professor ...",nonfiction psychology science audiobook scienc...
3,3,0684868768,"emergence: the connected lives of ants, brains...",4.0,stevenjohnson,in the tradition of being digital and the tipp...,science nonfiction science technology philosop...
4,4,067976867X,consilience: the unity of knowledge,3.9,edwardo.wilson,one of our greatest living scientists--and the...,science philosophy nonfiction science biology ...


In [760]:
### tfidf

In [24]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english', max_features=200)
tfidf_matrix = tf.fit_transform(books_df['genres'])

# tf.stop_words_
# tf.vocabulary_
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tf.get_feature_names())

print(f'Indices of matrices are identical: {all(books_df.index == tfidf_df.index)}')

books_df = books_df.join(tfidf_df)
features = tf.get_feature_names()

In [810]:
users = content_df.UserID.unique()
books = content_df.ISBN.unique()

n_u = len(users)
n_b = len(books)
n_u* n_b

In [56]:
sparse_df

ISBN,006145205X,014027541X,014311526X,033050889X,038535066X,038553082X,039330700X,039333810X,039335038X,039335279X,...,805095152,805390456,812967887,812975219,865478007,902543522X,91886570,91888050,91898242,965900584
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12387,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12388,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix

svd = TruncatedSVD(n_components=5, n_iter=10, random_state=42)
svd.fit(sparse_df)

result = svd.transform(sparse_df)
result.shape

(11402, 5)

In [63]:
svd_5_df = pd.DataFrame(result, index=sparse_df.index)

In [64]:
svd_5_df

Unnamed: 0_level_0,0,1,2,3,4
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4.272372,-1.270422,-1.128989,-4.362246,0.055161
1,0.630145,0.259981,-0.048365,0.038609,-0.376098
3,0.119610,-0.128369,0.009897,0.000087,0.224200
4,0.190231,-0.064066,-0.045982,-0.070886,0.106251
5,0.547433,0.077496,0.098857,0.245665,0.083601
...,...,...,...,...,...
12387,0.699128,0.290328,0.203523,0.026209,-2.923470
12388,0.047164,0.021638,0.016098,0.020197,0.107159
12389,0.264596,0.183992,0.029223,0.109029,0.112097
12391,0.040479,-0.004121,-0.007023,0.022787,0.007309


In [65]:
svd_5_df.reset_index(inplace=True)
# svd_5_df.rename(columns={'index':'UserID'},inplace=True)

In [66]:
svd_5_df

Unnamed: 0,UserID,0,1,2,3,4
0,0,4.272372,-1.270422,-1.128989,-4.362246,0.055161
1,1,0.630145,0.259981,-0.048365,0.038609,-0.376098
2,3,0.119610,-0.128369,0.009897,0.000087,0.224200
3,4,0.190231,-0.064066,-0.045982,-0.070886,0.106251
4,5,0.547433,0.077496,0.098857,0.245665,0.083601
...,...,...,...,...,...,...
11397,12387,0.699128,0.290328,0.203523,0.026209,-2.923470
11398,12388,0.047164,0.021638,0.016098,0.020197,0.107159
11399,12389,0.264596,0.183992,0.029223,0.109029,0.112097
11400,12391,0.040479,-0.004121,-0.007023,0.022787,0.007309


In [518]:
###content df preprocessing

In [67]:
content_df = pd.merge(reviews_df, books_df, how='left', left_on='ISBN', right_on='isbn').dropna()
content_df = content_df.drop(columns=['Unnamed: 0', 'isbn', 'book_title', 'author_name', 'book_desc', 'genres'])

content_df = pd.merge(content_df, svd_5_df, how='left',on='UserID')

content_df.head()

Unnamed: 0,ISBN,UserID,Rating,index,avg_rating,19th,20th,21st,academic,adult,...,unfinished,war,womens,world,writing,0,1,2,3,4
0,1101984597,11051,5,0.0,4.2,0.0,0.0,0.0,0.0,0.387448,...,0.0,0.0,0.0,0.0,0.0,8.020884,3.725489,1.366453,1.914509,-7.013962
1,1851687793,341,4,28.0,3.9,0.0,0.0,0.0,0.0,0.0,...,0.305845,0.0,0.0,0.0,0.0,0.062571,0.007594,0.008298,0.057658,0.061236
2,902543522X,534,4,37.0,3.9,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033398,-0.003643,0.027115,0.015707,0.027908
3,014027541X,11448,4,26.0,4.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.062886,0.028851,0.021465,0.026929,0.142878
4,159420229X,6484,4,11.0,3.9,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.385566,0.674054,0.420167,0.387916,0.028966


In [70]:
content_df[content_df['ISBN'].isna()]

Unnamed: 0,ISBN,UserID,Rating,index,avg_rating,19th,20th,21st,academic,adult,...,unfinished,war,womens,world,writing,0,1,2,3,4


In [816]:
X = content_df.copy()
y = X.pop('Rating')

In [817]:
y = y.apply(float)

In [794]:
X.head(2)

Unnamed: 0,ISBN,UserID,index_x,avg_rating,19th,20th,21st,academic,adult,adventure,...,war,womens,world,writing,index_y,0,1,2,3,4
0,1592406599,6650,48.0,3.9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6650.0,0.602262,-3.873803,0.516025,-0.025861,-0.155379
1,1861978766,6671,26.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6671.0,0.160134,0.058783,0.108543,0.121479,-0.390201


In [818]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

In [819]:
user_isbn_train = X_train[['UserID', 'ISBN']]
user_isbn_test = X_test[['UserID', 'ISBN']]
X_train = X_train.drop(columns=['UserID', 'ISBN'])
X_test = X_test.drop(columns=['UserID', 'ISBN'])

In [820]:
X_train = X_train.drop(columns=['index_x','index_y'])

In [821]:
X_train[X_train.isna()]

Unnamed: 0,avg_rating,19th,20th,21st,academic,adult,adventure,africa,american,animals,...,unfinished,war,womens,world,writing,0,1,2,3,4
9900,,,,,,,,,,,...,,,,,,,,,,
9000,,,,,,,,,,,...,,,,,,,,,,
9303,,,,,,,,,,,...,,,,,,,,,,
2490,,,,,,,,,,,...,,,,,,,,,,
10564,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7357,,,,,,,,,,,...,,,,,,,,,,
4414,,,,,,,,,,,...,,,,,,,,,,
6984,,,,,,,,,,,...,,,,,,,,,,
5873,,,,,,,,,,,...,,,,,,,,,,


In [786]:
# Without SVD:

In [799]:
rfc=RandomForestRegressor(n_estimators=1000,max_depth=5, max_features='auto', n_jobs=-1)
rfc.fit(X_train,y_train)
y_pred=rfc.predict(X_test)

print(f'MSE =  {mean_squared_error(y_pred,y_test)}')
print(f'R2 = {r2_score(y_pred, y_test)}')

# gbc=GradientBoostingRegressor()
# gbc.fit(X_train,y_train)
# y_pred=gbc.predict(X_test)
# accuracy_score(y_pred,y_test)

df_pred = pd.DataFrame(y_pred)
df_test = pd.DataFrame(y_test)

df_test.reset_index()
df_test['Pred'] = y_pred[0]

df_test.describe()

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [632]:
# With SVD:

In [481]:
rfc=RandomForestRegressor(n_estimators=1000,max_depth=5, max_features='auto', n_jobs=-1)
rfc.fit(X_train,y_train)
y_pred=rfc.predict(X_test)

print(f'MSE =  {mean_squared_error(y_pred,y_test)}')
print(f'R2 = {r2_score(y_pred, y_test)}')

df_pred = pd.DataFrame(y_pred)
df_test = pd.DataFrame(y_test)

df_test.reset_index()
df_test['Pred'] = y_pred[0]

df_test.describe()

In [None]:
# rfc.predict(X_test)

In [635]:
X_test

Unnamed: 0,index_x,avg_rating,19th,20th,21st,academic,adult,adventure,africa,american,...,war,womens,world,writing,index_y,0,1,2,3,4
14466,33.0,4.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,3788,0.192330,0.033792,0.007854,0.007132,-0.073681
14916,18.0,4.0,0.0,0.0,0.0,0.348379,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,10924,0.016456,0.008767,0.019750,0.020787,0.028996
8177,27.0,4.2,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,11547,0.160250,-0.002660,0.124062,0.019191,-0.152169
18925,11.0,4.1,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,5931,0.040909,-0.008463,0.021125,0.061762,0.174223
5610,9.0,3.8,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,10026,0.202833,-0.007743,0.060261,0.148803,0.376101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15958,43.0,4.1,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.365699,9480,0.407080,0.070342,0.124524,0.222818,0.380940
807,9.0,4.0,0.0,0.0,0.0,0.000000,0.254748,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,2816,5.782952,0.377226,-3.399042,0.422128,-4.968830
2620,17.0,4.4,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,6759,0.315508,0.003683,0.097574,0.227000,0.506375
7212,30.0,4.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,11778,0.402372,0.154817,0.194839,0.126299,-0.083244


### Train the model on whole dataset:

In [726]:
X = rated_subset.copy()
y = X.pop('Rating')

In [727]:
user_isbn_data = X[['UserID', 'ISBN']]
X = X.drop(columns=['UserID', 'ISBN'])

In [728]:
rfr=RandomForestRegressor(n_estimators=1000,max_depth=5, max_features='auto', n_jobs=-1)
rfr.fit(X,y)

RandomForestRegressor(max_depth=5, n_estimators=1000, n_jobs=-1)

In [729]:
user_isbn_data

Unnamed: 0,UserID,ISBN
0,510,1400063515
69,4433,1400063515
138,206,1400063515
166,206,076790818X
180,206,1400052173
...,...,...
263791,4892,1439192812
263792,4892,1439192812
263793,4892,1439192812
263794,4892,1439192812


In [730]:
pairs = []
for user  in users:
    for book in books:
        dct = {}
        dct['UserID'] = user
        dct['ISBN'] = book
        pairs.append(dct)

all_df = pd.DataFrame(pairs)
all_df = pd.merge(all_df, books_df, how='left', left_on='ISBN', right_on='isbn')
all_df =all_df.drop(columns=['index', 'isbn', 'book_title', 'author_name', 'book_desc', 'genres'])
all_df = pd.merge(all_df, svd_5_df, how='left', left_on='UserID', right_on='index')
all_df =all_df.drop(columns=['index'])
df_all = pd.merge(all_df, content_df[['UserID','ISBN','Rating']], how='left', on=['UserID','ISBN'])

rated_subset = df_all.dropna()
to_rate = df_all[df_all['Rating'].isna()]

X_to_rate = to_rate.copy()
y = X_to_rate.pop('Rating')
user_isbn_to_rate = X_to_rate[['UserID', 'ISBN']]
X_to_rate = X_to_rate.drop(columns=['UserID', 'ISBN'])

In [731]:
ratings_pred = rfr.predict(X_to_rate)
df_pred = pd.DataFrame(ratings_pred)

In [736]:
predicted_ratings_df = user_isbn_to_rate.reset_index().join(df_pred)

In [743]:
predicted_ratings_df = predicted_ratings_df.rename(columns={0:'Rating'}).drop(columns='index')

In [745]:
predicted_ratings_df.head(2)

Unnamed: 0,UserID,ISBN,Rating
0,510,1400067820,3.967359
1,510,1101874937,3.98471


In [747]:
rated_df = rated_subset[['UserID','ISBN','Rating']]

In [749]:
all_ratings_df = pd.concat([predicted_ratings_df,rated_df])

In [751]:
all_ratings_df.to_csv('../data/big_data_temp/predicted_ratings.csv')