In [4]:
import re
import csv
import time
import nltk
import json
import string
import numpy as np
import pandas as pd
from nltk import stem
import sklearn.metrics
from random import randint
from numpy.linalg import norm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

with open('../new_userDict.json', 'r') as fp:
    
    new_userDict = json.load(fp)
    
with open('../new_isbnDict.json', 'r') as fp:
    
    new_isbnDict = json.load(fp)
    
with open('../dict_row.json', 'r') as fp:
    
    dict_row = json.load(fp)
    
with open('../dict_col.json', 'r') as fp:
    
    dict_col = json.load(fp)
    
        
print("Ok")

Ok


In [5]:
def booksRatedUser(new_isbnDict, new_userDict, user_number, score):
    
    '''
    
    input:  new_userDict (Dict), user_number (int), score (int)
    
    action: select all books well rated by my user
    
    output: books_rated (list)
    
    '''
    
    books_rated = []
    
    for book in new_userDict[str(user_number)]:
        
        if int(new_userDict[str(user_number)][book]) > score:
            
            try:
                
                new_isbnDict[book]
                books_rated.append(book)
                
            except:
                
                continue
            
    return list(set(books_rated))


def SimilarityBooks(utility_DataFrame, book_number, books_similar):
    
    '''
    
    input: utility_DataFrame (DataFrame), book_number (int), books_similar (List)
    
    action: compute cosine similarity between book_number and all the books in books_similar
    
    output: new_similarity (List of tuples)
    
    '''

    x = utility_DataFrame[str(book_number)]
    x_length = norm(x)
    
    y = utility_DataFrame[books_similar]
    y_length = norm(utility_DataFrame[books_similar],axis=0)

    
    num = (y.T.values*x.values).sum(axis=1)
    den = x_length*y_length

    similarity = num/den
    similarity = np.nan_to_num(similarity)
    
    d = list(zip(list(books_similar),similarity))
    new_similarity = sorted(d, key=lambda tup: tup[1], reverse=True)
    
    return new_similarity

def itemItemsRecommendation(new_similarity, user_number, book_number, k, new_userDict):
    
    
    '''
    
    input:  new_similarity(List of tuples), new_isbnDict(Dict), k(int)
    
    action: compute mean similarity of the first k items
    
    output: recommendation (float)
    
    '''
    
    
    if len(new_similarity) < k:
        
        recommendation = np.mean([u[1] for u in new_similarity])
    
    else:
        
        recommendation = np.mean([u[1] for u in new_similarity[:k]])
        
    
    return recommendation


def itemItemsScore(new_userDict, new_similarity, k, user_number):
    
    score = [int(new_userDict[str(user_number)][u[0]]) for u in new_similarity[:k]]
    
    if score == []:
        
        return None
    
    return np.mean(score)


def CollaborativeFilteringItemItemsRMSE(utility_DataFrame, new_userDict, new_isbnDict, user_number, score_min, book_number, k):
    
    books_rated_user = booksRatedUser(new_isbnDict, new_userDict, user_number,score_min)
    
    if books_rated_user == []:
        
        return None
    
    new_similarity = SimilarityBooks(utility_DataFrame, book_number, books_rated_user)
    
    if new_similarity == []:

        return None
    
    score = itemItemsScore(new_userDict, new_similarity[1:], k, user_number)
    
    return score

In [6]:
def createSampleDict(new_isbnDict, new_userDict, t1, t2):
    

    small_isbnDict = {}

    for book in new_isbnDict:

        temp = new_isbnDict[book]
        i = 0

        for t in temp.values():
            if t != "0":
                i = i+1

        if i >t1:

            small_isbnDict[book] = new_isbnDict[book]   

    small_userDict = {}

    for user in new_userDict:

        temp = new_userDict[user]
        i = 0

        for t in temp.values():
            if t != "0":
                i = i+1

        if i >t2:

            small_userDict[user] = new_userDict[user]
            
            
    return small_isbnDict, small_userDict


def computeMatrices(train_userDict,train_isbnDict,small_userDict,small_isbnDict, dict_row, dict_col):

    n = len(small_isbnDict)
    m = len(small_userDict)
    
    index = sorted(small_userDict.keys())
    columns = sorted(small_isbnDict.keys())

    dict_row = {k:v for v,k in enumerate(index)}
    dict_col = {k:v for v,k in enumerate(columns)}

    u = np.zeros((m,n)) 
    R = np.zeros((m,n))
    for user in train_userDict:
        for isbn in train_userDict[user]:
            try:
                u[dict_row[user]][dict_col[isbn]] = train_userDict[user][isbn]
                R[dict_row[user]][dict_col[isbn]] = 1
            except:
                continue

    for isbn in train_isbnDict:
        for user in train_isbnDict[isbn]:
            try:
                u[dict_row[user]][dict_col[isbn]] = train_isbnDict[isbn][user]
                R[dict_row[user]][dict_col[isbn]] = 1
            except:
                continue
                
    small_utility_DataFrame = pd.DataFrame(u, index = index, columns = columns)
    R = pd.DataFrame(R, index = index, columns = columns)

    return u, R, small_utility_DataFrame


def selectSample(i,b,list_user,small_userDict, small_isbnDict):

    test_index = [k for k in range((i*b),(i+1)*b)]
    train_index = [k for k in range(len(list_user)) if k not in test_index]

    train = list_user[train_index]
    test  = list_user[test_index]

    train_userDict = {}
    train_isbnDict = {isbn:{} for isbn in small_isbnDict}

    for user in train:

        train_userDict[user] = small_userDict[user]

    for isbn in small_isbnDict:
        for user in small_isbnDict[isbn]:

            try:
                train_userDict[user]
                train_isbnDict[isbn][user] = small_isbnDict[isbn][user]
            except:
                continue
                
                
    test_userDict = {}
    test_isbnDict = {isbn:{} for isbn in small_isbnDict}

    for user in test:

        test_userDict[user] = small_userDict[user]

    for isbn in small_isbnDict:
        for user in small_isbnDict[isbn]:

            try:
                test_userDict[user]
                test_isbnDict[isbn][user] = small_isbnDict[isbn][user]
            except:
                continue
                
    return train_userDict, train_isbnDict, test_userDict, test_isbnDict, test


#indices = [i for i in range(len(list_user))]
#indices = np.array(indices)
#k = 5
#b = len(small_userDict)//k


def selectSampleRandom(indices,b,list_user,small_userDict, small_isbnDict):
    
    test_index = np.random.choice(indices, size = b, replace=False)
    
    indices = np.delete(indices,test_index)
    
    train_index = [k for k in range(len(list_user)) if k not in test_index]
    

    train = list_user[train_index]
    test  = list_user[test_index]

    train_userDict = {}
    train_isbnDict = {isbn:{} for isbn in small_isbnDict}

    for user in train:

        train_userDict[user] = small_userDict[user]

    for isbn in small_isbnDict:
        for user in small_isbnDict[isbn]:

            try:
                train_userDict[user]
                train_isbnDict[isbn][user] = small_isbnDict[isbn][user]
            except:
                continue
                
    return train_userDict, train_isbnDict, test_userDict, test_isbnDict, test, indices

In [10]:
def mainItemItemsRMSE(utility_DataFrame,R, new_userDict, new_isbnDict, score_min=0, k=3):
    

    rmse_dict = {j:{} for j in new_userDict.keys()}

    for user in new_userDict:

        for book in new_userDict[user]:

            if new_userDict[user][book] != "0":

                try:
                    new_isbnDict[book]
                    rmse_dict[user][book] = new_userDict[user][book]
                except:

                    continue

    d = rmse_dict.keys()
    v = []

    for key in d:

        if rmse_dict[key] == {}:

            v.append(key)

    for i in v:

        del rmse_dict[i]

    rmse_vector_itemItems = []
    
    i = 0
    for user in rmse_dict:
        
        i = i+1
        if i%1000 == 0:
            
            print("Pause")
            time.sleep(30)

        for isbn in rmse_dict[user]:
            
            prediction_score = None
            user_number = user
            book_number = isbn
            
            try:
                prediction_score = CollaborativeFilteringItemItemsRMSE(utility_DataFrame, new_userDict, new_isbnDict, 
                                                                   user_number, score_min, book_number, k)
                true_score = utility_DataFrame[book_number][user_number]
                
                if prediction_score == None:
                    
                    term1 = utility_DataFrame[book_number]
                    r1 = R[book_number]
                    term1 = term1[r1 ==1]
                    
                    term2 = utility_DataFrame.loc[user_number]
                    r2 = R.loc[user_number]
                    term2 = term2[r2==1]
                    
                    prediction_score = (np.mean(term1) + np.mean(term2))/2
            
            except:
                continue

            if prediction_score != None:

                rmse_vector_itemItems.append(tuple([user_number, book_number, prediction_score, true_score]))


    rmse = np.sqrt((np.array([u[2]-u[3] for u in rmse_vector_itemItems])**2).sum()/len(rmse_vector_itemItems))
    
    return rmse, rmse_vector_itemItems

In [22]:
u, R, small_utility_DataFrame = computeMatrices(small_userDict,small_isbnDict,small_userDict,small_isbnDict,dict_row,dict_col)

rmse_test, rmse_vector_itemItems = mainItemItemsRMSE(small_utility_DataFrame,R,small_userDict,small_isbnDict,score_min=0,k=3)

Pause
Pause
Pause


In [24]:
rmse_vector_itemItems[:100]

[('183985', '0374199698', 5.840909090909091, 9.0),
 ('188513', '0312421702', 4.0, 9.0),
 ('188513', '034545104X', 9.0, 4.0),
 ('49061', '0385491026', 5.5, 8.0),
 ('167038', '0451210743', 2.6666666666666665, 8.0),
 ('226205', '0812513738', 7.333333333333333, 9.0),
 ('226205', '0399151648', 7.333333333333333, 6.0),
 ('226205', '0553264303', 7.0, 8.0),
 ('226205', '0671027646', 7.333333333333333, 8.0),
 ('226205', '0743411269', 7.333333333333333, 8.0),
 ('226205', '0399145885', 7.333333333333333, 8.0),
 ('226205', '0446343129', 7.333333333333333, 7.0),
 ('226205', '0425191729', 7.333333333333333, 7.0),
 ('226205', '0425177068', 7.333333333333333, 7.0),
 ('83587', '0440993717', 8.0, 7.0),
 ('83587', '0553211404', 7.0, 8.0),
 ('175822', '2253057843', 4.875, 6.0),
 ('33345', '0312870582', 2.6666666666666665, 4.0),
 ('77724', '0425158640', 6.923076923076923, 10.0),
 ('57109', '0816707987', 6.0, 8.0),
 ('170255', '0452284449', 5.0, 7.0),
 ('170255', '0156027321', 7.0, 5.0),
 ('31252', '0380813

# K-fold cross validation for collaborative filtering item-based

In [8]:
def itemItems_KfoldsCV(small_userDict,small_isbnDict, dict_row, dict_col, kk):

    RMSE = []
    #kk = 5
    list_user = np.array(sorted(small_userDict.keys()))
    b = len(small_userDict)//kk

    for iterator in range(kk):

        train_userDict, train_isbnDict, test_userDict, test_isbnDict, test =  selectSample(iterator,b,list_user,
                                                                                           small_userDict, small_isbnDict)
# use only train_userDict to create the utility matrix
        u, R, small_utility_DataFrame = computeMatrices(train_userDict,small_isbnDict,small_userDict,small_isbnDict, dict_row, dict_col)
# search users only in test_userDict
        rmse_test, rmse_vector_itemItems = mainItemItemsRMSE(small_utility_DataFrame,R, test_userDict, test_isbnDict, score_min=0, k=3)
        print("Ok")

        RMSE.append(rmse_test)
        
    return RMSE

In [19]:
small_userDict = {k:new_userDict[k] for k in list(new_userDict.keys())[:5000]}
small_isbnDict = {k:new_isbnDict[k] for k in list(new_isbnDict.keys())[:5000]}

In [25]:
#small_isbnDict, small_userDict = createSampleDict(new_isbnDict, new_userDict, 20, 20)

In [20]:
RMSE = itemItems_KfoldsCV(small_userDict,small_isbnDict, dict_row, dict_col, 5)

Ok
Ok
Ok
Ok
Ok


In [21]:
print("rmse collaborative filtering item-based: ",np.mean(RMSE))

rmse collaborative filtering item-based:  2.04692489866
