In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


In [26]:
# import the dataset

books = pd.read_csv("Books.csv")
users = pd.read_csv("Users.csv")
ratings = pd.read_csv("Ratings.csv")

In [27]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [28]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [29]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [30]:
print(books.shape)
print(ratings.shape)
print(users.shape)

(271360, 8)
(1149780, 3)
(278858, 3)


In [31]:
books.isnull().sum()


ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [32]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [33]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [34]:
print(books.duplicated().sum())
print(users.duplicated().sum())
print(ratings.duplicated().sum())

0
0
0


In [35]:
ratings_with_book_titles = ratings.merge(books,on='ISBN')

In [36]:
ratings_with_book_titles.drop(columns=["ISBN","Image-URL-S","Image-URL-M"],axis=1,inplace=True)

In [37]:
complete_df = ratings_with_book_titles.merge(users.drop("Age", axis=1), on="User-ID")
complete_df.drop(columns=["Image-URL-L"], axis = 1, inplace = True)
complete_df.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa"
1,276726,5,Rites of Passage,Judith Rae,2001,Heinle,"seattle, washington, usa"
2,276727,0,The Notebook,Nicholas Sparks,1996,Warner Books,"h, new south wales, australia"
3,276729,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,"rijeka, n/a, croatia"
4,276729,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,"rijeka, n/a, croatia"


In [38]:
complete_df['Location'] = complete_df['Location'].str.split(',').str[-1].str.strip()

In [39]:
complete_df.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,usa
1,276726,5,Rites of Passage,Judith Rae,2001,Heinle,usa
2,276727,0,The Notebook,Nicholas Sparks,1996,Warner Books,australia
3,276729,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,croatia
4,276729,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,croatia


In [40]:
# Select user IDs with more than 200 book ratings
min_ratings_threshold = 300

# Count book ratings per user
num_ratings_per_user = complete_df.groupby('User-ID')['Book-Rating'].count()

# Filter users with more than the minimum threshold
knowledgeable_user_ids = num_ratings_per_user[num_ratings_per_user > min_ratings_threshold].index

In [41]:
# Filter ratings from knowledgeable users
knowledgeable_user_ratings = complete_df[complete_df['User-ID'].isin(knowledgeable_user_ids)]

In [42]:
min_ratings_count_threshold=50
rating_counts= knowledgeable_user_ratings.groupby('Book-Title').count()['Book-Rating']
popular_books = rating_counts[rating_counts >= min_ratings_count_threshold].index

In [43]:
final_ratings =  knowledgeable_user_ratings[knowledgeable_user_ratings['Book-Title'].isin(popular_books)]

In [44]:
pt = final_ratings.pivot_table(index='Book-Title',columns='User-ID',values='Book-Rating')
pt

User-ID,2276,3363,4385,6251,6543,6575,7158,7346,8681,8936,...,270713,271284,273979,274004,274061,274301,274308,275970,277427,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1st to Die: A Novel,,,,,9.0,,0.0,,,0.0,...,,,,,,,,,,
2nd Chance,10.0,,,,0.0,,,,,,...,,,,,,,0.0,,,
4 Blondes,,,,0.0,,,,,,,...,,,,,,,,,,
A Bend in the Road,,,,,,1.0,,,,,...,,,0.0,,,,,,,
A Case of Need,,,,0.0,,,,,,0.0,...,,0.0,,,0.0,,7.0,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Without Remorse,,,,0.0,,,,,,,...,,0.0,,,,,,,,
Wuthering Heights,,,,,,,,,,,...,,,,,,,0.0,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,0.0,,0.0,,,,,,,...,,,,,,,,0.0,,
Zoya,,,,,,,,,,,...,,0.0,0.0,,,,,,,


In [45]:
pt.fillna(0,inplace=True)
pt

User-ID,2276,3363,4385,6251,6543,6575,7158,7346,8681,8936,...,270713,271284,273979,274004,274061,274301,274308,275970,277427,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1st to Die: A Novel,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Case of Need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Without Remorse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wuthering Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
def matrix_factorization_sgd(R, K, steps=5000, alpha=0.0002, lam=0.01):
    """
    Perform matrix factorization to predict empty entries in a matrix.
    R: user-item rating matrix
    K: number of latent dimensions
    steps: number of iterations to perform stochastic gradient descent
    alpha: learning rate
    lam: regularization parameter
    """
    N, M = R.shape
    U = np.random.rand(N, K)
    V = np.random.rand(M, K)

    for step in range(steps):
        for i in range(N):
            for j in range(M):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(U[i,:], V[j,:].T)
                    for k in range(K):
                        U[i][k] += alpha * (2 * eij * V[j][k] - 2 * lam * U[i][k])
                        V[j][k] += alpha * (2 * eij * U[i][k] - 2 * lam * V[j][k])

        # Compute total loss
        loss = 0
        for i in range(N):
            for j in range(M):
                if R[i][j] > 0:
                    loss += (R[i][j] - np.dot(U[i,:], V[j,:].T))**2
        loss += lam * (np.linalg.norm(U)**2 + np.linalg.norm(V)**2)
        if step % 1000 == 0:
            print("Step:", step, " Loss:", loss)

    return U, V

K = 2  # Number of latent dimensions
U, V = matrix_factorization_sgd(pt.values, K)
print("User Latent Factors (U):")
print(U)
print("Item Latent Factors (V):")
print(V)

print(U.shape, V.shape)

Step: 0  Loss: 445379.8430980849
Step: 1000  Loss: 15527.859018858875
Step: 2000  Loss: 14788.447898794355
Step: 3000  Loss: 14544.549386515508
Step: 4000  Loss: 14421.708609482675
User Latent Factors (U):
[[ 1.43188278  2.29868827]
 [ 1.54228285  2.3249422 ]
 [-0.14269763  2.81320488]
 [ 2.24111605  1.2912349 ]
 [ 0.13768315  3.28821004]
 [-0.74362803  4.72774007]
 [ 2.6311997   1.35256723]
 [ 2.08623883  1.64444181]
 [ 2.637581    1.30592332]
 [ 1.9384601   2.04689921]
 [ 2.07674415  1.48992908]
 [ 1.71827302  2.86520736]
 [ 2.28643357  1.7536296 ]
 [ 2.69462768  1.29927474]
 [ 1.27763767  2.68302657]
 [ 1.96034955  1.95391017]
 [ 1.71503173  2.12693199]
 [ 2.02708989  2.55520132]
 [ 1.34984589  3.179215  ]
 [ 3.51961457  0.81058221]
 [ 2.68685692  1.11785456]
 [ 2.21132552  1.17419543]
 [ 0.79751511  2.93428243]
 [ 1.5966628   2.51451929]
 [ 2.77736842  1.21747946]
 [ 1.88937382  2.14734211]
 [ 1.99292948  2.20993178]
 [ 2.23874242  1.87297223]
 [ 1.28662641  2.86089348]
 [ 2.062657

In [47]:
def generate_recommendations(U, V, user_id, books_df, rating_matrix, top_n=5):
    """
    Generate recommendations for a given user.
    U: User latent factors matrix
    V: Item latent factors matrix
    user_id: ID of the user for whom recommendations are generated
    books_df: DataFrame containing book information
    rating_matrix: User-item rating matrix
    top_n: Number of recommendations to generate
    """

    user_index = rating_matrix.columns.get_loc(user_id)
    
    # Get the latent factors for the user
    user_latent_factors = U[user_index]

    # Compute predicted ratings for the user
    predicted_ratings = np.dot(user_latent_factors, V.T)

    # Get the indices of top-N items with the highest predicted ratings
    top_indices = np.argsort(predicted_ratings)[::-1][:top_n]

    # Filter out items that the user has already rated
    rated_books = rating_matrix.iloc[:, user_index][rating_matrix.iloc[:, user_index].notnull()].index
    top_indices = [idx for idx in top_indices if idx < len(rating_matrix.index) and idx not in rated_books]

    # Get the corresponding book IDs
    recommended_books = rating_matrix.index[top_indices]

    return recommended_books.values

userList = [6575, 7346, 11601, 11676, 12538,	13552,	15408]

for user_id in userList:
    top_n = 3  # Number of recommendations to generate
    
    # Generate recommendations for the user
    recommendations = generate_recommendations(U, V, user_id, books, pt, top_n)
    print("Top", top_n, "Recommendations for User", user_id, ":")
    print(recommendations)

Top 3 Recommendations for User 6575 :
['Heartbreaker' 'The Angel of Darkness' 'Stillwatch']
Top 3 Recommendations for User 7346 :
['Dances With Wolves' '4 Blondes' 'The Bourne Supremacy']
Top 3 Recommendations for User 11601 :
['Heartbreaker' 'The Bourne Supremacy']
Top 3 Recommendations for User 11676 :
['Dances With Wolves' '4 Blondes' 'The Bourne Supremacy']
Top 3 Recommendations for User 12538 :
['Heartbreaker' 'The Bourne Supremacy']
Top 3 Recommendations for User 13552 :
['Heartbreaker' 'The Bourne Supremacy' 'Dances With Wolves']
Top 3 Recommendations for User 15408 :
['Five Days in Paris' 'The Stone Diaries' 'The Glass Lake']


In [48]:
from sklearn.metrics.pairwise import cosine_similarity 


In [49]:
similarity_score = cosine_similarity(pt)

In [50]:
def recommend(book_name):
    index = np.where(pt.index==book_name)[0][0]
    similar_books = sorted(list(enumerate(similarity_score[index])),key=lambda x:x[1], reverse=True)[1:6]
    
    data = []
    
    for i in similar_books:
        item = []
        temp_df = books[books['Book-Title'] == pt.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M'].values))
        
        data.append(item)
    return data

In [51]:
recommend("Rebecca")

[['The Edge',
  'Catherine Coulter',
  'http://images.amazon.com/images/P/0515128600.01.MZZZZZZZ.jpg'],
 ['Fried Green Tomatoes at the Whistle Stop Cafe',
  'Fannie Flagg',
  'http://images.amazon.com/images/P/0070212570.01.MZZZZZZZ.jpg'],
 ['Voyager',
  'DIANA GABALDON',
  'http://images.amazon.com/images/P/0440217563.01.MZZZZZZZ.jpg'],
 ['To Kill a Mockingbird',
  'Harper Lee',
  'http://images.amazon.com/images/P/0446310786.01.MZZZZZZZ.jpg'],
 ['A Prayer for Owen Meany',
  'John Irving',
  'http://images.amazon.com/images/P/0345361792.01.MZZZZZZZ.jpg']]

In [52]:
recommend("Call of the Wild")

[['Invasion',
  'Robin Cook',
  'http://images.amazon.com/images/P/0425155404.01.MZZZZZZZ.jpg'],
 ['The Shipping News : A Novel',
  'Annie Proulx',
  'http://images.amazon.com/images/P/0743225406.01.MZZZZZZZ.jpg'],
 ['The Girl Who Loved Tom Gordon',
  'Stephen King',
  'http://images.amazon.com/images/P/0671042858.01.MZZZZZZZ.jpg'],
 ['Daisy Fay and the Miracle Man',
  'Fannie Flagg',
  'http://images.amazon.com/images/P/0446394521.01.MZZZZZZZ.jpg'],
 ['The Hours: A Novel',
  'Michael Cunningham',
  'http://images.amazon.com/images/P/0312305060.01.MZZZZZZZ.jpg']]