<a href="https://colab.research.google.com/github/flo-shi/Recommender-System/blob/master/recommenda_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats # For statistical functions
from sklearn.model_selection import train_test_split # Splitting train and test data
from sklearn.metrics.pairwise import cosine_similarity # For user to user similarities
import warnings

Importing datasets

In [4]:
# Folder that holds Books.csv, Ratings.csv and Users.csv.
# Change this ONE value to match your environment (for example a folder under a
# mounted Drive on Colab) instead of editing every read_csv call below.
DATA_DIR = r'C:\Users\pjr\OneDrive\Documents\data science\Recommender system\archive (1)'

df1 = pd.read_csv(f'{DATA_DIR}/Books.csv')    # df1: Books
df2 = pd.read_csv(f'{DATA_DIR}/Ratings.csv')  # df2: Ratings
df3 = pd.read_csv(f'{DATA_DIR}/Users.csv')    # df3: Users

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\pjr\\OneDrive\\Documents\\data science\\Recommender system\\archive (1)\\Books.csv'

In [3]:
# Mount Google Drive at /content/drive.
# NOTE(review): this import sits outside the top import cell and presumably only
# resolves when the notebook runs on Google Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Understanding this dataset

In [None]:
# df1 Books
print(df1.shape)  # (number of rows, number of columns)
df1.head()  # first 5 rows of the Books frame

In [None]:
# Last rows of the Books frame, to compare against the head shown above
df1.tail()

In [None]:
# Drop the three cover-image URL columns; no later cell in this notebook reads them.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the cell
# safe to re-run: a second run no longer raises KeyError for the missing columns.
df1 = df1.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], errors='ignore')

In [None]:
# df2 Ratings: size of the frame, then its first rows
print(df2.shape)
df2.head()

In [None]:
# df3 Users: size of the frame, then its first rows
print(df3.shape)
df3.head()

In [None]:
# Summary statistics for the Books frame
df1.describe()

In [None]:
# Summary statistics for the Users frame
df3.describe()

Data Cleaning

In [None]:
# Count fully duplicated rows in Books. A single number answers "are there
# duplicates?"; the bare duplicated() call only displayed a per-row boolean Series.
df1.duplicated().sum()

In [None]:
# Count fully duplicated rows in Ratings (a per-row boolean Series is not readable)
df2.duplicated().sum()


In [None]:
# Count fully duplicated rows in Users (a per-row boolean Series is not readable)
df3.duplicated().sum()

In [None]:
# Number of distinct values per Books column, largest first
df1.nunique().sort_values(ascending=False)

In [None]:
# Convert the publication year to a numeric dtype; errors='coerce' turns any value
# that cannot be parsed as a number into NaN (those rows are dropped further down).
df1['Year-Of-Publication'] = pd.to_numeric(df1['Year-Of-Publication'], errors='coerce')

In [None]:
# Null values of df1 (Books): count of missing entries per column, smallest first
df1.isnull().sum().sort_values(ascending=True)

In [None]:
# Remove Books rows where the author, publisher or publication year is missing
df1 = df1.dropna(subset=['Book-Author','Publisher','Year-Of-Publication'])

In [None]:
# Column dtypes and non-null counts of Books after the clean-up above
df1.info()

In [None]:
# Missing values per Ratings column, smallest first
df2.isnull().sum().sort_values(ascending=True)

In [None]:
# Missing values per Users column, smallest first
df3.isnull().sum().sort_values(ascending=True)

In [None]:
# Dealing with null values of df3 (user's age).
# The previous `df3['Age'].dropna(inplace=True)` only modified a temporary Series and
# left df3 untouched; drop the rows on the frame itself so users without an age
# are really removed.
df3 = df3.dropna(subset=['Age'])

Dealing with outliers

In [None]:
# df3: box plot of user ages, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('boxplot for users ')
sns.boxplot(data=df3, x='Age', ax=ax)
plt.show()

In [None]:
# Histogram of user ages on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(x='Age', data=df3)
plt.show()

In [None]:
def find_boundaries(data,variable):
    """Return the (lower, upper) outlier fences of ``data[variable]``.

    The fences follow the 1.5 * IQR rule: 1.5 inter-quartile ranges below the
    25% quantile and 1.5 inter-quartile ranges above the 75% quantile.
    """
    lower_quartile = data[variable].quantile(.25)
    upper_quartile = data[variable].quantile(.75)
    spread = upper_quartile - lower_quartile
    return lower_quartile - 1.5 * spread, upper_quartile + 1.5 * spread

# IQR-based lower / upper fences for the users' Age column
l_column , u_column = find_boundaries(df3, 'Age')
print("Lower boundary is: ", l_column)
print("Upper boundary is: ", u_column)


In [None]:
# Flag ages above 74 or below 10 as outliers; a missing age compares False on both
# tests, so such rows are not flagged.
remove = ((df3['Age'] > 74) | (df3['Age'] < 10)).to_numpy()

# Keep every row that was not flagged
df3 = df3.loc[~remove]


In [None]:
# Age distribution after outlier removal: 40-bin histogram with a KDE curve overlaid
plt.figure(figsize=(8,5))
sns.set_theme(style='darkgrid',palette="rocket")
ax =  sns.histplot(data=df3, x= 'Age', kde=True, bins=40)

In [None]:
# Box plot of Age after outlier removal. The column is passed through explicit
# data=/x= keywords (same form as the earlier Age box plot) because a bare
# positional Series is interpreted differently across seaborn versions.
sns.boxplot(data=df3, x='Age')

In [None]:
# df1: box plot of the publication year. Explicit data=/x= keywords are used
# (same form as the Age box plot) because a bare positional Series is
# interpreted differently across seaborn versions.
plt.figure(figsize=(10,8))
plt.title('boxplot for Year-Of-Publication ')
sns.boxplot(data=df1, x='Year-Of-Publication')
plt.show()

In [None]:
# Distribution of publication years
# (pandas-only alternative: df1['Year-Of-Publication'].plot(kind='hist'))
sns.histplot(df1['Year-Of-Publication'])

In [None]:
# IQR-based lower / upper fences for the publication year
lb_column,ub_column = find_boundaries(df1, 'Year-Of-Publication')
print("Lower boundary is: ", lb_column)
print("Upper boundary is: ", ub_column)

In [None]:
# Flag publication years before 1972 or after 2016 as outliers and keep the rest
remove_year = (
    (df1['Year-Of-Publication'] < 1972) | (df1['Year-Of-Publication'] > 2016)
).to_numpy()
df1 = df1.loc[~remove_year]

# Histogram for year of publication
plt.figure(figsize=(8,5))
sns.set_theme(style='darkgrid')
ax = sns.histplot(data=df1, x = 'Year-Of-Publication', kde=True)

Dealing with categorical data

In [None]:
# Construct new attribute: the regex captures the text that follows the LAST comma
# of Location (after optional whitespace) and stores it as Country.
# Locations without a comma produce a missing Country.
df3['Country'] = df3['Location'].str.extract(r',\s*([^,]+)$')

In [None]:
# Check the newly added Country column
df3.head()

In [None]:
# Location has been reduced to Country above, so the raw column is dropped.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable and avoids mutating a filtered frame in place.
df3 = df3.drop(columns=['Location'], errors='ignore')

In [None]:
# Number of users per country, most frequent first
country_counts = df3['Country'].value_counts()
print(country_counts)

In [None]:
# Pie chart of users per country, with the percentage printed on every wedge.
# NOTE(review): one wedge and one label is drawn per distinct Country value, so the
# chart may be unreadable if there are many countries — consider plotting only the top few.
plt.figure(figsize=(12,16))
plt.pie(country_counts, labels=country_counts.index, autopct='%1.1f%%')

Merging the different dataframes

In [None]:
# Inner join of Books and Ratings on ISBN: only ISBNs present in both frames survive
df_merge = pd.merge(df1,df2, on='ISBN', how='inner')

In [None]:
# Inner join with Users on User-ID: ratings by users missing from df3 are dropped
final_df = pd.merge(df_merge, df3, on='User-ID', how='inner')

In [None]:
# Size and first rows of the merged frame
print(final_df.shape)
final_df.head()

In [None]:
# Identifying unique values: row count, distinct books, distinct users and the
# sorted list of distinct rating values in the merged frame
print("The total dataset has ", final_df.shape[0]," values.")
print("The dataset has ", final_df['ISBN'].nunique(), " books with unique values")
print("The dataset has ", final_df['User-ID'].nunique(), " users with unique values")
print("The dataset has ", sorted(final_df['Book-Rating'].unique()), " ratings.")



In [None]:
# Distribution of the rating values
final_df['Book-Rating'].hist()

In [None]:
# Missing values per column of the merged frame, smallest first
final_df.isnull().sum().sort_values(ascending=True)

In [None]:
# Discard merged rows that lack an Age or a Country, then report the new size
final_df = final_df.dropna(subset=['Age', 'Country'])
print(final_df.shape)

In [None]:
# Number of rating rows per author, most frequent first
author_counts = final_df['Book-Author'].value_counts()
print(author_counts)

Transforming categorical data and splitting train and test data

In [None]:
# Divide data into training and validation subsets (80% / 20%, fixed seed).
# The feature columns form x; the ISBN column is passed as y so that it is split
# row-for-row with the features and can be re-attached to the training rows later.
x_train, x_test, y_train, y_test = train_test_split(final_df[['Book-Title','User-ID','Book-Author','Publisher','Year-Of-Publication','Book-Rating']]
                                                    ,final_df['ISBN'], train_size=0.8, test_size=0.2, random_state=0)


In [None]:
# Categorical columns that get frequency-encoded below
cat_variable = ['Book-Author','Publisher']

In [None]:
# Frequency encoding: for each categorical column, map every category to the
# share of training rows in which it appears
encoder_dict = {
    col: x_train[col].value_counts().div(len(x_train)).to_dict()
    for col in cat_variable
}


encoder_dict

In [None]:
# Replace each categorical value with its training-set frequency.
# assign() builds a new frame instead of writing columns into the frame returned
# by train_test_split, which avoids pandas' SettingWithCopyWarning.
# NOTE: run this cell once per split — mapping already-encoded values yields NaN.
x_train = x_train.assign(
    **{var: x_train[var].map(encoder_dict[var]) for var in cat_variable}
)


In [None]:
# Size and first rows of the encoded training features
print(x_train.shape)
x_train.head()

In [None]:

# Re-attach the ISBN column (y_train) to the training features, aligned on the row index
train_df = pd.concat([y_train, x_train], axis=1)
train_df.head()

In [None]:
# Identifying unique values in the train df: row count, distinct books,
# distinct users and the sorted list of distinct rating values
print("The total dataset has ", train_df.shape[0]," values.")
print("The dataset has ", train_df['ISBN'].nunique(), " books with unique values")
print("The dataset has ", train_df['User-ID'].nunique(), " users with unique values")
print("The dataset has ", sorted(train_df['Book-Rating'].unique()), " ratings.")



In [None]:
# Group the training rows by book (ISBN) and compute, for each book,
# the number of ratings and the mean rating
agg_rating = train_df.groupby('ISBN').agg(rating_count = ('Book-Rating','count'),
                                          rating_mean = ('Book-Rating', 'mean')).reset_index()
print(agg_rating)
agg_rating.info()

In [None]:
# Keep only books with at least MIN_RATING_COUNT ratings in the training data.
# (The earlier comment said "50+", but the filter has always used 120; the
# threshold now has a name so comment and code cannot drift apart.)
MIN_RATING_COUNT = 120
popular_books = agg_rating[agg_rating['rating_count'] >= MIN_RATING_COUNT]
print(len(popular_books))

In [None]:
# The five most-rated popular books
popular_books.sort_values(by='rating_count', ascending=False).head()


In [None]:
# The five least-rated books that still pass the popularity filter
popular_books.sort_values(by='rating_count', ascending=False).tail()

In [None]:
# Checking the relationship between rating count and mean rating (scatter plot).
sns.relplot(x='rating_mean', y='rating_count', data=popular_books)

In [None]:
# Same relationship as a joint plot, which also shows each variable's own distribution
sns.jointplot(x='rating_mean', y='rating_count', data=popular_books) #Better approach

In [None]:
# Merge the popular books with the train dataframe; the inner join on ISBN keeps
# only the training rows that belong to a popular book
final_train_df = pd.merge(train_df,popular_books, on='ISBN', how='inner')
final_train_df.head()

In [None]:
# Last rows of the filtered training frame
final_train_df.tail()

In [None]:
# Size of the filtered training frame and number of distinct users left in it
print(final_train_df.shape)
print(final_train_df['User-ID'].nunique())

User item Matrix

In [None]:
# User-item matrix. Rows = users, columns = book titles, values = ratings;
# user/book pairs without a rating are NaN.
# NOTE(review): popularity was filtered per ISBN but the columns here are Book-Title,
# so several ISBNs sharing one title presumably collapse into one column — confirm.
matrix = final_train_df.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating')
matrix.head(10)

In [None]:
# Mean-centre the ratings: subtract each user's own average rating from that user's row
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis= 'rows')
print(matrix_norm.shape)
matrix_norm.head()

In [None]:
# Last rows of the mean-centred matrix
matrix_norm.tail()

In [None]:
# Pearson correlation similarity between users: after the transpose users are the
# columns, so corr() yields a user x user matrix
user_similarity = matrix_norm.T.corr()
user_similarity.head()

In [None]:
# Cosine similarities between users; missing ratings are filled with 0 first.
# NOTE(review): c_user_similarity is not used by any later cell shown here.
c_user_similarity = cosine_similarity(matrix_norm.fillna(0))
c_user_similarity

In [None]:
# Use the user with User-ID 75 as the example for finding similar users
pick_user = 75

# Remove the picked user's own ROW so they cannot be returned as their own
# neighbour (their COLUMN stays: it holds the similarities that are read later).
# Reassigning with errors='ignore' keeps the cell re-runnable; the previous
# in-place drop raised KeyError on a second run.
user_similarity = user_similarity.drop(index=pick_user, errors='ignore')

# Take a look at the data
user_similarity.head()

The similarity matrix ranges from -1 to 1: positive values indicate similar book preferences and negative values indicate opposite book preferences.
n = 10: we pick the top 10 most similar users.
Since we are looking for similar interests, we set the threshold to a positive value, 0.3.
The similarities are sorted from highest to lowest.


In [None]:
# How many neighbours to keep
n = 10

# Minimum similarity a user must exceed to count as a neighbour
similarity_threashold = 0.3

# Similarities to the picked user above the threshold, strongest first, top n only
similar_user = (
    user_similarity
    .loc[user_similarity[pick_user] > similarity_threashold, pick_user]
    .sort_values(ascending=False)
    .head(n)
)

print(f"Similar users for user id {pick_user} are ", similar_user)

Narrowing down the item pool

In [None]:
# Books that the picked user has read: take that user's row and drop every
# book column in which it has no rating
pick_user_read = matrix_norm[matrix_norm.index == pick_user].dropna(axis=1, how='all')
pick_user_read

In [None]:
# Books that the similar users have read: keep the similar users' rows and drop
# every book column that none of them has rated
similar_user_book = matrix_norm[matrix_norm.index.isin(similar_user.index)].dropna(axis=1, how='all')
similar_user_book

In [None]:
# Remove the books the picked user has already read, keeping only books read by
# the similar users. The previous loop called dropna(inplace=True) on a column
# selection, which never removed the column from similar_user_book, so books
# the user already read stayed in the candidate pool.
similar_user_book = similar_user_book.drop(columns=pick_user_read.columns, errors='ignore')

similar_user_book

In [None]:
# Score every candidate book as the average of (neighbour similarity x neighbour's
# mean-centred rating), taken over the similar users who actually rated the book.
item_scores = {}

# Loop through the candidate books (columns)
for i in similar_user_book.columns:

    # Ratings of book i by the similar users (NaN where a user has not rated it)
    book_rating = similar_user_book[i]

    # Running sum of weighted ratings and number of ratings that contributed
    total = 0
    count = 0

    # Loop through the similar users
    for u in similar_user.index:

        # Only users who rated the book contribute. The previous version evaluated
        # this check but discarded the result, so one missing rating turned the
        # whole total into NaN.
        if pd.notna(book_rating[u]):

            # Weight the rating by how similar the user is to the picked user
            total += similar_user[u] * book_rating[u]
            count += 1

    # Store the average once per book, after all users were visited; skip books
    # that received no rating so there is no division by zero.
    if count > 0:
        item_scores[i] = total / count

# Convert dictionary to pandas df
item_scores = pd.DataFrame(item_scores.items(), columns=['Book','Book-Score'])

# Rank in descending order
Ranked_scores = item_scores.sort_values(by='Book-Score', ascending=False)

# Select top ten rated books
m = 10
Ranked_scores.head(m)
