In [1]:
# To run all the csv files in one call
import glob

# Data Cleaninig and Manipulation
import pandas as pd
import numpy as np 

# To ignore warnings 
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Import all book metadata
path = r'./Books_csv' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df_book = pd.concat(li, axis=0, ignore_index=True)

In [3]:
# Import all user ratign metadata

path = r'./Rating_csv' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df_rating = pd.concat(li, axis=0, ignore_index=True)

In [4]:
# Make our dataset smaller in memory Thanks to : https://www.kaggle.com/aantonova/some-new-risk-and-clusters-feature
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df



### Cleaning book dataframe

In [5]:
# To Take quick overview of the dataframe

def overview_df(dataframe):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    
    print('##################### Number of Observations #####################')
    print(dataframe.shape[0])
    print('##################### Number of Variables #####################')
    print(dataframe.shape[1])
    print("##################### Number of Categoric Variables#####################")
    print(len(cat_cols))
    print("##################### Number of Numeric Variables#####################")
    print(len(num_cols))
    print("##################### Types of Variables#####################")
    print(dataframe.dtypes)
    print("##################### NA #####################")
    print(dataframe.isnull().sum())

In [6]:
reduce_mem_usage(df_book)

Mem. usage decreased to 215.28 Mb (27.4% reduction)


Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description,Count of text reviews,PagesNumber
0,700000,A Passion to Preserve: Gay Men as Keepers of C...,Will Fellows,0299196844,3.750000,2005,1,9,University of Wisconsin Press,5:5,...,3:8,2:3,1:0,total:36,6,,298.0,"From large cities to rural communities, gay me...",6.0,
1,700002,Culture Keepers-Florida: Oral History of the A...,Deborah Johnson-Simon,1425935168,0.000000,2006,18,7,Authorhouse,5:0,...,3:0,2:0,1:0,total:0,0,,140.0,,0.0,
2,700003,Holiday Favorites: The Best of the Williams-So...,Allen Rosenberg,0848728009,4.550781,2004,1,4,Oxmoor House,5:7,...,3:1,2:0,1:0,total:11,2,,336.0,This collector's edition features fabulous ful...,2.0,
3,700004,"Soups, Salads & Starters: the Best of Williams...",Allan Rosenberg,0848728068,4.699219,2004,1,4,Oxmoor House,5:8,...,3:1,2:0,1:0,total:10,2,,336.0,,2.0,
4,700005,Breakfasts & Brunches,Time-Life Books,0783503210,3.880859,1997,1,4,Time Life Medical,5:6,...,3:4,2:2,1:0,total:17,0,eng,108.0,"America's most respected cookware retailer, th...",0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1850305,499992,The Frugal Gourmet Cooks Three Ancient Cuisine...,Jeff Smith,0688075894,3.810547,1989,1,1,William Morrow & Company,5:347,...,3:278,2:82,1:36,total:1095,18,,525.0,,,
1850306,499994,Meditation and Its Practice,Swami Rama,0893891533,4.218750,2007,12,2,Himalayan Institute Press,5:127,...,3:35,2:11,1:5,total:245,18,,110.0,,,
1850307,499997,The New York Times Almanac 2006: The Almanac o...,John W. Wright,0143036521,4.000000,2005,17,10,Penguin Books,5:0,...,3:0,2:0,1:0,total:1,0,,1008.0,,,
1850308,499998,The Meditations of the Emperor Marcus Aurelius...,Marcus Aurelius,1406788406,4.230469,2007,1,1,Pomona Press,5:53530,...,3:15170,2:4005,1:1777,total:106489,0,,332.0,,,


In [7]:
reduce_mem_usage(df_rating)

Mem. usage decreased to  6.22 Mb (25.0% reduction)


Unnamed: 0,ID,Name,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing
2,1,Siddhartha,it was amazing
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it
4,1,"Ready Player One (Ready Player One, #1)",really liked it
...,...,...,...
362591,5403,"The MacGregors: Alan & Grant (The MacGregors, ...",it was ok
362592,5403,The MacGregors: Serena & Caine (The MacGregors...,it was ok
362593,5403,Time and Again: Time Was / Times Change,it was ok
362594,5403,"Dance Upon The Air (Three Sisters Island, #1)",really liked it


In [8]:
## Required columns are choosen and new dataframe have been created with these columns
column_list_for_df_books = ['Id','Name','Authors','ISBN','PublishYear']

df_book_new = df_book[column_list_for_df_books]
df_book_new.head(10)

Unnamed: 0,Id,Name,Authors,ISBN,PublishYear
0,700000,A Passion to Preserve: Gay Men as Keepers of C...,Will Fellows,0299196844,2005
1,700002,Culture Keepers-Florida: Oral History of the A...,Deborah Johnson-Simon,1425935168,2006
2,700003,Holiday Favorites: The Best of the Williams-So...,Allen Rosenberg,0848728009,2004
3,700004,"Soups, Salads & Starters: the Best of Williams...",Allan Rosenberg,0848728068,2004
4,700005,Breakfasts & Brunches,Time-Life Books,0783503210,1997
5,700006,Vegetarian (Best of Williams-Sonoma Kitchen Li...,Allan Rosenberg,0848730577,2005
6,700007,Pork and Lamb,Joanne Weir,0783503091,1999
7,700008,Ice Creams and Sorbets,Sarah Tenaglia,0783503105,1999
8,700011,Paradise/Tender Triumph (Omnibus),Judith McNaught,0743428331,2001
9,700015,50 Thrifty Maui Restaurants: Dining on a Budge...,Yvonne Biegel,097426721X,2004


In [9]:
# To check is there any duplicated users in the dataframe
df_book_new.Id.value_counts()

76773      2
85990      2
86020      2
86019      2
86018      2
          ..
3424051    1
3424050    1
3424049    1
3424047    1
499999     1
Name: Id, Length: 1850115, dtype: int64

In [10]:
df_book_new.Id.nunique()

1850115

In [11]:
df_book_new.shape

(1850310, 5)

Number of unique users and the shape of the user dataframe is not equal. So I dropped duplicated rows from the dataframe. 

In [12]:
df_book_new.drop_duplicates(inplace=True)
df_book_new.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


(1850115, 5)

In [13]:
### To taka a quick overview of the new dataframe
overview_df(df_book_new)

##################### Number of Observations #####################
1850115
##################### Number of Variables #####################
5
##################### Number of Categoric Variables#####################
3
##################### Number of Numeric Variables#####################
2
##################### Types of Variables#####################
Id              int32
Name           object
Authors        object
ISBN           object
PublishYear     int32
dtype: object
##################### NA #####################
Id                0
Name              0
Authors           0
ISBN           5922
PublishYear       0
dtype: int64


The dataframe has 5 variables (3 numeric, and 2 categoric), and ISBN column has 5922 missing observations. So these missing observations are dropped from dataframe.

In [14]:
df_book_new.dropna(inplace= True)

In [15]:
df_book_new.reset_index(inplace = True)

In [16]:
df_book_new.shape

(1844193, 6)

ISBN number has 10 digits, and if the book is republished it has 13 digits. Books that do not comply with this rule have been deleted from the dataframe.

In [17]:
missing_isbn_indexes = []

for index,each in enumerate(df_book_new.ISBN):
    if 'X' in each:
        missing_isbn_indexes.append(index)

In [18]:
df_book_new.drop(missing_isbn_indexes,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [19]:
df_book_new.shape

(1685179, 6)

In [20]:
df_book_new.reset_index(inplace=True)

In [21]:
incorrenct_ISBN = []
for index,each in enumerate(df_book_new.ISBN):
    if (len(each) != 10) & (len(each) != 13):
        incorrenct_ISBN.append(index)
incorrenct_ISBN

[39516,
 159902,
 165087,
 175716,
 187754,
 237874,
 243118,
 268590,
 376710,
 379735,
 465182,
 507577,
 524410,
 549333,
 608878,
 622293,
 635581,
 640031,
 642738,
 644605,
 709808,
 726176,
 748963,
 757148,
 758601,
 777356,
 810792,
 828858,
 840268,
 852297,
 879906,
 930984,
 936441,
 973299,
 1094316,
 1106518,
 1115840,
 1122692,
 1123803,
 1164864,
 1175915,
 1181660,
 1218474,
 1253916,
 1277262,
 1325076,
 1336267,
 1359633,
 1431404,
 1433079,
 1460752,
 1515001,
 1680770]

In [22]:
df_book_new.drop(incorrenct_ISBN, inplace = True)

In [23]:
df_book_new.shape

(1685126, 7)

### Cleaning in rating dataframe 

In [24]:
df_rating.columns

Index(['ID', 'Name', 'Rating'], dtype='object')

Rating values are defined as strings in the data set. In order to make calculations with ratings, the rating value must be integer.

In [25]:
df_rating.Rating.unique()

array(['it was amazing', 'really liked it', 'liked it', 'did not like it',
       'it was ok', "This user doesn't have any rating"], dtype=object)

In [26]:
conditions = [(df_rating['Rating'] == 'it was amazing'), (df_rating['Rating'] == 'really liked it'), (df_rating['Rating'] == 'did not like it'),
              (df_rating['Rating'] == 'it was ok') , (df_rating['Rating'] == 'liked it')]
values = [5, 4, 1, 3, 2, ]
df_rating['Rating'] = np.select(conditions, values)

In [27]:
df_rating.Rating.value_counts()

4    132808
2     96047
5     92354
3     28811
1      7811
0      4765
Name: Rating, dtype: int64

#### To check is there duplicated observation in the dataframe

In [28]:
df_rating.shape

(362596, 3)

In [29]:
df_rating.drop_duplicates(inplace=True)
df_rating.shape

(362161, 3)

In [30]:
## Merge Cleaned books dataframe and rating dataframes based on name of the book
df = df_rating.merge(df_book_new, how= 'left', on = 'Name' )
df.head()

Unnamed: 0,ID,Name,Rating,level_0,index,Id,Authors,ISBN,PublishYear
0,1,Agile Web Development with Rails: A Pragmatic ...,5,,,,,,
1,1,The Restaurant at the End of the Universe (Hit...,5,153082.0,153581.0,862825.0,Douglas Adams,517545357.0,1981.0
2,1,Siddhartha,5,22791.0,22849.0,742021.0,Hermann Hesse,1434605000.0,2008.0
3,1,Siddhartha,5,69451.0,69675.0,1646972.0,Hermann Hesse,811202925.0,1951.0
4,1,Siddhartha,5,237076.0,237840.0,1482027.0,Hermann Hesse,3518458531.0,2007.0


In [31]:
overview_df(df)

##################### Number of Observations #####################
1064102
##################### Number of Variables #####################
9
##################### Number of Categoric Variables#####################
3
##################### Number of Numeric Variables#####################
6
##################### Types of Variables#####################
ID               int16
Name            object
Rating           int64
level_0        float64
index          float64
Id             float64
Authors         object
ISBN            object
PublishYear    float64
dtype: object
##################### NA #####################
ID                  0
Name                0
Rating              0
level_0        139016
index          139016
Id             139016
Authors        139016
ISBN           139016
PublishYear    139016
dtype: int64


Merged dataframe consist of 9 variables (3 categoric and 6 numeric) and 139016 observations have ratings for books which are not in the books dataframe. So these observations are dropped from dataframe.

In [32]:
df.dropna(inplace=True)

In [33]:
df.shape

(925086, 9)

In [34]:
user_book_df = df.pivot_table(index=["ID"], columns=["Name"], values="Rating")



In [35]:
def keyword_recommender():
    keyword = input('Please enter keyword to find book: ')
    liste = [col for col in user_book_df.columns if keyword in col]
    for i in range(len(liste)):
        print('press', i,'for',liste[i] )
    a = input()
    choosen = liste[int(a)]
    book_name = user_book_df[choosen]
    print(user_book_df.corrwith(book_name).sort_values(ascending=False).head(10))

In [36]:
keyword_recommender()

Please enter keyword to find book: Potter
press 0 for Beatrix Potter's Journal
press 1 for Beatrix Potter: A Life in Nature
press 2 for Centering In Pottery, Poetry, And The Person
press 3 for From Potter's Field (Kay Scarpetta, #6)
press 4 for Harrius Potter et Philosophi Lapis
press 5 for Harry Potter Boxed Set, Books 1-5 (Harry Potter, #1-5)
press 6 for Harry Potter Collection (Harry Potter, #1-6)
press 7 for Harry Potter Series Box Set (Harry Potter, #1-7)
press 8 for Harry Potter and the Chamber of Secrets (Harry Potter, #2)
press 9 for Harry Potter and the Deathly Hallows (Harry Potter, #7)
press 10 for Harry Potter and the Goblet of Fire (Harry Potter, #4)
press 11 for Harry Potter and the Half-Blood Prince (Harry Potter, #6)
press 12 for Harry Potter and the Half-blood Prince (Harry Potter, #6)
press 13 for Harry Potter and the Order of the Phoenix
press 14 for Harry Potter and the Order of the Phoenix (Harry Potter, #5)
press 15 for Harry Potter and the Philosopher's Stone
pre

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Name
Harry Potter and the Goblet of Fire (Harry Potter, #4)           1.000000
Harry Potter and the Half-Blood Prince (Harry Potter, #6)        1.000000
Harry Potter and the Order of the Phoenix (Harry Potter, #5)     1.000000
Harry Potter and the Deathly Hallows (Harry Potter, #7)          1.000000
Harry Potter und der Feuerkelch (Harry Potter, #4)               1.000000
Harry Potter und der Gefangene von Askaban (Harry Potter, #3)    1.000000
Harry Potter und die Kammer des Schreckens (Harry Potter, #2)    1.000000
Harry Potter und der Stein der Weisen (Harry Potter, #1)         0.846154
"A Problem from Hell": America and the Age of Genocide                NaN
"A" Is for Africa                                                     NaN
dtype: float64


When there is no user who reads similar books among the users, the similarity rate returns "NaN". This situation is called cold start problem in advisory systems. Therefore, no user testing has been done and a new model will be developed with Content based filtering method instead of Collaborative filtering method.

In [37]:
keyword_recommender()


Please enter keyword to find book: Potter
press 0 for Beatrix Potter's Journal
press 1 for Beatrix Potter: A Life in Nature
press 2 for Centering In Pottery, Poetry, And The Person
press 3 for From Potter's Field (Kay Scarpetta, #6)
press 4 for Harrius Potter et Philosophi Lapis
press 5 for Harry Potter Boxed Set, Books 1-5 (Harry Potter, #1-5)
press 6 for Harry Potter Collection (Harry Potter, #1-6)
press 7 for Harry Potter Series Box Set (Harry Potter, #1-7)
press 8 for Harry Potter and the Chamber of Secrets (Harry Potter, #2)
press 9 for Harry Potter and the Deathly Hallows (Harry Potter, #7)
press 10 for Harry Potter and the Goblet of Fire (Harry Potter, #4)
press 11 for Harry Potter and the Half-Blood Prince (Harry Potter, #6)
press 12 for Harry Potter and the Half-blood Prince (Harry Potter, #6)
press 13 for Harry Potter and the Order of the Phoenix
press 14 for Harry Potter and the Order of the Phoenix (Harry Potter, #5)
press 15 for Harry Potter and the Philosopher's Stone
pre