In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
abs_path = os.path.abspath('')

In [3]:
csv = os.path.join(abs_path, "..", "resources", "books_1.Best_Books_Ever.csv")

In [4]:
df = pd.read_csv(csv)

In [5]:
# Creation of 3 different dataframes
# Certain information is only relevant for user display using Streamlit
# There's also information relevant for both, user and analysis
# By this segmentation we can analyze deeper our feature selection

df_user, df_analysis, df_discarded = df.copy(), df.copy(), df.copy()

## First clean

In [6]:
df_discarded = df[["bookId","series","characters","bookFormat","edition","publisher","publishDate","firstPublishDate","setting","price"]]

In [7]:
df_discarded

Unnamed: 0,bookId,series,characters,bookFormat,edition,publisher,publishDate,firstPublishDate,setting,price
0,2767052-the-hunger-games,The Hunger Games #1,"['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",Hardcover,First Edition,Scholastic Press,09/14/08,,"['District 12, Panem', 'Capitol, Panem', 'Pane...",5.09
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter #5,"['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",Paperback,US Edition,Scholastic Inc.,09/28/04,06/21/03,['Hogwarts School of Witchcraft and Wizardry (...,7.38
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,"['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",Paperback,,Harper Perennial Modern Classics,05/23/06,07/11/60,"['Maycomb, Alabama (United States)']",
3,1885.Pride_and_Prejudice,,"['Mr. Bennet', 'Mrs. Bennet', 'Jane Bennet', '...",Paperback,"Modern Library Classics, USA / CAN",Modern Library,10/10/00,01/28/13,"['United Kingdom', 'Derbyshire, England (Unite...",
4,41865.Twilight,The Twilight Saga #1,"['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",Paperback,,"Little, Brown and Company",09/06/06,10/05/05,"['Forks, Washington (United States)', 'Phoenix...",2.1
...,...,...,...,...,...,...,...,...,...,...
52473,11492014-fractured,Fateful #2,[],Nook,,Cheri Schmidt,May 28th 2011,,[],
52474,11836711-anasazi,Sense of Truth #2,[],Paperback,First Edition,Bokheim Publishing,August 5th 2011,August 3rd 2011,[],
52475,10815662-marked,Soul Guardians #1,[],Paperback,,CreateSpace,March 18th 2011,March 15th 2011,[],7.37
52476,11330278-wayward-son,,[],Paperback,1st edition,Cascada Productions,September 1st 2011,April 5th 2011,[],2.86


In [8]:
df_analysis = df.drop(df_discarded.columns, axis = 1)

In [9]:
for i in df_analysis[["rating","numRatings","likedPercent","ratingsByStars","bbeScore","bbeVotes"]].columns:

    print(f"Length of {i}: {len(df_analysis[i])}, NaNs in {i}: {len(df_analysis[df_analysis[i].isnull() == True])}")

Length of rating: 52478, NaNs in rating: 0
Length of numRatings: 52478, NaNs in numRatings: 0
Length of likedPercent: 52478, NaNs in likedPercent: 622
Length of ratingsByStars: 52478, NaNs in ratingsByStars: 0
Length of bbeScore: 52478, NaNs in bbeScore: 0
Length of bbeVotes: 52478, NaNs in bbeVotes: 0


In [10]:
# likedPercent has Nan's and we can use rating instead
# bbeScore isn't scaled and we can use rating instead
# bbeVotes values are much lower than numRatings ones
# ratingsByStars could be useful in df_user but not in df_analysis

df_analysis.drop(["likedPercent","bbeScore","bbeVotes","ratingsByStars"], axis = 1, inplace= True)

In [11]:
# Deleting those without specified language

df_analysis = df_analysis[df_analysis["language"].isnull() == False].reset_index(drop=True)

In [12]:
# Deleting those without specified pages

df_analysis = df_analysis[df_analysis["pages"].isnull() == False].reset_index(drop=True)

In [13]:
# Deleting those without specified genres

df_analysis = df_analysis[df_analysis["genres"] != "[]"].reset_index(drop = True)

In [14]:
l_new_title = []
for title, author in zip(df_analysis["title"],df_analysis["author"]):
    
    l_new_title.append(' - '.join((title,author)))
    
df_analysis["title"] = l_new_title

df_analysis.drop("author", axis = 1, inplace=True)

df_analysis.drop_duplicates(keep = False, inplace=True)

df_analysis.reset_index(drop = True, inplace = True)

In [15]:
df_analysis

Unnamed: 0,title,rating,description,language,isbn,genres,pages,awards,numRatings,coverImg
0,The Hunger Games - Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,['Locus Award Nominee for Best Young Adult Boo...,6376780,https://i.gr-assets.com/images/S/compressed.ph...
1,Harry Potter and the Order of the Phoenix - J....,4.50,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,['Bram Stoker Award for Works for Young Reader...,2507623,https://i.gr-assets.com/images/S/compressed.ph...
2,To Kill a Mockingbird - Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"['Classics', 'Fiction', 'Historical Fiction', ...",324,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,https://i.gr-assets.com/images/S/compressed.ph...
3,"Pride and Prejudice - Jane Austen, Anna Quindl...",4.26,Alternate cover edition of ISBN 9780679783268S...,English,9999999999999,"['Classics', 'Fiction', 'Romance', 'Historical...",279,[],2998241,https://i.gr-assets.com/images/S/compressed.ph...
4,Twilight - Stephenie Meyer,3.60,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,https://i.gr-assets.com/images/S/compressed.ph...
...,...,...,...,...,...,...,...,...,...,...
43845,Fractured - Cheri Schmidt (Goodreads Author),4.00,The Fateful Trilogy continues with Fractured. ...,English,2940012616562,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",0,[],871,https://i.gr-assets.com/images/S/compressed.ph...
43846,Anasazi - Emma Michaels,4.19,"'Anasazi', sequel to 'The Thirteenth Chime' by...",English,9999999999999,"['Mystery', 'Young Adult']",190,[],37,https://i.gr-assets.com/images/S/compressed.ph...
43847,Marked - Kim Richardson (Goodreads Author),3.70,--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,English,9781461017097,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,"[""Readers' Favorite Book Award (2011)""]",6674,https://i.gr-assets.com/images/S/compressed.ph...
43848,"Wayward Son - Tom Pollack (Goodreads Author), ...",3.85,A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,English,9781450755634,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,[],238,https://i.gr-assets.com/images/S/compressed.ph...


In [16]:
df_analysis = df_analysis[(df_analysis["numRatings"]> 100)&(df_analysis["rating"]>0)].sort_values("numRatings", ascending=False)
df_analysis.reset_index(drop=True, inplace=True)

In [17]:
df_analysis.reset_index(drop=False,inplace=True)

df_analysis.rename(columns = {"index" : "book_index"},inplace=True
                  )

In [18]:
df_analysis

Unnamed: 0,book_index,title,rating,description,language,isbn,genres,pages,awards,numRatings,coverImg
0,0,Harry Potter and the Sorcerer's Stone - J.K. R...,4.47,Harry Potter's life is miserable. His parents ...,English,9999999999999,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",309,"[""Mythopoeic Fantasy Award for Children's Lite...",7048471,https://i.gr-assets.com/images/S/compressed.ph...
1,1,The Hunger Games - Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,['Locus Award Nominee for Best Young Adult Boo...,6376780,https://i.gr-assets.com/images/S/compressed.ph...
2,2,Twilight - Stephenie Meyer,3.60,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,https://i.gr-assets.com/images/S/compressed.ph...
3,3,To Kill a Mockingbird - Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"['Classics', 'Fiction', 'Historical Fiction', ...",324,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,https://i.gr-assets.com/images/S/compressed.ph...
4,4,"The Great Gatsby - F. Scott Fitzgerald, Franci...",3.92,Alternate Cover Edition ISBN: 0743273567 (ISBN...,English,9999999999999,"['Classics', 'Fiction', 'School', 'Literature'...",200,['Grammy Award Nominee for Best Spoken Word Al...,3775504,https://i.gr-assets.com/images/S/compressed.ph...
...,...,...,...,...,...,...,...,...,...,...,...
40848,40848,My Country 'Tis of Thee - Keith Ellison,3.82,The first Muslim to be elected to Congress spe...,English,9999999999999,"['Memoir', 'Politics', 'Biography', 'Religion'...",304,[],101,https://i.gr-assets.com/images/S/compressed.ph...
40849,40849,La vieja tigresa o el erotismo en la senectud ...,3.86,Vuelve Miguel Noguera con otro de sus exitosos...,Spanish,9788416290161,"['Humor', 'Comics', 'Graphic Novels']",192,[],101,https://i.gr-assets.com/images/S/compressed.ph...
40850,40850,Tehtaan varjossa - Toivo Pekkanen,3.44,Omaelämäkerrallinen romaani kuvaa työläispojan...,Finnish,9789510050880,"['Fiction', 'Finnish Literature']",271,['Valtion kirjallisuuspalkinto (1933)'],101,https://i.gr-assets.com/images/S/compressed.ph...
40851,40851,"دستور زبان نما - Roy Thompson, محمد گذرآبادی (...",3.69,«دستور زبان نما» برای کسانی طراحی شده که تازه ...,Persian,9786006128696,"['Film', 'Nonfiction']",264,[],101,https://i.gr-assets.com/images/S/compressed.ph...


In [19]:
################################################################################################################################

## Genres 

In [20]:
def get_dict(row):
    try:
        l_cleaned = eval(row)
    except:
        l_cleaned = {}
    return l_cleaned

In [21]:
len(df_analysis["genres"])

40853

In [22]:
l_cleaned_categories = list()
for i in df_analysis["genres"]:
    l_cleaned_categories.append(get_dict(i))

In [23]:
l_unique_categories = list()
for i in l_cleaned_categories:
    
    for j in i:
        l_unique_categories.append(j)

In [24]:
len(set(l_unique_categories))

966

In [25]:
s_unique_categories = set(l_unique_categories)

In [26]:
# Counting most common genres

d_cont = {i:0 for i in s_unique_categories}
for i in l_cleaned_categories:
    for j in i:
        d_cont[j] = d_cont[j] + 1

In [27]:
# 50 most common genres

l_sorted_categories = list(sorted(d_cont.items(), key=lambda item: item[1]))
l_sorted_categories = l_sorted_categories[::-1]

In [28]:
l_sorted_categories = [j[0] for j in l_sorted_categories[:50]]

In [29]:
l_datos = list()


for row in l_cleaned_categories:

    
    l_categorias_peliculas = list()
    
    for cat in l_sorted_categories:
    
        if cat in row:
            l_categorias_peliculas.append(1)

        else:

            l_categorias_peliculas.append(0)

    l_datos.append(l_categorias_peliculas)

In [30]:
df_generos_peliculas = pd.DataFrame(data = l_datos, columns = l_sorted_categories)

df_generos_peliculas.insert(0, 'title', df_analysis["title"])
df_generos_peliculas.insert(0, 'book_index', df_analysis["book_index"])  

df_generos_peliculas

Unnamed: 0,book_index,title,Fiction,Romance,Fantasy,Young Adult,Contemporary,Adult,Novels,Mystery,...,Realistic Fiction,Drama,Memoir,Religion,New Adult,20th Century,War,Vampires,Erotica,American
0,0,Harry Potter and the Sorcerer's Stone - J.K. R...,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,The Hunger Games - Suzanne Collins,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Twilight - Stephenie Meyer,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,To Kill a Mockingbird - Harper Lee,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,"The Great Gatsby - F. Scott Fitzgerald, Franci...",1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40848,40848,My Country 'Tis of Thee - Keith Ellison,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
40849,40849,La vieja tigresa o el erotismo en la senectud ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40850,40850,Tehtaan varjossa - Toivo Pekkanen,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40851,40851,"دستور زبان نما - Roy Thompson, محمد گذرآبادی (...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
################################################################################################################################

## Authors (Discarded)

In [32]:
l_cleaned_authors = df["author"].values
l_cleaned_authors

array(['Suzanne Collins', 'J.K. Rowling, Mary GrandPré (Illustrator)',
       'Harper Lee', ..., 'Kim Richardson (Goodreads Author)',
       'Tom Pollack (Goodreads Author), John Loftus (Goodreads Author), Jim Alves',
       'Misty Moncur (Goodreads Author)'], dtype=object)

In [33]:
l_unique_author = list(df["author"].unique())

In [34]:
df["author"].value_counts()[:30]

Nora Roberts (Goodreads Author)           86
Agatha Christie                           72
Stephen King (Goodreads Author)           66
Erin Hunter                               59
Bella Forrest (Goodreads Author)          53
Meg Cabot (Goodreads Author)              52
Lucian Bane (Goodreads Author)            49
Karen Kingsbury (Goodreads Author)        48
Terry Pratchett                           48
NOT A BOOK                                47
Mercedes Lackey                           47
Bonnie Bryant                             47
Enid Blyton                               45
Carolyn Keene                             43
Dean Koontz (Goodreads Author)            41
Sherrilyn Kenyon (Goodreads Author)       41
Orson Scott Card                          40
J.D. Robb (Goodreads Author)              39
Christine Feehan (Goodreads Author)       39
Charlaine Harris (Goodreads Author)       38
Terry Brooks (Goodreads Author)           37
Kristen Ashley (Goodreads Author)         37
Laurell K.

In [35]:
df.groupby("author",as_index=False).mean().sort_values("rating", ascending=False).tail(50)

Unnamed: 0,author,rating,numRatings,likedPercent,bbeScore,bbeVotes
21788,Richard Butler,1.75,6.0,83.0,77.5,1.0
27342,Yolanda Williams (Goodreads Author),1.75,1.0,50.0,98.0,1.0
4647,Christine Catlin,1.67,136.0,18.0,95.0,1.0
5232,D. Terry (Goodreads Author),1.5,0.333333,100.0,97.5,1.0
19315,Nathan Preedy (Goodreads Author),1.5,5.5,73.0,90.0,1.0
17024,Marco Morrone,1.0,2.0,0.0,98.0,1.0
4602,Christina Corlett,1.0,1.0,0.0,95.0,1.0
7344,Eloise Epps MacKinnon,1.0,1.0,0.0,100.0,1.0
24454,Stephen Coleman,1.0,1.0,0.0,99.0,1.0
9454,Gwyn Prins,0.0,0.0,,76.0,1.0


#### Probably discarded

In [36]:
################################################################################################################################

## Series (Discarded)

In [37]:
l_unique_series_uncleaned = set(df["series"].values)
l_unique_series_uncleaned = [i for i in l_unique_series_uncleaned if type(i)==str]

In [38]:
unique_series = list()
for i in l_unique_series_uncleaned:
    unique_series.append(i.split(" #")[0])

#### Probably discarded

In [39]:
################################################################################################################################

## Pages

In [40]:
df_analysis["pages"]=df_analysis["pages"].apply(lambda x: x.split()[0] if pd.notnull(x) else x)

In [41]:
df_analysis["pages"]=df_analysis["pages"].apply(lambda x: int(x) if pd.notnull(x) else x)

In [42]:
l_pages = list()

for i in df_analysis["pages"]:
        if i < 200:
            l_pages.append("short")
        elif 200 <= i < 500:
            l_pages.append("medium")
        elif i >= 500:
            l_pages.append("large")
        else:
            l_pages.append(np.nan)

In [43]:
df_analysis["pages"] = l_pages

In [44]:
df_pages = pd.DataFrame()
df_pages["title"] = df_analysis["title"]
df_pages["short"] = [1 if i == "short" else 0 for i in l_pages]
df_pages["medium"] = [1 if i == "medium" else 0 for i in l_pages]
df_pages["large"] = [1 if i == "large" else 0 for i in l_pages]

In [45]:
df_pages.insert(0, 'book_index', df_analysis["book_index"])  

df_pages

Unnamed: 0,book_index,title,short,medium,large
0,0,Harry Potter and the Sorcerer's Stone - J.K. R...,0,1,0
1,1,The Hunger Games - Suzanne Collins,0,1,0
2,2,Twilight - Stephenie Meyer,0,0,1
3,3,To Kill a Mockingbird - Harper Lee,0,1,0
4,4,"The Great Gatsby - F. Scott Fitzgerald, Franci...",0,1,0
...,...,...,...,...,...
40848,40848,My Country 'Tis of Thee - Keith Ellison,0,1,0
40849,40849,La vieja tigresa o el erotismo en la senectud ...,1,0,0
40850,40850,Tehtaan varjossa - Toivo Pekkanen,0,1,0
40851,40851,"دستور زبان نما - Roy Thompson, محمد گذرآبادی (...",0,1,0


In [46]:
################################################################################################################################

## Simulating **`"User DataFrame"`**

In [47]:
# Simulating user´s input

df_user_input = pd.DataFrame()

df_user_input["title"] = ['The Hunger Games - Suzanne Collins', 'Harry Potter and the Deathly Hallows - J.K. Rowling',
       'The Hobbit, or There and Back Again - J.R.R. Tolkien','The Catcher in the Rye - J.D. Salinger', 'Eclipse - Stephenie Meyer',
       'The Golden Compass - Philip Pullman',"Me Before You - Jojo Moyes (Goodreads Author)","A Clash of Kings - George R.R. Martin"]
df_user_input["rating"] =[2,6,4,2,9,2,6,5]

In [48]:
df_user_input

Unnamed: 0,title,rating
0,The Hunger Games - Suzanne Collins,2
1,Harry Potter and the Deathly Hallows - J.K. Ro...,6
2,"The Hobbit, or There and Back Again - J.R.R. T...",4
3,The Catcher in the Rye - J.D. Salinger,2
4,Eclipse - Stephenie Meyer,9
5,The Golden Compass - Philip Pullman,2
6,Me Before You - Jojo Moyes (Goodreads Author),6
7,A Clash of Kings - George R.R. Martin,5


In [49]:
df_generos_peliculas

Unnamed: 0,book_index,title,Fiction,Romance,Fantasy,Young Adult,Contemporary,Adult,Novels,Mystery,...,Realistic Fiction,Drama,Memoir,Religion,New Adult,20th Century,War,Vampires,Erotica,American
0,0,Harry Potter and the Sorcerer's Stone - J.K. R...,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,The Hunger Games - Suzanne Collins,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Twilight - Stephenie Meyer,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,To Kill a Mockingbird - Harper Lee,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,"The Great Gatsby - F. Scott Fitzgerald, Franci...",1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40848,40848,My Country 'Tis of Thee - Keith Ellison,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
40849,40849,La vieja tigresa o el erotismo en la senectud ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40850,40850,Tehtaan varjossa - Toivo Pekkanen,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40851,40851,"دستور زبان نما - Roy Thompson, محمد گذرآبادی (...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Creating **`"Weighted Genre Matrix"`**

In [50]:
df_user_weights = pd.merge(left=df_generos_peliculas, right=df_pages, how="inner", on="book_index")

df_user_weights.drop(["book_index","title_y"],axis=1, inplace=True)

df_user_weights.rename(columns={"title_x":"title"}, inplace=True)


In [51]:
df_user_weights

Unnamed: 0,title,Fiction,Romance,Fantasy,Young Adult,Contemporary,Adult,Novels,Mystery,Historical Fiction,...,Religion,New Adult,20th Century,War,Vampires,Erotica,American,short,medium,large
0,Harry Potter and the Sorcerer's Stone - J.K. R...,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,The Hunger Games - Suzanne Collins,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Twilight - Stephenie Meyer,1,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,To Kill a Mockingbird - Harper Lee,1,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
4,"The Great Gatsby - F. Scott Fitzgerald, Franci...",1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40848,My Country 'Tis of Thee - Keith Ellison,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
40849,La vieja tigresa o el erotismo en la senectud ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
40850,Tehtaan varjossa - Toivo Pekkanen,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
40851,"دستور زبان نما - Roy Thompson, محمد گذرآبادی (...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [52]:
df_user_weights = pd.merge(left=df_user_input, right=df_user_weights, how="outer", on="title")
df_user_weights

Unnamed: 0,title,rating,Fiction,Romance,Fantasy,Young Adult,Contemporary,Adult,Novels,Mystery,...,Religion,New Adult,20th Century,War,Vampires,Erotica,American,short,medium,large
0,The Hunger Games - Suzanne Collins,2.0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Harry Potter and the Deathly Hallows - J.K. Ro...,6.0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,"The Hobbit, or There and Back Again - J.R.R. T...",4.0,1,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,The Catcher in the Rye - J.D. Salinger,2.0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
4,Eclipse - Stephenie Meyer,9.0,1,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40848,My Country 'Tis of Thee - Keith Ellison,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
40849,La vieja tigresa o el erotismo en la senectud ...,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
40850,Tehtaan varjossa - Toivo Pekkanen,,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
40851,"دستور زبان نما - Roy Thompson, محمد گذرآبادی (...",,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [53]:
df_user_weights.iloc[:,2:].values

array([[1, 1, 1, ..., 0, 1, 0],
       [1, 0, 1, ..., 0, 0, 1],
       [1, 0, 1, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 1, 0]], dtype=int64)

In [54]:
weighted_genre_matrix = list()

for rating, genres in zip(df_user_weights[df_user_weights["rating"]>0]["rating"].values, df_user_weights.iloc[:,2:].values):
    weighted_genre_matrix.append(rating*genres)

In [55]:
l_columns_pages = ["short", "medium", "large"]
weighted_genre_matrix = pd.DataFrame(weighted_genre_matrix, columns = l_sorted_categories+l_columns_pages)

weighted_genre_matrix

Unnamed: 0,Fiction,Romance,Fantasy,Young Adult,Contemporary,Adult,Novels,Mystery,Historical Fiction,Nonfiction,...,Religion,New Adult,20th Century,War,Vampires,Erotica,American,short,medium,large
0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1,6.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
2,4.0,0.0,4.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,2.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0
4,9.0,9.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,9.0
5,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
6,6.0,6.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
7,5.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


## Creating **`"User profile"`**

In [56]:
s_user_weights = weighted_genre_matrix.sum()

s_user_weights = s_user_weights/s_user_weights.sum()

s_user_weights

Fiction                    0.104956
Romance                    0.049563
Fantasy                    0.081633
Young Adult                0.072886
Contemporary               0.017493
Adult                      0.032070
Novels                     0.017493
Mystery                    0.000000
Historical Fiction         0.000000
Nonfiction                 0.000000
Audiobook                  0.055394
Classics                   0.034985
Adventure                  0.055394
Historical                 0.000000
Literature                 0.005831
Paranormal                 0.026239
Science Fiction            0.011662
Childrens                  0.034985
Thriller                   0.000000
Magic                      0.017493
Humor                      0.000000
Crime                      0.000000
Urban Fantasy              0.000000
Suspense                   0.000000
Contemporary Romance       0.017493
History                    0.000000
Science Fiction Fantasy    0.049563
Chick Lit                  0

## Creating **`"Weighted Books Matrix"`**

In [57]:
df_recomendation = df_user_weights[~(df_user_weights["rating"] > 0)]

df_recomendation

Unnamed: 0,title,rating,Fiction,Romance,Fantasy,Young Adult,Contemporary,Adult,Novels,Mystery,...,Religion,New Adult,20th Century,War,Vampires,Erotica,American,short,medium,large
8,Harry Potter and the Sorcerer's Stone - J.K. R...,,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,Twilight - Stephenie Meyer,,1,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
10,To Kill a Mockingbird - Harper Lee,,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
11,"The Great Gatsby - F. Scott Fitzgerald, Franci...",,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
12,The Fault in Our Stars - John Green (Goodreads...,,1,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40848,My Country 'Tis of Thee - Keith Ellison,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
40849,La vieja tigresa o el erotismo en la senectud ...,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
40850,Tehtaan varjossa - Toivo Pekkanen,,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
40851,"دستور زبان نما - Roy Thompson, محمد گذرآبادی (...",,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [58]:
weighted_books_matrix = list()

for j in df_recomendation.iloc[:, 2:].values:
    weighted_books_matrix.append(s_user_weights.values*j)

In [59]:
np.array(weighted_books_matrix).shape

(40845, 53)

In [60]:
weighted_books_matrix = pd.DataFrame(data = weighted_books_matrix, columns = l_sorted_categories+l_columns_pages)

weighted_books_matrix

Unnamed: 0,Fiction,Romance,Fantasy,Young Adult,Contemporary,Adult,Novels,Mystery,Historical Fiction,Nonfiction,...,Religion,New Adult,20th Century,War,Vampires,Erotica,American,short,medium,large
0,0.104956,0.000000,0.081633,0.072886,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.046647,0.000000
1,0.104956,0.049563,0.081633,0.072886,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.026239,0.0,0.000000,0.0,0.000000,0.058309
2,0.104956,0.000000,0.000000,0.072886,0.000000,0.0,0.017493,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.046647,0.000000
3,0.104956,0.049563,0.000000,0.000000,0.000000,0.0,0.017493,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.005831,0.0,0.046647,0.000000
4,0.104956,0.049563,0.000000,0.072886,0.017493,0.0,0.017493,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.046647,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40840,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.005831,0.0,0.046647,0.000000
40841,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000
40842,0.104956,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.046647,0.000000
40843,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.046647,0.000000


In [61]:
weighted_books_matrix.sum(axis = 1)

0        0.577259
1        0.504373
2        0.282799
3        0.265306
4        0.376093
           ...   
40840    0.052478
40841    0.000000
40842    0.151603
40843    0.046647
40844    0.212828
Length: 40845, dtype: float64

## Creating **`"Recomendation DataFrame"`**

In [62]:
df_recom_def = pd.concat([df_analysis[["title", "description", "awards", "isbn", "coverImg"]], weighted_books_matrix.sum(axis = 1)], axis = 1).sort_values(0, ascending = False)
df_recom_def.rename(columns={0:"coincidence_rating"}, inplace=True)

In [63]:
df_recom_def[df_recom_def["coincidence_rating"]>0]

Unnamed: 0,title,description,awards,isbn,coverImg,coincidence_rating
17,The Girl with the Dragon Tattoo - Stieg Larsso...,"Harriet Vanger, a scion of one of Sweden’s wea...",['Barry Award for Mystery/Crime Novel Of The D...,9999999999999,https://i.gr-assets.com/images/S/compressed.ph...,0.588921
16,Angels & Demons - Dan Brown (Goodreads Author),World-renowned Harvard symbologist Robert Lang...,"[""Humo's Gouden Bladwijzer (2005)""]",9781416524793,https://i.gr-assets.com/images/S/compressed.ph...,0.588921
14,The Catcher in the Rye - J.D. Salinger,The hero-narrator of The Catcher in the Rye is...,['Teen Read Award Nominee for Best All-Time-Fa...,9780316769174,https://i.gr-assets.com/images/S/compressed.ph...,0.588921
0,Harry Potter and the Sorcerer's Stone - J.K. R...,Harry Potter's life is miserable. His parents ...,"[""Mythopoeic Fantasy Award for Children's Lite...",9999999999999,https://i.gr-assets.com/images/S/compressed.ph...,0.577259
8,Divergent - Veronica Roth (Goodreads Author),"In Beatrice Prior's dystopian Chicago world, s...","['Georgia Peach Book Award (2012)', 'South Car...",9780062024039,https://i.gr-assets.com/images/S/compressed.ph...,0.577259
...,...,...,...,...,...,...
15997,Spellsinger - Alan Dean Foster,"Among sentient animals and humans, hardheaded ...",[],9780743498258,https://i.gr-assets.com/images/S/compressed.ph...,0.005831
31635,Absolute Brightness - James Lecesne,Darkness: Where light is not. Light: Brightnes...,['William C. Morris YA Debut Award Nominee (20...,9780061256271,https://i.gr-assets.com/images/S/compressed.ph...,0.005831
28127,What Falls Away: A Memoir - Mia Farrow,"In an exquisitely written memoir, Mia Farrow i...",[],9780553564662,https://i.gr-assets.com/images/S/compressed.ph...,0.005831
27492,Blind Spot - Dani Pettrey (Goodreads Author),FBI agent Declan Grey is in the chase of his l...,[],9780764212963,https://i.gr-assets.com/images/S/compressed.ph...,0.005831
