In [1]:
import json
import pickle
from types import SimpleNamespace
import pandas as pd
import numpy as np
import re

from fuzzywuzzy import fuzz

In [2]:
# define object classes
class DMCBook:
    def __init__(self, title, author, itemID, main_topic):
        self.title = title
        self.author = author
        self.author_lastname = self.get_author_lastname()
        self.itemID = itemID
        self.main_topic = main_topic

        self.goodreads_match = None
    
    def get_author_lastname(self):
        a = self.author.lower()
        a = re.sub("[()*]", " ", a)
        parts = a.split()
        return parts[0] if len(parts) > 0 else ""

In [3]:
class GoodreadsBook:
    def __init__(self, title, author, authors, isbn, book_id, lg, similar_books):
        self.title = title
        self.author = author
        self.all_authors = authors
        self.isbn = isbn
        self.book_id = book_id
        self.lg = lg
        self.similar_books = similar_books

    def __str__(self):
        return f"Title: {self.title}\nAuthors: {self.authors}\nISBN: {self.isbn}\nBook-ID: {self.book_id}\nLG: {self.lg}\nsimilar books: {self.similar_books}"

    

In [4]:
class GoodreadsAuthor:
    def __init__(self, name, author_id, books):
        self.name = name
        self.lastname = self._get_lastname()
        self.author_id = author_id
        self.books = books
    
    def _get_lastname(self):
        a = self.name.lower()
        a = re.sub("[()*]", " ", a)
        parts = a.split()
        return parts[0] if len(parts) > 0 else ""

    def __str__(self):
        return f"Name: {self.name}\nAuthor_ID: {self.author_id}"

In [5]:
class BXBook:
    def __init__(self, title, author):
        self.title = title
        self.author = author

In [6]:
# read items.csv file
# preprocessing (drop duplicates, clean, lowercase author and title, 
# replace special characters in author)
filename = "DMC-2021-Task/items.csv"
df = pd.read_csv(filename, sep="|", header=0)
df = df.replace(np.nan, "", regex=True)
df["title"] = df["title"].apply(lambda x: x.lower())

# drop duplicates (for now, duplicate titles with different ids get removed)
df = df.drop_duplicates(subset=["title", "author"], keep=False)

dmc_books = [DMCBook(r["title"], r["author"], r["itemID"], r["main topic"]) for i,r in df.iterrows()]
print(len(dmc_books))

70398


In [7]:
# group books by main topic/genre
dmc_genres = dict()
for b in dmc_books:
    if b.main_topic in dmc_genres.keys():
        dmc_genres[b.main_topic].append(b)
    else:
        dmc_genres[b.main_topic] = [b]

In [None]:
# statistics about genres
import collections
import matplotlib.pyplot as plt

freq_list = [len(v) for v in dmc_genres.values()]
counter=collections.Counter(freq_list)
plt.bar(counter.keys(), counter.values(), color='g')
plt.xlim([0,145])
plt.xlabel("main topics with x books")
plt.ylabel("books per main topic")
plt.show()

print(f"Total Main Topics: {len(counter.keys())}")

print(f"Topics with more than 100 books: {sum([1 for k,v in dmc_genres.items() if len(v) > 200])}")


The Goodreads dump is separated into the following genres:

**Children** - 124k books  
**Comics & Graphic** - 89k books  
**Fantasy & Paranormal** - 259k books  
**History & Biography** - 303k books  
**Mystery, Thriller & Crime** - 219k books  
**Poetry** - 37k books  
**Romance** - 336k books  
**Young Adult** - 93k books  


##### Factual non-fiction Categories (Probably no Match)
A - The Arts  
C - Language and Linguistics  
G - Reference, Information and Interdisciplinary Subjects  
J - Society and Social Sciences  
K - Economics, Finance, Business and Management  
L - Law  
M - Medicine and Nursing  
P - Mathematics and Science  
Q - Philosopy and Religion  
R - Earth Sciences, Geography, Environment, Planning  
S - Sports and Active Outdoor recreation  
T - Technology, Engineering, Agriculture, Industrial processes  
U - Computing and Information Technology  
V - Health, Relationships and Personal Development  
W - Lifestyle, Hobbies and Leisure  

### History & Biography
D - Biography, Literature and Literary Studies   
N - History and Archaelogy  
FC - biographical  

### Comics & Graphic
X - Graphic Novels, Compic books, Cartoons  

### Children
Y - Children's, Teenage and Educational  

### Young Adult
Y (except YP)  

### Mystery, Thriller & Crime
FF - Crime and Mystery  
FH - Thriller  
FJ - adventure  
FL - science  

### Fantasy & Paranormal
FK - Horro and supernatural Fiction  
FM - Fantasy  
FN - Traditional Stories, Myths and Fairy Tales  
FW - Religious and spiritual  
FJ - adventure  
FL - science  

### Romance 
FR - Romance
FP - erotic  

-----------------------------

# Multiple Matches/Non
FB - general & literary
FD - speculative
FG - sports
FQ - contemporary lifestyle
FS - family life
FT - generational sagas
FU - humorous fiction
FV - historical -> fantasy, myster, history
FX - narrative themes
FY - special features 
FZ - fiction companions





In [8]:
non_fictional_genres = ["A","C","G","J","K","L","M","P","Q","R","S","T","U","V","W"]
history_bio_genres = ["D","N", "FC"]
comics_genres = ["X"]
children_genres = ["Y"]
young_adult_genres = ["Y"]
mystery_genres = ["FF","FH","FJ","FL"]
fantasy_genres = ["FK","FM","FN","FW","FJ","FL"]
romance_genres = ["FR","FP"]
rest = ["FB","FD","FG","FQ","FS","FT","FU","FV","FX","FY","FZ"]

In [9]:
# Count genre matches and generate genre batches
nf_books = []
rest_books = []
without_genre = []

history_books = []
comic_books = []
children_books = []
young_books = []
mystery_books = []
fantasy_books = []
romance_books = []

for k,v in dmc_genres.items():
    if len(k) < 1:
        without_genre.extend(v)
        continue
    elif k[0] in non_fictional_genres:
        nf_books.extend(v)
    elif k[0] in history_bio_genres or k[0:2] in history_bio_genres:
        history_books.extend(v)
    elif k[0] in comics_genres:
        comic_books.extend(v)
    elif k[0] in children_genres:
        children_books.extend(v)
        young_books.extend(v)
    elif k[0:2] in romance_genres:
            romance_books.extend(v)
    elif k[0:2] in mystery_genres or k[0:2] in fantasy_genres:
        if k[0:2] in fantasy_genres:
            fantasy_books.extend(v)
        else:
            mystery_books.extend(v)
    else:
        rest_books.extend(v)
        
print(f"History Books: {len(history_books)}")
print(f"Comics Books: {len(comic_books)}")
print(f"Children Books: {len(children_books)}")
print(f"Young Books: {len(young_books)}")
print(f"Mystery Books: {len(mystery_books)}")
print(f"Fantasy Books: {len(fantasy_books)}")
print(f"Romance Books: {len(romance_books)}")
print(f"Non-Fiction Books: {len(nf_books)}")
print(f"Rest Books: {len(rest_books)}")
print(f"Without Genre: {len(without_genre)}")

History Books: 449
Comics Books: 905
Children Books: 44811
Young Books: 44811
Mystery Books: 427
Fantasy Books: 19447
Romance Books: 648
Non-Fiction Books: 1348
Rest Books: 2118
Without Genre: 245


For now, ignore non-fiction, rest and without genre (~3700)

In [10]:
def group_genre_by_author(dmc_books):
    '''Group books of one genre by their author'''
    dmc_authors = dict()
    for b in dmc_books:
        if b.author_lastname in dmc_authors.keys():
            dmc_authors[b.author_lastname].append(b)
        else:
            dmc_authors[b.author_lastname] = [b]

In [11]:
def parse_authors(filename):
    authors = {}
    with open(filename) as f:
        for jsonObj in f:
            a = json.loads(jsonObj, object_hook=lambda d: SimpleNamespace(**d))
            authors[a.author_id] = GoodreadsAuthor(a.name, a.author_id, [])
    return authors

In [12]:
def parse_books_to_obj(filename, all_authors, relevant_authors):
    '''Read in goodreads genre dump'''
    books_by_author = {}
    with open(filename, "r") as f:
        c = 0
        for jsonObj in f:
            b = json.loads(jsonObj, object_hook=lambda d: SimpleNamespace(**d))
            
            # match authorID with name
            a_obj = None
            try:
                main_authorID = b.authors[0].author_id
                if all_authors[main_authorID]:
                    a_obj = all_authors[main_authorID]
                    lastname = a_obj.lastname

                    # discard books with irrelevant authors
                    if lastname in relevant_authors:
                        grb = GoodreadsBook(
                                b.title,
                                a_obj,
                                b.authors,
                                b.isbn,
                                b.book_id,
                                b.language_code,
                                b.similar_books)
                        if lastname in books_by_author:
                            books_by_author[lastname].append(grb)
                        else:
                            books_by_author[lastname] = [grb]
            except IndexError:
                #print("book without author")
                pass
            c += 1
            if (c % 100 == 0):
                print(f"read {c} books", end="\r")

    return books_by_author


In [13]:
class DMCGenre:
    def __init__(self, dmc_books, dumpfile):
        self.dmc_books = dmc_books
        self.relevant_authors = self._retrieve_relevant_authors()
        self.dumpfile = dumpfile
        
        self.goodreads_books = None
    
    def _retrieve_relevant_authors(self):
        return set([b.author_lastname for b in self.dmc_books])
    
    def read_in_dump(self, authors):
        self.goodreads_books = parse_books_to_obj(self.dumpfile, authors, self.relevant_authors)
    
    def find_matches(self):
        c = 0
        t = len(self.dmc_books)
        for b in self.dmc_books:
            if b.author_lastname in self.goodreads_books.keys():
                find_match(b,self.goodreads_books[b.author_lastname])
                c += 1
                print(f"checked {c} of {t}", end="\r")
    
    def pickle(self, filename):
        with open(filename, "wb") as f:
            pickle.dump(self.dmc_books,f)

In [14]:
def find_match(dmcb, goodreads_books):
    higher_match = False
    if dmcb.author_lastname == "":
        higher_match = True
    for grb in goodreads_books:
        ratio = fuzz.ratio(dmcb.title.lower(), grb.title.lower())
        if ratio < 95:
            partial_ratio = fuzz.partial_ratio(dmcb.title.lower(), grb.title.lower())
            if partial_ratio < 85:
                continue
            else:
                if higher_match:
                    if partial_ratio < 95:
                        continue
        dmcb.goodreads_match = grb

In [15]:
# parse author file
authors = parse_authors("goodreads/goodreads_book_authors.json")

In [16]:
all_genres = [history_books, 
              comic_books, 
              children_books, 
              young_books, 
              mystery_books,
              fantasy_books,
              romance_books]
genre_dumps = ["history_biography",
                "comics_graphic",
                "children",
                "young_adult",
                "mystery_thriller_crime",
                "fantasy_paranormal",
                "romance"]

In [None]:
# PICKLED FILES EXIST ALREADY!!!
# process genre batches
matched_counter = 0
unmatched_genre_books = []
for books,f in zip(all_genres,genre_dumps):
    dmc_genre = DMCGenre(books,f"goodreads/goodreads_books_{f}.json")
    
    dmc_genre.read_in_dump(authors)
    print(f"{f}: read dump")
    
    dmc_genre.find_matches()
    print(f"{f}: checked for matches")
    
    for b in dmc_genre.dmc_books:
        if b.goodreads_match != None:
            matched_counter += 1
        else:
            unmatched_genre_books.append(b)
    print(f"{f} matched {matched} books of {len(dmc_genre.dmc_books)}")
    
    dmc_genre.pickle(f"goodreads/matched/{f}_with_matches.p")

In [None]:
# combine books with difficult or none genre with unmatched books with genre
rest_genre_dmc_books = nf_books + rest_books + without_genre + unmatched_genre_books

In [None]:
# for remaining books use full book corpus
dmc_remaining = DMCGenre(rest_genre_dmc_books,f"goodreads/goodreads_books.json")
dmc_remaining.read_in_dump(authors)
print(f"remaining_books: read dump")
    
dmc_remaining.find_matches()
print(f"remaining_books: checked for matches")
matched_remaining = sum([1 for x in dmc_remaining.dmc_books if x.goodreads_match != None])
print(f"remaining_books: matched {matched_remaining} books of {len(dmc_remaining.dmc_books)}")
    
dmc_remaining.pickle(f"goodreads/matched/dmc_remaining_with_matches.p")

In [70]:
# determine how many books have been matched in total
total = dict()
matched = dict()
matched_with_rec = dict()
unmatched = dict()

for p in genre_dumps + ["dmc_remaining"]:
    with open(f"goodreads/matched/{p}_with_matches.p","rb") as f:
        reloaded_dmc_books = pickle.load(f)
        for dmc in reloaded_dmc_books:
            # all
            if dmc.itemID not in total.keys():
                total[dmc.itemID] = [dmc]
            else:
                total[dmc.itemID].append(dmc)
            # matched
            if dmc.goodreads_match != None:
                if dmc.itemID not in matched.keys():
                    matched[dmc.itemID] = [dmc]
                else:
                    matched[dmc.itemID].append(dmc)
                # matched with rec
                if len(dmc.goodreads_match.similar_books) > 0:
                    if dmc.itemID not in matched_with_rec.keys():
                        matched_with_rec[dmc.itemID] = [dmc]
                    else:
                        matched_with_rec[dmc.itemID].append(dmc)
            # unmatched
            else:
                if dmc.itemID not in unmatched.keys():
                    unmatched[dmc.itemID] = [dmc]
                else:
                    unmatched[dmc.itemID].append(dmc)

In [71]:
# substract matched bookIDs from unmatched dictionary
# this happens due to double parsing of some books in different genres 
# (e.g. book x is part of genre "children" and "youngadults" but will only have a match in "children")
for k in matched.keys():
    unmatched.pop(k,None)

print(f"total: {len(total.keys())}")
print(f"matched: {len(matched.keys())}")
print(f"matched with recommendation: {len(matched_with_rec.keys())}")
print(f"unmatched: {len(unmatched.keys())}")

total: 70398
matched: 23208
matched with recommendation: 13131
unmatched: 47190


In [29]:
# parse all books again and sort quickly by id
def parse_all_books(filename):
    '''Read in complete goodreads dump'''
    books_by_id = dict()
    with open(filename, "r") as f:
        c = 0
        for jsonObj in f:
            b = json.loads(jsonObj, object_hook=lambda d: SimpleNamespace(**d))
            grb = GoodreadsBook(
                                b.title,
                                None,
                                b.authors,
                                b.isbn,
                                b.book_id,
                                b.language_code,
                                b.similar_books)
            books_by_id[b.book_id] = grb
            c += 1
            if (c % 100 == 0):
                print(f"read {c} books", end="\r")

    return books_by_id


In [30]:
all_goodreads = parse_all_books("goodreads/goodreads_books.json")

read 2360600 books

In [99]:
# link similar books of goodreads match with actual book title
# re-merge dmc objects based on their ids in cases where they got recommendations from multiple corpora
matched_with_rec_with_title = dict()

for dmc_id, dmc_list in matched_with_rec.items():
    recs = []
    for dmc_obj in dmc_list:
        for sim in dmc_obj.goodreads_match.similar_books:
            try:
                sim = (all_goodreads[sim],None)
                recs.append(sim)
            except KeyError:
                pass
                # books that got recommended but are not in goodreads corpus
    recs = recs[0:15] # limit to 15 recs due to runtime
    
    dmc_obj.goodreads_match.similar_books = recs
    matched_with_rec_with_title[dmc_id] = dmc_obj


In [None]:
# check if similar books in items.csv (currently too time intensive, needs re-writing)
ci = 0
for dmc in matched_with_rec_with_title.values():
    # only look at books which have goodreads obj rec and therefore non-empty recs list
    if len(dmc.goodreads_match.similar_books) > 0:
        si = 0
        for sim in dmc.goodreads_match.similar_books:
            higher_match = False
            if sim[0].author == None:
                higher_match = True
            elif sim[0].author.author_lastname == "":
                higher_match = True
            for dmc_candidate in dmc_books:
                ratio = fuzz.ratio(dmc_candidate.title.lower(), sim[0].title.lower())
                if ratio < 95:
                    partial_ratio = fuzz.partial_ratio(dmc_candidate.title.lower(), sim[0].title.lower())
                    if partial_ratio < 85:
                        continue
                    else:
                        if higher_match:
                            if partial_ratio < 95:
                                continue
                sim = (sim[0],dmc_candidate)
            si += 1
            print(f"recs: {si} of {len(dmc.goodreads_match.similar_books)}",end="\r")
    ci += 1
    if ci % 50 == 0:
        print(f"\nfinished: {ci} of {len(matched_with_rec_with_title)}")

total_recs = []
recs_in_dmc = []
for dmc in matched_with_rec_with_title:
    recs = dmc.goodreads_match.similar_books
    total_recs.extend(recs) 
    for r in recs:
        if r[1] != None:
            recs_in_dmc.append(r)
print(f"total recs: {len(total_recs)}")
print(f"recs in dmc: {len(recs_in_dmc)}")

with open("goodreads/dmc_with_goodreads_match_in_dmc.p","wb") as f:
    pickle.dump(matched_with_rec_with_title,f)