In [1]:
import json

# Read the entire file as UTF-8 text; `noveldata` holds the raw, undecoded JSON string.
with open('noveldata.json', mode='r', encoding='utf8') as json_file:
    noveldata = json_file.read()

In [2]:
# Decode the raw JSON text held in `noveldata` into Python objects.
json_novel_data = json.loads(noveldata)

In [3]:
# Take the first decoded record as a sample for trying out parse_fields.
# Reuses `json_novel_data` instead of decoding the whole JSON text a second
# time, and no longer depends on `noveldata` still being a string.
j_file = json_novel_data[0]

In [4]:
def rating_num_correction(ratings):
    """Return the fourth whitespace-separated token of a rating string.

    Args:
        ratings: Text to split on whitespace.

    Returns:
        The token at index 3, unchanged (still a string).

    Raises:
        IndexError: If the text contains fewer than four tokens.
    """
    return ratings.split()[3]

In [5]:
def parse_fields(novel_data):
    """Project one raw novel record onto the four fields used downstream.

    Args:
        novel_data: Mapping with at least the keys "url", "title",
            "rating" and "image".

    Returns:
        A new dict with the keys "url", "title", "ratings" and "cover", in
        that order. "ratings" is the token that rating_num_correction
        extracts from the record's "rating" text; "cover" is the record's
        "image" value.
    """
    parsed = {}
    parsed["url"] = novel_data["url"]
    parsed["title"] = novel_data["title"]
    parsed["ratings"] = rating_num_correction(novel_data["rating"])
    parsed["cover"] = novel_data["image"]
    return parsed

In [6]:
# Try parse_fields on the sample record; the cell displays the returned dict.
parse_fields(j_file)

{'url': 'https://www.novelupdates.com/series/taming-the-villainesses/',
 'title': 'Taming The Villainesses',
 'ratings': '183',
 'cover': 'https://cdn.novelupdates.com/images/2022/06/Taming-The-Villainesses.jpg'}

In [7]:
# Build `Books`: the parsed fields of every record whose "ratings" value is an
# integer strictly greater than MIN_RATINGS.
MIN_RATINGS = 10

Books = []

with open('noveldata.json','r',encoding='utf8') as f:
    data = f.read()

noveldata = json.loads(data)

# Iterate over the records directly rather than by a hand-maintained index:
# every record is visited (including the last one), an empty file is handled,
# and skipping a record can never stall the loop.
for novel in noveldata:
    fields = parse_fields(novel)

    try:
        ratings = int(fields["ratings"])
    except ValueError:
        # The extracted token is not an integer; leave this record out.
        continue

    if ratings > MIN_RATINGS:
        Books.append(fields)

In [8]:
import pandas as pd

# One row per entry of `Books`; the dict keys become the columns.
titles = pd.DataFrame(Books)

In [9]:
# parse_fields yields "ratings" as a string; convert the column to a numeric dtype.
titles["ratings"] = pd.to_numeric(titles["ratings"])

In [10]:
# Normalise each title into "reduced_titles": drop every character that is not
# an ASCII letter or digit, then lower-case. The character class is
# [^a-zA-Z0-9] (upper bound `Z`, not `z`), the same one search() applies to
# queries, so `[`, `\`, `]`, `^`, `_` and the backtick are stripped as well.
# NOTE(review): the first substitution also removes whitespace, so the
# whitespace-collapsing step below has nothing left to do and every title ends
# up as a single token -- confirm whether spaces were meant to survive (that
# would need the same change in search()).
titles["reduced_titles"] = (
    titles["title"]
    .str.replace(r"[^a-zA-Z0-9]", "", regex=True)
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

# Drop rows whose title reduced to the empty string.
titles = titles[titles["reduced_titles"].str.len() > 0]

In [11]:
# Display the cleaned table, including the new "reduced_titles" column.
titles

Unnamed: 0,url,title,ratings,cover,reduced_titles
0,https://www.novelupdates.com/series/taming-the...,Taming The Villainesses,183,https://cdn.novelupdates.com/images/2022/06/Ta...,tamingthevillainesses
1,https://www.novelupdates.com/series/the-main-h...,The Main Heroines are Trying to Kill Me,428,https://cdn.novelupdates.com/images/2022/04/Th...,themainheroinesaretryingtokillme
2,https://www.novelupdates.com/series/the-regres...,The Regressed Demon Lord is Kind,529,https://cdn.novelupdates.com/images/2021/06/Th...,theregresseddemonlordiskind
3,https://www.novelupdates.com/series/kidnapped-...,Kidnapped Dragons,683,https://cdn.novelupdates.com/images/2020/10/Ki...,kidnappeddragons
4,https://www.novelupdates.com/series/im-really-...,I’m Really Not The Demon God’s Lackey,509,https://cdn.novelupdates.com/images/2021/06/Im...,imreallynotthedemongodslackey
...,...,...,...,...,...
11318,https://www.novelupdates.com/series/little-ger...,"Little Ger, Slow Life in Another World",69,https://cdn.novelupdates.com/images/2021/10/Li...,littlegerslowlifeinanotherworld
11319,https://www.novelupdates.com/series/sword-and-...,Sword and Love,24,https://cdn.novelupdates.com/images/2018/09/Sw...,swordandlove
11320,https://www.novelupdates.com/series/misunderst...,Misunderstood Lousy Cop: Getting Promoted for ...,17,https://www.novelupdates.com/img/noimagefound.jpg,misunderstoodlousycopgettingpromotedforrunning...
11321,https://www.novelupdates.com/series/the-queens...,The Queen’s Husband,37,https://cdn.novelupdates.com/images/2019/03/tq...,thequeenshusband


In [12]:
# Persist the cleaned table to disk, once as JSON and once as CSV.
titles.to_json("books_titles.json")
titles.to_csv("bookdata.csv")

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# Vectoriser with default settings; the same fitted instance is later passed to search().
vectorizer = TfidfVectorizer()

# Fit on the normalised titles. search() maps row positions of `tfidf` back to
# rows of `titles` with `.iloc`, so `titles` must not be filtered or reordered
# after this cell without refitting.
tfidf = vectorizer.fit_transform(titles["reduced_titles"])

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Render a URL as an HTML link labelled "NovelUpdates" that opens in a new tab.

    The value is HTML-escaped (quotes included) before it is placed inside the
    ``href`` attribute, so a value containing ``"``, ``<``, ``>`` or ``&``
    cannot break out of the attribute or inject markup.

    Args:
        val: The link target; converted with ``str()`` before escaping.

    Returns:
        The ``<a>`` element as a string.
    """
    from html import escape  # local import keeps this cell self-contained

    return '<a target="_blank" href="{}">NovelUpdates</a>'.format(escape(str(val), quote=True))

def show_image(val):
    """Render an image URL as a 50-pixel-wide HTML ``<img>`` element.

    The value is HTML-escaped (quotes included) before it is placed inside the
    ``src`` attribute, so a value containing ``"``, ``<``, ``>`` or ``&``
    cannot break out of the attribute or inject markup.

    Args:
        val: The image source; converted with ``str()`` before escaping.

    Returns:
        The ``<img>`` element as a string.
    """
    from html import escape  # local import keeps this cell self-contained

    return '<img src="{}" width=50></img>'.format(escape(str(val), quote=True))

def search(query, vectorizer, top_n=5):
    """Look up the titles most similar to ``query`` and return them as a styled table.

    The query is lower-cased and stripped of every character that is not an
    ASCII letter or digit (spaces included), vectorised with ``vectorizer``
    and compared by cosine similarity against the module-level ``tfidf``
    matrix. The ``top_n`` most similar rows of the module-level ``titles``
    frame are selected and then ordered by "ratings", highest first.

    Args:
        query: Free-text title to look for.
        vectorizer: Fitted vectoriser exposing ``transform``; it should be the
            one that produced ``tfidf``.
        top_n: Maximum number of rows to return. Clamped to the number of
            indexed titles, so a small corpus no longer makes the selection
            step raise; values <= 0 give an empty table.

    Returns:
        A pandas ``Styler`` over the selected rows, with "url" rendered by
        make_clickable and "cover" rendered by show_image.

    NOTE(review): rows are picked by similarity but displayed in "ratings"
    order, and rows with zero similarity can pad the result when fewer than
    ``top_n`` titles match -- confirm that both are intended.
    """
    processed = re.sub("[^a-zA-Z0-9]","", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition requires the pivot position to lie inside the array.
    k = min(top_n, similarity.size)
    if k > 0:
        indices = np.argpartition(similarity, -k)[-k:]
    else:
        indices = np.array([], dtype=int)

    results = titles.iloc[indices].sort_values("ratings", ascending=False)
    return results.head(max(k, 0)).style.format({'url': make_clickable, 'cover':show_image })

   

In [15]:
# Example lookup; the cell displays the styled result table returned by search().
search("Soul Of Negary", vectorizer)

Unnamed: 0,url,title,ratings,cover,reduced_titles
8986,NovelUpdates,Soul of Negary,140,,soulofnegary
3774,NovelUpdates,HP1 kara Hajimeru Isekai Musou,129,,hp1karahajimeruisekaimusou
3773,NovelUpdates,I Became the Youngest Daughter of the Mafia Family,43,,ibecametheyoungestdaughterofthemafiafamily
3772,NovelUpdates,"Of All the Transmigrations, Why Am I a Prisoner?",28,,ofallthetransmigrationswhyamiaprisoner
3771,NovelUpdates,"I’m an S-class Knight, and I was Appointed as a Captain of an Elite Unit, but all my Subordinates were Older S-class Female Knights",21,,imansclassknightandiwasappointedasacaptainofaneliteunitbutallmysubordinateswereoldersclassfemaleknights


In [16]:
# URLs of the liked series; matched against the "url" column of `titles`.
liked_books = [
    'https://www.novelupdates.com/series/reverend-insanity/',
    'https://www.novelupdates.com/series/soul-of-negary/',
]

In [17]:
# Keep only the rows whose URL is in `liked_books` and write them out as CSV.
liked_mask = titles["url"].isin(liked_books)
liked_rows = titles[liked_mask]
liked_rows.to_csv("liked_books.csv")