In [1]:
import pandas as pd

In [6]:
# Read the train/test CSV files into DataFrames (x_* first, then y_*).
x_train, x_test = pd.read_csv('../data/x_train.csv'), pd.read_csv('../data/x_test.csv')
y_train, y_test = pd.read_csv('../data/y_train.csv'), pd.read_csv('../data/y_test.csv')

In [7]:
# Preview the first five rows to check which columns were loaded.
x_train.head(n=5)

Unnamed: 0,name,assoc_names,original_language,authors,genres,tags,start_year,licensed,original_publisher,english_publisher,...,activity_week_rank,activity_month_rank,activity_all_time_rank,on_reading_lists,reading_list_month_rank,reading_list_all_time_rank,rating,rating_votes,related_series_ids,chapter_latest_translated
0,Capitalist Monster Hunter,"['Monster Hunter Born of Capitalism', '자본주의가 낳...",korean,['쌈마이'],"['action', 'adventure', 'comedy', 'fantasy', '...","['accelerated growth', 'angels', 'artificial i...",2018,False,kakaopage,,...,2000,2000,9365,914,18081,7999,3.8,37,,
1,Gear Drive,"['Haguruma Drive.', 'はぐるまどらいぶ。']",japanese,"['kabayakidare', 'かばやきだれ']","['action', 'adventure', 'fantasy', 'shoujo ai'...","['adapted to manga', 'adventurers', 'artificia...",2017,True,overlap,j-novel club,...,2000,2000,11716,405,3254,12722,4.2,20,,c25
2,Seirei Tsukai no Blade Dance Extra,"['Blade Dance of Elementalers Extra', 'Seirei ...",japanese,"['shimizu yuu', '志瑞祐']","['action', 'comedy', 'ecchi', 'fantasy', 'hare...","['adapted to anime', 'adapted to manga', 'assa...",2014,False,,,...,2000,2000,14354,576,10829,10594,3.9,16,[958],covers
3,I Was Forced by the Saintess to Marry the Ugly...,['Seijou-sama ni Minikui Kami-sama to no Kekko...,japanese,['村川咲（赤村咲）'],"['fantasy', 'romance', 'shoujo', 'supernatural']","['abandoned children', 'absent parents', 'abus...",2020,False,kakuyomu,,...,2000,11446,2934,1775,12720,4988,4.1,56,,v5c5
4,Jian Jiang Shan,"['Jian Jiang Shan', '见江山']",chinese,"['a big roll of toilet paper', '好大一卷卫生纸']","['drama', 'shounen ai', 'xianxia']","['ancient times', 'cultivation']",2018,False,jjwxc,,...,2000,2000,20744,154,19795,18117,3.7,6,,c5


In [9]:
# Walk-through: TF-IDF feature extraction for a text column
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_TFIDF_features(train_data, test_data, feature_column):
    """Turn one text column into TF-IDF features for the train and test sets.

    A single ``TfidfVectorizer`` is fitted on ``train_data[feature_column]``
    and then reused, already fitted, to transform
    ``test_data[feature_column]``, so both results share the same columns.

    Parameters
    ----------
    train_data, test_data
        Containers indexable by ``feature_column`` (the notebook passes
        DataFrames) whose selected column holds the text documents.
    feature_column
        Key of the text column to vectorize.

    Returns
    -------
    tuple
        ``(train_feature, test_feature)`` as returned by the vectorizer's
        ``fit_transform`` and ``transform`` calls respectively.
    """
    tfidf = TfidfVectorizer()
    # Fit on the training documents only; the test documents never
    # influence the fitted vectorizer.
    fitted_train = tfidf.fit_transform(train_data[feature_column])
    return fitted_train, tfidf.transform(test_data[feature_column])


In [15]:
# Combine genres and tags into a single string for each book.
# Missing values are replaced with '' first: NaN + str stays NaN, and
# TfidfVectorizer refuses NaN documents, so one book without genres or
# tags would otherwise break the whole feature extraction.
for _frame in (x_train, x_test):
    _frame['genres_tags_feature'] = (
        _frame['genres'].fillna('') + ' ' + _frame['tags'].fillna('')
    )

# Extract TF-IDF features (vectorizer is fitted on the training split only)
train_feature, test_feature = extract_TFIDF_features(x_train, x_test, 'genres_tags_feature')


In [16]:
# Show both sparse TF-IDF matrices, train first and test second
print(train_feature, test_feature, sep='\n')


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 554474 stored elements and shape (17464, 909)>
  Coords	Values
  (0, 12)	0.07891933322403903
  (0, 20)	0.08351201336218922
  (0, 171)	0.07659299696708051
  (0, 304)	0.05706361556270087
  (0, 695)	0.13365341368238928
  (0, 313)	0.13365341368238928
  (0, 731)	0.10804165390387416
  (0, 799)	0.10661319100130412
  (0, 10)	0.19179129125218847
  (0, 363)	0.11780730073789981
  (0, 43)	0.19750910027437363
  (0, 64)	0.18987782504069628
  (0, 418)	0.18987782504069628
  (0, 133)	0.13604319689982183
  (0, 640)	0.2023515406596178
  (0, 211)	0.14169447453553222
  (0, 264)	0.1600976978478078
  (0, 332)	0.1680346035060014
  (0, 719)	0.1680346035060014
  (0, 336)	0.1111937955365396
  (0, 276)	0.12443260423248015
  (0, 357)	0.13941281094355765
  (0, 400)	0.18888783390551753
  (0, 462)	0.18987782504069628
  (0, 485)	0.05120455029918352
  :	:
  (17463, 315)	0.07083277969421926
  (17463, 804)	0.21088231087605247
  (17463, 16)	0.12355778831817191


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_books(book_name, x_train, train_feature):
    """Rank every training book by cosine similarity to ``book_name``.

    Parameters
    ----------
    book_name
        Title to look up in ``x_train['name']``. If several rows share the
        title, the first one is used as the query.
    x_train
        DataFrame with a ``'name'`` column; its rows must be in the same
        order as the rows of ``train_feature``.
    train_feature
        Feature matrix with one row per row of ``x_train`` (the notebook
        passes the sparse TF-IDF matrix).

    Returns
    -------
    pandas.DataFrame
        Columns ``'name'`` and ``'similarity_score'``, sorted by score in
        descending order and keeping the index labels of ``x_train``.

    Raises
    ------
    IndexError
        If ``book_name`` does not occur in ``x_train['name']``.
    """
    # Resolve the *positional* row of the book. The feature matrix is
    # addressed by position, so using the DataFrame's index label here
    # would pick the wrong row (or fail) whenever x_train does not have a
    # default 0..n-1 index, e.g. after filtering or shuffling.
    matches = (x_train['name'] == book_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"book {book_name!r} not found in x_train['name']")
    book_position = int(matches[0])

    # Calculate cosine similarity between the book and all books in the training set
    cosine_similarities = cosine_similarity(train_feature[book_position], train_feature).flatten()

    # Pair each book name with its score; the score array is aligned to
    # x_train by position and inherits x_train's index labels.
    similarity_df = pd.DataFrame({
        'name': x_train['name'],
        'similarity_score': cosine_similarities
    })

    # Most similar first (the queried book itself scores 1.0)
    similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)

    return similarity_df


In [25]:

# Print the top 10 most similar books for each query title, in order:
#   "Reverend Insanity"           -> Fig_03-01.png
#   "Warlock of the Magus World"  -> Fig_03-02.png
for query_title in ('Reverend Insanity', 'Warlock of the Magus World'):
    similar_books = get_similar_books(query_title, x_train, train_feature)
    print(similar_books.head(10))

                                name  similarity_score
3924               Reverend Insanity          1.000000
11124           Pursuit of the Truth          0.577366
14596          The Ruler Of Darkness          0.554362
11916                Omnipotent Sage          0.536461
478          My Longevity Simulation          0.524413
10905  Aspiring to the Immortal Path          0.504625
8648            I’m Really a Villain          0.504229
9082            Emperor’s Domination          0.502463
32                      Sage Monarch          0.491454
12677         I Am the Fated Villain          0.490433
                                       name  similarity_score
7737             Warlock of the Magus World          1.000000
16704                      Netheril’s Glory          0.709142
16780                         Age of Adepts          0.552378
12666               Carefree Path of Dreams          0.497295
3976                 Sovereign of the Ashes          0.497204
9675                   

In [None]:

# example for reference of building features
def beer_features(review):
    '''Build the feature vector for one beer review.

    Reads the ABV and the appearance, aroma, palate and taste values from
    ``review`` unchanged, followed by the review time converted with
    ``int``. Returns them as a list in exactly that order.
    '''
    # Keys are read in this order, which is also the output order.
    passthrough_keys = (
        'beer/ABV',
        'review/appearance',
        'review/aroma',
        'review/palate',
        'review/taste',
    )
    vector = [review[key] for key in passthrough_keys]
    vector.append(int(review['review/time']))
    return vector

In [None]:
# features based on reviews
#def novel_features(review):