In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'myanimelist-novel-rating-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5002650%2F8547470%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240610%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240610T114625Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0fcae9c9fd9e431ba4ee54120003499cdae01aea452d0cd7ac596af7def2542e310a83176bd57f6a52246d7d7661c799d221658daf53ca3e519b329cdfc8a347a847b00a5e1a292f4f41817e34a6b1f877779d8f1d8937ae48f62ed9dc92a87962261acbf35bad53f9001d81e1643e4ba4e7f684905c8106f95fe4f6e2525252ae79d3b19244b07c5ac56717a7a2cba5aaedc4159cd2c7ca737ba03dac5f2bdf5c1b312eb11aa0278ebc98cb16d657971cfe801eda1f35e4593cf06093ae5a2b3c53af4b74922060a9c359b56729f674738e14b80a17bb0eb10474a50a058b7811cb2e483692f790fd86600ea09fe5c648dd4236caa4a3dfe939ff42e5bf70ea'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading myanimelist-novel-rating-dataset, 819455410 bytes compressed
Downloaded and uncompressed: myanimelist-novel-rating-dataset
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/myanimelist-novel-rating-dataset/interaction_all.csv
/kaggle/input/myanimelist-novel-rating-dataset/interaction_equal.csv
/kaggle/input/myanimelist-novel-rating-dataset/novels.csv
/kaggle/input/myanimelist-novel-rating-dataset/users.csv
/kaggle/input/myanimelist-novel-rating-dataset/interaction_ori.csv


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df1=pd.read_csv('/kaggle/input/myanimelist-novel-rating-dataset/interaction_ori.csv')
df2=pd.read_csv('/kaggle/input/myanimelist-novel-rating-dataset/interaction_all.csv')
df3=pd.read_csv('/kaggle/input/myanimelist-novel-rating-dataset/users.csv')
df4=pd.read_csv('/kaggle/input/myanimelist-novel-rating-dataset/interaction_equal.csv')
df5=pd.read_csv('/kaggle/input/myanimelist-novel-rating-dataset/novels.csv')




In [4]:
df1.head()

Unnamed: 0.1,Unnamed: 0,username,novel_id,interest
0,0,mourn,126479,1
1,1,ALEX_MV,126479,1
2,2,NordoN,126479,1
3,3,ED_9497,126479,1
4,4,Alex3730,126479,1


In [5]:
df2.head()

Unnamed: 0.1,Unnamed: 0,username,novel_id,interest
0,0,mourn,126479,1
1,1,ALEX_MV,126479,1
2,2,NordoN,126479,1
3,3,ED_9497,126479,1
4,4,Alex3730,126479,1


In [6]:
df3.head()

Unnamed: 0.1,Unnamed: 0,username,gender,age
0,0,mourn,Male,24
1,1,ALEX_MV,Male,0
2,2,NordoN,Unspecified,0
3,3,ED_9497,Unspecified,0
4,4,Alex3730,Unspecified,0


In [7]:
df4.head()

Unnamed: 0.1,Unnamed: 0,username,novel_id,interest
0,0,mourn,126479,1
1,1,ALEX_MV,126479,1
2,2,NordoN,126479,1
3,3,ED_9497,126479,1
4,4,Alex3730,126479,1


In [8]:
df5.head()

Unnamed: 0.1,Unnamed: 0,mal_id,title,title_eng,synopsis,type,n_chapters,authors,genres,n_volumes,status,score,scored_by,popularty,favorites,year_start,year_finnish,image
0,0,126479,Mikkakan no Koufuku,Three Days of Happiness,Kusunoki used to believe he was destined for g...,Novel,15.0,Miaki Sugaru,Drama,1.0,Finished,8.91,9855.0,583,1285,2013.0,,https://cdn.myanimelist.net/images/manga/3/248...
1,1,130826,Tian Guan Cifu,Heaven Official's Blessing: Tian Guan Ci Fu,"Born the crown prince of a prosperous kingdom,...",Novel,,Mo Xiang Tong Xiu,"Action,Adventure,Boys Love,Supernatural,Histor...",6.0,Finished,8.86,4004.0,2148,1090,2021.0,2022.0,https://cdn.myanimelist.net/images/manga/3/258...
2,2,35513,Koten-bu Series,,Houtarou Oreki is a high school student who li...,Novel,,Yonezawa Honobu,"Adventure,Mystery,School",,Publishing,8.78,3616.0,1010,663,2001.0,,https://cdn.myanimelist.net/images/manga/3/809...
3,3,117077,Kimi no Suizou wo Tabetai,I Want to Eat Your Pancreas,A high school boy finds the diary of his class...,Novel,10.0,Sumino Yoru,"Drama,Romance",1.0,Finished,8.74,3333.0,2655,401,2015.0,,https://cdn.myanimelist.net/images/manga/2/248...
4,4,143441,Omniscient Reader's Viewpoint,Omniscient Reader's Viewpoint,In the web novel Three Ways to Survive the Apo...,Novel,105.0,sing N song,"Action,Adventure,Fantasy",20.0,Finished,8.74,4328.0,1994,1168,2022.0,2023.0,https://cdn.myanimelist.net/images/manga/1/265...


In [9]:
merged_df = pd.merge(df3, df4, on='username')
merged_df.head()

Unnamed: 0,Unnamed: 0_x,username,gender,age,Unnamed: 0_y,novel_id,interest
0,0,mourn,Male,24,0,126479,1
1,0,mourn,Male,24,7671,81211,1
2,0,mourn,Male,24,14054,32735,1
3,0,mourn,Male,24,0,130826,0
4,0,mourn,Male,24,0,35513,0


In [10]:
merged_df=merged_df.drop(columns='Unnamed: 0_x')
merged_df.head()

Unnamed: 0,username,gender,age,Unnamed: 0_y,novel_id,interest
0,mourn,Male,24,0,126479,1
1,mourn,Male,24,7671,81211,1
2,mourn,Male,24,14054,32735,1
3,mourn,Male,24,0,130826,0
4,mourn,Male,24,0,35513,0


In [11]:
merged_df=merged_df.drop(columns='Unnamed: 0_y')
merged_df.head()

Unnamed: 0,username,gender,age,novel_id,interest
0,mourn,Male,24,126479,1
1,mourn,Male,24,81211,1
2,mourn,Male,24,32735,1
3,mourn,Male,24,130826,0
4,mourn,Male,24,35513,0


In [12]:
top_novel = df5[["genres"]]
top_novel.head()


Unnamed: 0,genres
0,Drama
1,"Action,Adventure,Boys Love,Supernatural,Histor..."
2,"Adventure,Mystery,School"
3,"Drama,Romance"
4,"Action,Adventure,Fantasy"


In [13]:
df5=df5.rename(columns={'mal_id': 'novel_id'})
df5.head()

Unnamed: 0.1,Unnamed: 0,novel_id,title,title_eng,synopsis,type,n_chapters,authors,genres,n_volumes,status,score,scored_by,popularty,favorites,year_start,year_finnish,image
0,0,126479,Mikkakan no Koufuku,Three Days of Happiness,Kusunoki used to believe he was destined for g...,Novel,15.0,Miaki Sugaru,Drama,1.0,Finished,8.91,9855.0,583,1285,2013.0,,https://cdn.myanimelist.net/images/manga/3/248...
1,1,130826,Tian Guan Cifu,Heaven Official's Blessing: Tian Guan Ci Fu,"Born the crown prince of a prosperous kingdom,...",Novel,,Mo Xiang Tong Xiu,"Action,Adventure,Boys Love,Supernatural,Histor...",6.0,Finished,8.86,4004.0,2148,1090,2021.0,2022.0,https://cdn.myanimelist.net/images/manga/3/258...
2,2,35513,Koten-bu Series,,Houtarou Oreki is a high school student who li...,Novel,,Yonezawa Honobu,"Adventure,Mystery,School",,Publishing,8.78,3616.0,1010,663,2001.0,,https://cdn.myanimelist.net/images/manga/3/809...
3,3,117077,Kimi no Suizou wo Tabetai,I Want to Eat Your Pancreas,A high school boy finds the diary of his class...,Novel,10.0,Sumino Yoru,"Drama,Romance",1.0,Finished,8.74,3333.0,2655,401,2015.0,,https://cdn.myanimelist.net/images/manga/2/248...
4,4,143441,Omniscient Reader's Viewpoint,Omniscient Reader's Viewpoint,In the web novel Three Ways to Survive the Apo...,Novel,105.0,sing N song,"Action,Adventure,Fantasy",20.0,Finished,8.74,4328.0,1994,1168,2022.0,2023.0,https://cdn.myanimelist.net/images/manga/1/265...


In [14]:
import dask.dataframe as dd
df5_dask = dd.from_pandas(df5, npartitions=5)
merged_df_dask = dd.from_pandas(merged_df, npartitions=5)

df5_selected_dask = df5_dask[['novel_id', 'title','genres']]
merged_df2_dask = merged_df_dask.merge(df5_selected_dask, on='novel_id', how='left')
merged_df2 = merged_df2_dask.compute()
print(merged_df2)

            username       gender  age  novel_id  interest  \
0              mourn         Male   24     35513         0   
1            ALEX_MV         Male    0     35513         0   
2             NordoN  Unspecified    0     35513         0   
3            ED_9497  Unspecified    0     35513         0   
4           Alex3730  Unspecified    0     70261         1   
...              ...          ...  ...       ...       ...   
53250   HawaiToastX3  Unspecified    0    169340         1   
53251        Foe_nem  Unspecified    0    169340         1   
53252         HM0710  Unspecified    0    169340         1   
53253  timewastelist  Unspecified    0    169340         1   
53254         OMURAA  Unspecified    0    169340         1   

                                          title                        genres  
0                               Koten-bu Series      Adventure,Mystery,School  
1                               Koten-bu Series      Adventure,Mystery,School  
2              

In [15]:
data_pivot_temp = merged_df2.pivot_table(index="title",columns="username",values="interest").fillna(0)
data_pivot_temp.head()

username,----2o,--Chinoda--,--Magma--,--Sunclaudius,--Tsubasa--,--sayori--,-0-haivyx-0-,-AJ-,-Abnormal-,-Absynthe-,...,zxtread,zxv_alsz,zygon14,zylars,zymbus_297,zynix,zyxel--,zzz,zzz-anime,zzzzzzzBen
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Bungaku Shoujo"" Minarai",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Bungaku Shoujo"" Series",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Fushigi"" Toriatsukaimasu: Tsukumodou Kottouten",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(Aruiwa) SF no Aru Fuukei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//AI Buster,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
merged_df2['genres'] = merged_df2['genres'].fillna('')
unique_genres = set(genre for genres_list in merged_df2['genres'] for genre in genres_list)
for genre in unique_genres:
    merged_df2[genre] = merged_df2['genres'].apply(lambda x: 1 if genre in x else 0)

merged_df2.head()

Unnamed: 0,username,gender,age,novel_id,interest,title,genres,T,G,S,...,",",b,W,F,d,o,n,L,l,k
0,mourn,Male,24,35513,0,Koten-bu Series,"Adventure,Mystery,School",0,0,1,...,1,0,0,0,1,1,1,0,1,0
1,ALEX_MV,Male,0,35513,0,Koten-bu Series,"Adventure,Mystery,School",0,0,1,...,1,0,0,0,1,1,1,0,1,0
2,NordoN,Unspecified,0,35513,0,Koten-bu Series,"Adventure,Mystery,School",0,0,1,...,1,0,0,0,1,1,1,0,1,0
3,ED_9497,Unspecified,0,35513,0,Koten-bu Series,"Adventure,Mystery,School",0,0,1,...,1,0,0,0,1,1,1,0,1,0
4,Alex3730,Unspecified,0,70261,1,Mushoku Tensei: Isekai Ittara Honki Dasu,"Fantasy,Isekai,Reincarnation",0,0,0,...,1,0,0,1,0,1,1,0,0,1


In [17]:
merged_df2.drop(columns=['genres'], inplace=True)
merged_df2.head()

Unnamed: 0,username,gender,age,novel_id,interest,title,T,G,S,B,...,",",b,W,F,d,o,n,L,l,k
0,mourn,Male,24,35513,0,Koten-bu Series,0,0,1,0,...,1,0,0,0,1,1,1,0,1,0
1,ALEX_MV,Male,0,35513,0,Koten-bu Series,0,0,1,0,...,1,0,0,0,1,1,1,0,1,0
2,NordoN,Unspecified,0,35513,0,Koten-bu Series,0,0,1,0,...,1,0,0,0,1,1,1,0,1,0
3,ED_9497,Unspecified,0,35513,0,Koten-bu Series,0,0,1,0,...,1,0,0,0,1,1,1,0,1,0
4,Alex3730,Unspecified,0,70261,1,Mushoku Tensei: Isekai Ittara Honki Dasu,0,0,0,0,...,1,0,0,1,0,1,1,0,0,1


In [18]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

data_matrix = csr_matrix(data_pivot_temp.values)

model_knn = NearestNeighbors(metric = "cosine", algorithm = "brute")
model_knn.fit(data_matrix)

query_no = np.random.choice(data_pivot_temp.shape[0]) # random anime title and finding recommendation
print(f"rekomendasi  {query_no} novel {data_pivot_temp.index[query_no]}.")
distances, indices = model_knn.kneighbors(data_pivot_temp.iloc[query_no,:].values.reshape(1, -1), n_neighbors = 6)

recommended_indices = indices.flatten()
recommended_titles = [data_pivot_temp.index[idx] for idx in recommended_indices]
print("Rekomendasi novel:")
for title in recommended_titles:
    print(title)

rekomendasi  75 novel Alice in Gothicland.
Rekomendasi novel:
Alice in Gothicland
Anata ga Hoshii: Oukan to Aimitsu no Hanayome
Hard Days Nights
Daifugou no Kyuukon: Toshi no Sa Cinderella
Daitoubou: Onna Kishi no Koi no Junan
Furachi na Kyuuai: Taikutsuou no Okisakierabi


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents="unicode", analyzer="word",
                      token_pattern=r"\w{1,}", ngram_range=(1, 3), stop_words = "english")

rec_data = df5.copy()
rec_data.drop_duplicates(subset="title", keep="first", inplace=True)
rec_data.reset_index(drop=True, inplace=True)
genres = rec_data["genres"].str.split(", | , | ,").astype(str)
tfv_matrix = tfv.fit_transform(genres)

In [20]:
from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

rec_indices = pd.Series(rec_data.index, index=rec_data["title"]).drop_duplicates()

def give_recommendation(title, sig=sig):
    idx = rec_indices[title]

    sig_score = list(enumerate(sig[idx]))

    sig_score = sorted(sig_score, key=lambda x: x[1], reverse=True)

    sig_score = sig_score[1:11]
    novel_indices = [i[0] for i in sig_score]

    rec_dic = {"No": range(1, 11),
               "Nama novel": rec_data["title"].iloc[novel_indices].values}
    dataframe = pd.DataFrame(data=rec_dic)
    dataframe.set_index("No", inplace=True)

    print(f"rekomendasi novel {title}:\n")

    return dataframe

In [21]:
title = "rekomendasi novel untukmu"
recommendations = give_recommendation('Mushoku Tensei: Isekai Ittara Honki Dasu')
print(recommendations)

rekomendasi novel Mushoku Tensei: Isekai Ittara Honki Dasu:

                                           Nama novel
No                                                   
1            Mushoku Tensei: Isekai Ittara Honki Dasu
2   Honzuki no Gekokujou: Shisho ni Naru Tame ni w...
3                      Tensei shitara Slime Datta Ken
4   Honzuki no Gekokujou: Shisho ni Naru Tame ni w...
5   Maou ni Natta node, Dungeon Tsukutte Jingai Mu...
6       Death March kara Hajimaru Isekai Kyousoukyoku
7   Seija Musou: Salaryman, Isekai de Ikinokoru Ta...
8   Jidou Hanbaiki ni Umarekawatta Ore wa Meikyuu ...
9                       Potion-danomi de Ikinobimasu!
10                   Tensei shichatta yo (Iya, Gomen)


In [22]:
title = "rekomendasi novel untukmu"
recommendations = give_recommendation('Shousetsu Sousou no Frieren: Zensou')
print(recommendations)

rekomendasi novel Shousetsu Sousou no Frieren: Zensou:

                                           Nama novel
No                                                   
1                     Zero kara Hajimeru Mahou no Sho
2                   Kingdom Hearts: Chain of Memories
3   Hitotsu no Tairiku no Monogatari: Allison to W...
4                       Apocripha/0: Infinite Fortune
5                 Shousetsu Sousou no Frieren: Zensou
6                                   Magdala de Nemure
7                          Kingdom Hearts: 358/2 Days
8              Isekai Meikyuu no Saishinbu wo Mezasou
9                                    Lillia to Treize
10                         Boku no Aishita Siegfriede
