In [1]:
import requests
import pandas as pd
import os
import re
import math
import numpy as np
from tqdm import tqdm
from difflib import SequenceMatcher
from bs4 import BeautifulSoup
from datetime import datetime
from collections import Counter
from natsort import natsorted

In [2]:
def dateparse(time_in_secs):
    return pd.to_datetime(time_in_secs, unit='s')

In [39]:
episodes = []
title=[]
types = []
popularity = []
description=[]
related=[]
release=[]
animeScore=[]
voices=[]
staff=[]
ranks=[]

with open("./anime_url.txt", "r", encoding = "utf-8") as f:
    urls = f.readlines()
# positioning
folder = r"./tsv_anime/"
# iter over the file
dtypes = {'animeTitle': 'str', 'animeType': 'str' }
for anime in tqdm(natsorted(os.listdir(folder))):
    df = pd.read_csv(folder+anime, sep = "\t", dtype=dtypes, parse_dates=['releaseDate','endDate'],date_parser=pd.to_datetime)   
    #print(df.info())
    #print(type(df["releaseDate"][0]))
    episodes.append(df["animeNumEpisode"][0])
    types.append(df["animeType"][0])
    popularity.append(df["animePopularity"][0])
    description.append(df["animeDescription"][0])
    title.append(df["animeTitle"][0])
    related.append(df["animeRelated"][0])
    release.append(df["releaseDate"][0])
    animeScore.append(df["animeScore"][0])
    voices.append(df["animeVoices"][0])
    staff.append(df["animeStaff"][0])
    ranks.append(df["animeRank"][0])
type(release[0])

100%|███████████████████████████████████████████████████████████████████████████| 19118/19118 [00:58<00:00, 325.20it/s]


pandas._libs.tslibs.timestamps.Timestamp

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   animeTitle        1 non-null      object        
 1   animeType         1 non-null      object        
 2   animeNumEpisode   1 non-null      int64         
 3   releaseDate       1 non-null      datetime64[ns]
 4   endDate           0 non-null      datetime64[ns]
 5   animeNumMembers   1 non-null      int64         
 6   animeScore        0 non-null      float64       
 7   animeUsers        0 non-null      float64       
 8   animeRank         0 non-null      float64       
 9   animePopularity   1 non-null      int64         
 10  animeDescription  1 non-null      object        
 11  animeRelated      1 non-null      object        
 12  animeCharacters   1 non-null      object        
 13  animeVoices       1 non-null      object        
 14  animeStaff        0 non-null  

We changed the datatype in these lists because there were nan elements inside them.

In [44]:
def change(l):
    for i in range(len(l)):
        if(type(l[i])!=str):
            l[i]=l[i].item()
            l[i]=str(l[i])

In [45]:
change(staff)
change(voices)

### Cosine function

We used "cosine" to determine how similar documents, regardless of their size, are to the query entered and correspondence, the similarity of texts to each other. We used this function to find similarities in these metrics: (Description, AnimeStaff, Voices,Type) We also used the "similar" function, showing an alternative way of finding the similarity and used this function to find the metric: Related

In [10]:
WORD = re.compile(r"\w+")

In [11]:
def cosine_text(input_text,comparison_text):
    if input_text == "":
        return 0
    else:
        intersection = set(input_text.keys()) & set(comparison_text.keys())
        numerator = sum([input_text[x] * comparison_text[x] for x in intersection])
        sum1 = sum([input_text[x] ** 2 for x in list(input_text.keys())])
        sum2 = sum([input_text[x] ** 2 for x in list(comparison_text.keys())])
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        else:
            return float(numerator) / denominator  

In [12]:
def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [13]:
def cosine(input_text,comparison_text):
    vector1 = text_to_vector(input_text)
    vector2 = text_to_vector(comparison_text)
    cosine = cosine_text(vector1, vector2)
    return(cosine)

In [55]:
def similar(a, b):
    if(a==""):
        return 0
    else:
        return SequenceMatcher(None, a, b).ratio()

### Num_Episode metric

For num_episode variable, we have classified the amount of anime into 4 categories:
0-is film,1 is short,2 is medium,3 is large.First we define the category of the input data and Calculated their differences. Depending on the number of episodes, we will be able to determine the greatest suitable anime by length


In [56]:
def identify_category(input_eps,next_eps):
    if(type(next_eps) == str):
        return 0
    else:
        #Input_eps
        if(input_eps==1):
            category1=0
        elif(input_eps<25 and input_eps>=8):
            category1=1
        elif(input_eps<65 and input_eps>=25):
            category1=2
        else:
            category=3
       #Next_eps 
        if(next_eps==1):
            category2=0
        elif(next_eps<25 and input_eps>=8):
            category2=1
        elif(next_eps<65 and input_eps>=25):
            category2=2
        else:
            category2=3
        return(category1,category2)

In [57]:
def episode_score(input_eps,next_eps):
    if(input_eps==0 or type(next_eps)==str):
        return 0
    else:
        category=(identify_category(input_eps,next_eps))
        if(category[0]==category[1]):
            return 1
        elif(abs(category[0]-category[1])==1):
            return 0.66
        elif(abs(category[0]-category[1])>1):
            return 0.33
        else:
            return 0

## Rank

The logic of ranking metric is that we push away from the request, if the query anime was given to us by the top 100 anime, we  are looking for a suitable one from the top 100 or from the top 1000

In [17]:
def ranking(input_rank,next_rank):
    if input_rank == 0:
        return 0
    else:
        diff=abs(input_rank-next_rank)
        if(diff<=100):
            return 1
        elif(diff>100 and diff<=1000):
            return 0.66
        else:
            return 0.33

### Date

We created a date metric to determine how late and newer the anime is with the query being compared.

In [18]:
def date(input_date,releaseDate):
    if(input_date==0 or input_date==""):
        return 0
    else:
        diff = (abs(datetime.strptime(input_date,'%Y-%m-%d')-releaseDate))
        days = diff.days
        #Anime came out in the same season
        if(days>0 and days<=100):
            return 1
        #Differences in 1 year
        elif(days>100 and days<=300):
            return 0.5
        return(0)

#print(date(date_query,release[0]))

## Popularity metric

The logic of this metric is similar to the rank metric.

In [19]:
def score_pop(input_pop,next_pop):
    if input_pop == 0:
        return 0
    else:
        diff=abs(input_pop-next_pop)
        if(input_pop<=100 and diff<=100):
            return 0.66
        elif(input_pop > 100 and input_pop <= 1000 and diff > 100):
            return 0.44
        else:
            return 0.22

## animeScore metric

We also categorized the anime by a rating, like metric number episodes:1 High,  2 Average, 3 Low

In [20]:
def identify_s(input_s,next_s):
    if(input_s>=8):
        category1=1
    elif(input_s<8 and input_s>=5):
        category1=2
    else:
        category1=3
       
    if(next_s>=8):
        category2=1
    elif(next_s<8 and next_s>=5):
        category2=2
    else:
        category2=3
    return(category1,category2)

In [21]:
def score_s(input_s,next_s):
    if input_s == 0:
        return 0
    else:
        category = identify_s(input_s,next_s)
        if(category[0]==category[1]):
            return 1
        elif(abs(category[0]-category[1])==1):
            return 0.66
        elif(abs(category[0]-category[1])>1):
            return 0.33
        else:
            return 0

### Final score metric

The final score is our final function that calculates the score. We have used all of the above functions to determine the total weight of each metric. For our convenience, we have divided all the metrics into two functions. In the first function, we have an unusual formula, since we have chosen such metrics as animeTitle and animeDescription as the most basic and especially influencing the function. For example the metric animeTitle is multiplied by 4, animeDescription is multiplied by 1.5. We multiplied such information as Voices, Staff, etc. by 0.3 since they do not have a very strong influence on the search.

In [58]:
def final_score1(i,input_type,next_type,input_eps,next_eps,input_title,next_title,input_related,next_related,input_description,next_description,input_date,next_date,input_voice,next_voice,input_staff,next_staff):
    #print(i)
    #Type variable
    #print(title[i])
    score1=cosine(input_type,next_type)
    #print("Type_score",score1)
    #Episodes variable
    score2=episode_score(input_eps,next_eps)
    #print("Episode_score",score2)
    #Title variable
    score3=cosine(input_title,next_title)
    #print("Title_score",score3)
    #Related anime variable
    score4=similar(input_related,next_related)
    #print("Related_score",score4)
    #Description variable
    score5=cosine(input_description,next_description)
    #print("Description_score",score5)
    #Season date variable
    score6=date(input_date,next_date)
    #print("Season_score",score6)
    #Voices variable
    score7=cosine(input_voice,next_voice)
    #print("Voice",score7)
    #Staff  variable
    score8=cosine(input_staff,next_staff)
    #print("Staff",score8)
    return(score1*0.5+score2*0.5+score3*4+score4+score5*1.5+score6*0.5+score7*0.3+score8*0.3)

In [59]:
def final_score2(input_rank,next_rank,input_pop,next_pop,input_s,next_s):
    #Rank
    score9 = ranking(input_rank,next_rank)
    #print("Rank",score9)
    #Popularity
    score10=score_pop(input_pop,next_pop)
    #print("Popular",score10)
    #animeScore
    score11=score_s(input_s,next_s)
    #print("animeScore",score11)
    return(score9+score10+score11)


We examined 3 cases of writing a request by a user. The first example shows us information in which the user made a request for only one attribute, in the second case we added several attributes to our request and we see how our function has become better to select and recommend anime. In the very last case, we wrote all the necessary information and see an excellent match.

In [54]:
title_query=""
type_query=""
ep_query=0
related_query=""
description_query="Naruto Shipudden Konoha Saske Orochimaru "
date_query=0
voices_query=""
staff_query=""
rank_query=0
pop_query=0
score_query=0
scores=[]

for i in range(len(episodes)):
    score1=final_score1(i,type_query,types[i],ep_query,episodes[i],title_query,title[i],related_query,related[i],description_query,description[i],date_query,release[0],voices_query,voices[i],staff_query,staff[i])
    score2=final_score2(rank_query,ranks[i],pop_query,popularity[i],score_query,animeScore[i])
    score=score1+score2
    scores.append([score,i])
sorted_scores=sorted(scores, reverse=True)


df_score=[]
df_title=[]
df_urls=[]
df_l=[]
for i in range(len(episodes)):
    df_score.append(sorted_scores[i][0])
    df_title.append(title[sorted_scores[i][1]])
    df_urls.append(urls[sorted_scores[i][1]])
    df_l.append(sorted_scores[i][1])
df1 = pd.DataFrame (df_score, columns = ['Score:'])
df2 = pd.DataFrame (df_title, columns = ['Title:'])
df3 = pd.DataFrame (df_urls, columns = ['URL:'])
df4 = pd.DataFrame (df_l, columns = ['Rank:'])
pdList = [df1, df2, df3,df4 ]  
new_df = pd.concat(pdList, axis=1)
display(new_df)

Unnamed: 0,Score:,Title:,URL:,Rank:
0,3.354102,Boruto: Naruto Next Generations,https://myanimelist.net/anime/56/Avenger\n,9054
1,3.354102,Naruto: Shippuuden Movie 6 - Road to Ninja,https://myanimelist.net/anime/13667/Naruto__Sh...,1124
2,2.683282,Juliet,https://myanimelist.net/anime/6392/Issho_ni_Tr...,11698
3,2.683282,Naruto: Shippuuden Movie 4 - The Lost Tower,https://myanimelist.net/anime/8246/Naruto__Shi...,1902
4,2.683282,Naruto,https://myanimelist.net/anime/20/Naruto\n,607
...,...,...,...,...
19113,0.000000,Fruits Basket: The Final,https://myanimelist.net/anime/42938/Fruits_Bas...,4
19114,0.000000,Steins;Gate,https://myanimelist.net/anime/9253/Steins_Gate\n,3
19115,0.000000,Shingeki no Kyojin Season 3 Part 2,https://myanimelist.net/anime/38524/Shingeki_n...,2
19116,0.000000,Gintama°,https://myanimelist.net/anime/28977/Gintama°\n,1


In [53]:
title_query="Death Note"
type_query="TV"
ep_query=51
related_query=""
description_query="Death Note L Kira"
date_query=""
voices_query=""
staff_query=""
rank_query=1
pop_query=0
score_query=9.21
scores=[]

for i in range(len(episodes)):
    score1=final_score1(i,type_query,types[i],ep_query,episodes[i],title_query,title[i],related_query,related[i],description_query,description[i],date_query,release[0],voices_query,voices[i],staff_query,staff[i])
    score2=final_score2(rank_query,ranks[i],pop_query,popularity[i],score_query,animeScore[i])
    score=score1+score2
    scores.append([score,i])
sorted_scores=sorted(scores, reverse=True)


df_score=[]
df_title=[]
df_urls=[]
df_l=[]
for i in range(len(episodes)):
    df_score.append(sorted_scores[i][0])
    df_title.append(title[sorted_scores[i][1]])
    df_urls.append(urls[sorted_scores[i][1]])
    df_l.append(sorted_scores[i][1])
df1 = pd.DataFrame (df_score, columns = ['Score:'])
df2 = pd.DataFrame (df_title, columns = ['Title:'])
df3 = pd.DataFrame (df_urls, columns = ['URL:'])
df4 = pd.DataFrame (df_l, columns = ['Rank:'])
pdList = [df1, df2, df3,df4 ]  
new_df = pd.concat(pdList, axis=1)
display(new_df)

Unnamed: 0,Score:,Title:,URL:,Rank:
0,10.897114,Death Note,https://myanimelist.net/anime/1535/Death_Note\n,64
1,7.918076,Death Note: Rewrite,https://myanimelist.net/anime/2994/Death_Note_...,1086
2,7.570000,Soul Eater,https://myanimelist.net/anime/3588/Soul_Eater\n,760
3,6.068427,Death Parade,https://myanimelist.net/anime/28223/Death_Para...,337
4,5.398427,Hinako Note,https://myanimelist.net/anime/33948/Hinako_Note\n,4764
...,...,...,...,...
19113,0.825000,Hana no Hanashi,https://myanimelist.net/anime/24905/Go-hiki_no...,11458
19114,0.825000,Megumi to Taiyou II: Kajuu Gummi Tweet Mystery...,https://myanimelist.net/anime/9337/Mayo_Elle_O...,11453
19115,0.825000,Sougiya to Inu,https://myanimelist.net/anime/38893/Screw\n,11433
19116,0.825000,Ishindenshin Shiyou,https://myanimelist.net/anime/21833/Higenashi_...,11424


In [52]:
title_query="Gintama"
type_query="TV"
ep_query=51
related_query="Gintama Movie 2: Kanketsu-hen - Yorozuya yo Eien Nare, Gintama': Enchousen"
description_query="Gintoki, Shinpachi, and Kagura return as the fun-loving but broke members of the Yorozuya team! Living in an alternate-reality Edo, where swords are prohibited and alien overlords have conquered Japan, they try to thrive on doing whatever work they can get their hands on. However, Shinpachi and Kagura still haven't been paid... Does Gin-chan really spend all that cash playing pachinko?Meanwhile, when Gintoki drunkenly staggers home one night, an alien spaceship crashes nearby. A fatally injured crew member emerges from the ship and gives Gintoki a strange, clock-shaped device, warning him that it is incredibly powerful and must be safeguarded. Mistaking it for his alarm clock, Gintoki proceeds to smash the device the next morning and suddenly discovers that the world outside his apartment has come to a standstill. With Kagura and Shinpachi at his side, he sets off to get the device fixed; though, as usual, nothing is ever that simple for the Yorozuya team."
date_query="2015-06-21"
voices_query="Sugita, Tomokazu Kugimiya, Rie Sakaguchi, Daisuke Ishida, Akira Nakai, Kazuya"
staff_query="Fujita, Youichi Director, Storyboard, Planning Miyawaki, Chizuru Director, Storyboard, Key Animation Takamatsu, ShinjiSound Director"
rank_query=2
pop_query=337
score_query=9.09
scores=[]

for i in range(len(episodes)):
    score1=final_score1(i,type_query,types[i],ep_query,episodes[i],title_query,title[i],related_query,related[i],description_query,description[i],date_query,release[0],voices_query,voices[i],staff_query,staff[i])
    score2=final_score2(rank_query,ranks[i],pop_query,popularity[i],score_query,animeScore[i])
    score=score1+score2
    scores.append([score,i])
sorted_scores=sorted(scores, reverse=True)


df_score=[]
df_title=[]
df_urls=[]
df_l=[]
for i in range(len(episodes)):
    df_score.append(sorted_scores[i][0])
    df_title.append(title[sorted_scores[i][1]])
    df_urls.append(urls[sorted_scores[i][1]])
    df_l.append(sorted_scores[i][1])
df1 = pd.DataFrame (df_score, columns = ['Score:'])
df2 = pd.DataFrame (df_title, columns = ['Title:'])
df3 = pd.DataFrame (df_urls, columns = ['URL:'])
df4 = pd.DataFrame (df_l, columns = ['Rank:'])
pdList = [df1, df2, df3,df4 ]  
new_df = pd.concat(pdList, axis=1)
display(new_df)

Unnamed: 0,Score:,Title:,URL:,Rank:
0,12.220339,Gintama: Yorinuki Gintama-san on Theater 2D,https://myanimelist.net/anime/21899/Gintama__Y...,136
1,9.910090,Gintama°,https://myanimelist.net/anime/28977/Gintama°\n,1
2,9.640334,Gintama',https://myanimelist.net/anime/9969/Gintama\n,5
3,9.486018,Gintama.: Shirogane no Tamashii-hen,https://myanimelist.net/anime/36838/Gintama__S...,24
4,9.304666,Gintama.,https://myanimelist.net/anime/34096/Gintama\n,10
...,...,...,...,...
19113,1.265000,Playground,https://myanimelist.net/anime/23053/Nisou_no_K...,11504
19114,1.265000,Kakegae no,https://myanimelist.net/anime/31831/Fantasy\n,11498
19115,1.265000,Fantasy,https://myanimelist.net/anime/31759/Hakubutsus...,11497
19116,1.265000,Hakubutsushi,https://myanimelist.net/anime/29395/Fafa_Movie\n,11496
