In [1]:
# Data handling
import pandas as pd
import numpy as np
 
# Text preprocessing: word tokenizer, English stopword list and stemmer
import nltk
tokenizer = nltk.RegexpTokenizer(r"\w+")  # tokens are runs of word characters, so punctuation is dropped
from nltk.corpus import stopwords
nltk.download('stopwords')  # makes sure the stopword corpus is available locally
from nltk.stem import PorterStemmer 
ps = PorterStemmer()  # shared stemmer instance used by clean() and clean_fast()

# Containers for the index dictionaries and their on-disk persistence
from collections import defaultdict
import pickle

from tqdm import tqdm # monitoring progress

import time
from joblib import Parallel, delayed # parallel processing

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the anime table; its 'Description' column is the text that gets tokenized and indexed below.
dataset = pd.read_csv('file.csv')
dataset.head()

Unnamed: 0,Title,Type,Episodes,Release date,End date,Members,Score,Users,Rank,Popularity,Description,Related,Characters,Voices,Staff
0,Fullmetal Alchemist: Brotherhood,TV,64,2009-04-05,2010-07-04 00:00:00,2676639,9.16,1622384,1,3,"[""After a horrific alchemy experiment goes wro...","['Adaptation: Fullmetal Alchemist', 'Alternati...","['Elric, Edward', 'Elric, Alphonse', 'Mustang,...","['Park, Romi', 'Kugimiya, Rie', 'Miki, Shinich...","[['Cook, Justin', 'Producer'], ['Yonai, Norito..."
1,Gintama: The Final,Movie,1,2021-01-08,-,79486,9.0,29979,10,1924,"['New ', <i>Gintama</i>, ' movie.']","['Adaptation: Gintama', 'Prequel: Gintama.: Sh...","['Sakata, Gintoki', 'Kagura', 'Shimura, Shinpa...","['Sugita, Tomokazu', 'Ishida, Akira', 'Hino, S...","[['Fujita, Youichi', 'Director'], ['Miyawaki, ..."
2,Gintama.,TV,12,2017-01-09,2017-03-27 00:00:00,246290,8.98,108581,11,726,"[""After joining the resistance against the bak...","['Adaptation: Gintama', 'Prequel: Gintama°', '...","['Sakata, Gintoki', 'Kagura', 'Katsura, Kotaro...","['Sugita, Tomokazu', 'Kugimiya, Rie', 'Ishida,...","[['Fujita, Youichi', 'Director'], ['Miyawaki, ..."
3,3-gatsu no Lion 2nd Season,TV,22,2017-10-14,2018-03-31 00:00:00,324393,8.97,155163,12,529,"['Now in his second year of high school, Rei K...","['Adaptation: 3-gatsu no Lion', 'Prequel: 3-ga...","['Kiriyama, Rei', 'Kawamoto, Hinata', 'Kawamot...","['Kawanishi, Kengo', 'Hanazawa, Kana', 'Kayano...","[['Shinbou, Akiyuki', 'Director, Series Compos..."
4,Koe no Katachi,Movie,1,2016-09-17,-,1780070,8.97,1208990,13,23,"['As a wild youth, elementary school student S...","['Adaptation: Koe no Katachi', 'Other: Koe no ...","['Nishimiya, Shouko', 'Ishida, Shouya', 'Nishi...","['Hayami, Saori', 'Irino, Miyu', 'Yuuki, Aoi',...","[['Senami, Riri', 'Assistant Producer'], ['Yam..."


In [3]:
# (rows, columns) of the loaded table
dataset.shape

(19053, 15)

### Cleaning

In [4]:
def tokenize(description):
    """Lower-case a description and split it into word tokens.

    input: string
    output: list of the tokens found in the string, in order of appearance

    Tokenization is delegated to the module-level ``tokenizer`` (a regexp
    tokenizer matching runs of word characters), so punctuation is removed
    as part of the same step.
    """
    return tokenizer.tokenize(str.lower(description))

In [5]:
def clean(tok_descr):
    """Remove stopwords, stem, and drop one-character leftovers.

    input: list of tokenized words (as returned by ``tokenize``)
    output: list of cleaned, stemmed words; order and repetitions of the
            surviving words are preserved
    """
    # Build the stopword set once per call. Calling stopwords.words('english')
    # inside the comprehension re-created the list and scanned it linearly
    # for every single token.
    stop_words = set(stopwords.words('english'))

    # We remove stopwords from the tokenized description
    no_stop_descr = [word for word in tok_descr if word not in stop_words]

    # We carry out stemming
    stem_descr = [ps.stem(word) for word in no_stop_descr]

    # We remove isolated characters (checked after stemming, as before)
    return [stem for stem in stem_descr if len(stem) > 1]

In [6]:
def clean_fast(tok_descr):
    """Set-based variant of ``clean``.

    input: list of tokenized words (as returned by ``tokenize``)
    output: list of distinct cleaned, stemmed words

    Please note: because the work is done on sets, words repeated within the
    same description appear only once in the result, and the order of the
    returned list is whatever the final set yields.
    """
    unique_tokens = set(tok_descr)

    # Stopwords that actually occur in this description ...
    stop_hits = unique_tokens & set(stopwords.words('english'))
    # ... are taken out of the distinct tokens
    surviving = list(unique_tokens - stop_hits)

    # Stemming
    stems = [ps.stem(token) for token in surviving]

    # Drop isolated characters and collapse stems shared by several tokens
    return list(set(stem for stem in stems if len(stem) > 1))

#### Example

In [7]:
# Raw description text of the first row, before any preprocessing
dataset['Description'][0]

'["After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse\'s body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse\'s soul in the physical realm by binding it to a hulking suit of armor.", \'\\n\', \'\\n\', \'\\n\', \'\\r\\nThe brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering prodigy, Winry creates prosthetic limbs for Edward by utilizing "automail," a tough, versatile metal used in robots and combat armor. After years of training, the Elric brothers set off on a quest to restore their bodies by locating the Philosopher\\\'s Stone—a powerful gem that allows an alchemist to defy the traditional laws of Equivalent

In [8]:
# Same description after tokenizing, stopword removal and stemming (repetitions are kept by clean)
clean(tokenize(dataset['Description'][0]))

['horrif',
 'alchemi',
 'experi',
 'goe',
 'wrong',
 'elric',
 'household',
 'brother',
 'edward',
 'alphons',
 'left',
 'catastroph',
 'new',
 'realiti',
 'ignor',
 'alchem',
 'principl',
 'ban',
 'human',
 'transmut',
 'boy',
 'attempt',
 'bring',
 'recent',
 'deceas',
 'mother',
 'back',
 'life',
 'instead',
 'suffer',
 'brutal',
 'person',
 'loss',
 'alphons',
 'bodi',
 'disintegr',
 'edward',
 'lost',
 'leg',
 'sacrif',
 'arm',
 'keep',
 'alphons',
 'soul',
 'physic',
 'realm',
 'bind',
 'hulk',
 'suit',
 'armor',
 'nthe',
 'brother',
 'rescu',
 'neighbor',
 'pinako',
 'rockbel',
 'granddaught',
 'winri',
 'known',
 'bio',
 'mechan',
 'engin',
 'prodigi',
 'winri',
 'creat',
 'prosthet',
 'limb',
 'edward',
 'util',
 'automail',
 'tough',
 'versatil',
 'metal',
 'use',
 'robot',
 'combat',
 'armor',
 'year',
 'train',
 'elric',
 'brother',
 'set',
 'quest',
 'restor',
 'bodi',
 'locat',
 'philosoph',
 'stone',
 'power',
 'gem',
 'allow',
 'alchemist',
 'defi',
 'tradit',
 'law',
 

### Dictionaries

- The first dictionary, <code>word_2_id</code>, maps each cleaned (stemmed) word to an integer identifier.

- The inverted index, <code>id_2_anime</code>, maps each word identifier to the list of row indexes (in the main dataset) of the anime whose cleaned description contains that word.

In [9]:
def dictionaries(dataset):
    """Build the vocabulary and the inverted index, and save both as pickle files.

    input: anime dataframe with a 'Description' column of strings
    output 1: word_2_id, mapping each cleaned word to an integer identifier
    output 2: id_2_anime, the inverted index mapping each word identifier to
              the list of row positions (0-based, in dataset order) of the
              anime whose cleaned description contains that word

    Side effects: writes "word2id.pkl" and "id2anime.pkl" in the current
    working directory, overwriting existing files.
    """
    # defaultdict() without a factory behaves like a plain dict; the type is
    # kept so the returned and pickled objects are the same kind as before.
    word_2_id = defaultdict()
    # Placeholder entry that reserves id 0, so real words get ids from 1 on.
    # It never gets a posting list: clean_fast drops one-character words.
    word_2_id['a'] = 0

    id_2_anime = defaultdict()

    descriptions = dataset['Description']

    # Iterate by position rather than by label lookup (dataset['Description'][i]),
    # so the function also works when the frame does not carry a default
    # 0..n-1 index; with a default index the result is the same.
    for position, description in tqdm(enumerate(descriptions), total=len(descriptions)):

        # clean_fast returns each word at most once per description, so a
        # position is appended to a given posting list at most once.
        for word in clean_fast(tokenize(description)):

            # Ids are handed out consecutively starting from the placeholder's 0,
            # so the next free id is the current vocabulary size. This replaces
            # copying all keys into a list just to read the last one.
            word_id = word_2_id.setdefault(word, len(word_2_id))

            id_2_anime.setdefault(word_id, []).append(position)

    # We save dictionaries as pkl; the context managers close the files even
    # if pickling fails.
    with open("word2id.pkl", "wb") as word_2_id_file:
        pickle.dump(word_2_id, word_2_id_file)

    with open("id2anime.pkl", "wb") as id_2_anime_file:
        pickle.dump(id_2_anime, id_2_anime_file)

    return word_2_id, id_2_anime

In [10]:
# Build both dictionaries over the whole dataset (this also writes the two .pkl files)
word_2_id0, id_2_anime0 = dictionaries(dataset)

100%|███████████████████████████████████████████████████████████████████████████| 19053/19053 [00:21<00:00, 875.86it/s]


In [11]:
# Vocabulary size, including the placeholder entry 'a' -> 0
len(word_2_id0)

39599

In [12]:
# Sanity check: every word except the placeholder 'a' -> 0 (the initialization value) has a posting list
len(id_2_anime0) == len(word_2_id0) - 1 # -1 for the initialization value

True

### Extended dataframe

In [6]:
# Tokenize every description in parallel on all available cores (n_jobs=-1)
dataset["tok_description"] = Parallel(n_jobs=-1, verbose=3)(
    delayed(tokenize)(description) for description in dataset["Description"]
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    2.2s


2.5978 s


[Parallel(n_jobs=-1)]: Done 17984 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 19053 out of 19053 | elapsed:    2.5s finished


In [20]:
# Tokenize each description with a progress bar.
# Applied column-wise on the 'Description' Series: only that column is read,
# so there is no need to materialise every row with DataFrame.apply(axis=1).
tqdm.pandas()
dataset['tok_description'] = dataset['Description'].progress_apply(tokenize)

100%|█████████████████████████████████████████████████████████████████████████| 19053/19053 [00:00<00:00, 41421.45it/s]


In [21]:
# Clean each token list (stopword removal, stemming) with a progress bar.
# Applied column-wise on 'tok_description', the only column clean() needs.
tqdm.pandas()
dataset['clean_description'] = dataset['tok_description'].progress_apply(clean)

100%|████████████████████████████████████████████████████████████████████████████| 19053/19053 [03:26<00:00, 92.21it/s]


In [22]:
# Write the current dataset frame to CSV without the index column.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative to the
# notebook (as done for the .pkl files) so the cell runs on other machines.
dataset.to_csv (r'C:\Users\anton\Desktop\ADM\Homework3\ext_df.csv', index = False)

Faster cleaning with `clean_fast` (note: words repeated within a description are kept only once).

In [24]:
# Set-based cleaning of each token list with a progress bar.
# Applied column-wise on 'tok_description', the only column clean_fast() needs.
tqdm.pandas()
dataset['clfast_description'] = dataset['tok_description'].progress_apply(clean_fast)

100%|██████████████████████████████████████████████████████████████████████████| 19053/19053 [00:14<00:00, 1274.63it/s]


### Search engine

In [9]:
def search_engine(query):
    """Conjunctive search over the anime descriptions.

    input: query as string
    output: sorted list of row indexes (anime dataframe) of the anime whose
            cleaned description contains all the words in the query

    Returns an empty list when the query has no searchable word left after
    cleaning (e.g. only stopwords), or when at least one query word does not
    occur in any description. Previously these cases raised TypeError and
    KeyError respectively.
    """
    # We load dictionaries.
    # NOTE: pickle.load can execute arbitrary code; only load index files
    # written by dictionaries() on a trusted machine.
    with open("word2id.pkl", "rb") as word_2_id_file:
        word_2_id = pickle.load(word_2_id_file)
    with open("id2anime.pkl", "rb") as id_2_anime_file:
        id_2_anime = pickle.load(id_2_anime_file)

    # We filter the query with the same tokenize/clean pipeline used for the
    # descriptions and remove duplicates
    query_terms = set(clean(tokenize(query)))

    if not query_terms:
        # set.intersection() needs at least one set; nothing to search for
        return []

    posting_sets = []

    for term in query_terms:
        postings = id_2_anime.get(word_2_id.get(term))
        if postings is None:
            # A word unknown to the index cannot be contained in any
            # description, so the intersection is necessarily empty
            return []
        posting_sets.append(set(postings))

    return sorted(set.intersection(*posting_sets))

In [10]:
# Example query: row indexes of the anime whose description contains both query words
q = search_engine("saiyan race")
q

[6185, 11167, 17967, 18503]

In [11]:
# Result table for the query: title and description of the matching rows, plus
# an (empty) 'Url' column. Columns are selected by name instead of by the
# positions 0 and 10, and the frame is built in one chain so no column is
# assigned on a slice of `dataset` (avoids SettingWithCopyWarning).
dfq = (
    dataset.iloc[q][['Title', 'Description']]
    .reset_index(drop=True)
    .assign(Url='')
)
dfq

Unnamed: 0,Title,Description,Url
0,Dragon Ball Kai,"[""Five years after the events of Dragon Ball, ...",
1,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"[""Bardock, Son Goku's father, is a low-ranking...",
2,Dragon Ball Z,"[""Five years after winning the World Martial A...",
3,Dragon Ball Super: Broly,"[""Forty-one years ago on Planet Vegeta, home o...",
