In [1]:
# --- Standard library ---
import json
from collections import defaultdict

# --- Third-party ---
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the stopword corpus used by the cleaning step
# (NLTK reports "already up-to-date" when it is present locally).
nltk.download('stopwords')

# Shared text-processing helpers used by the cleaning function below.
tokenizer = nltk.RegexpTokenizer(r"\w+")  # keeps runs of word characters, which drops punctuation
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the dataset from test.csv; the relative path is resolved against the kernel's working directory.
dataset = pd.read_csv('test.csv')

In [3]:
# Summarise the column names, dtypes and non-null counts of the loaded table.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         30 non-null     object 
 1   Type          30 non-null     object 
 2   Episodes      30 non-null     int64  
 3   Release date  30 non-null     object 
 4   End date      30 non-null     object 
 5   Members       30 non-null     int64  
 6   Score         30 non-null     float64
 7   Users         30 non-null     int64  
 8   Rank          30 non-null     int64  
 9   Popularity    30 non-null     int64  
 10  Description   30 non-null     object 
 11  Related       30 non-null     object 
 12  Characters    30 non-null     object 
 13  Voices        30 non-null     object 
 14  Staff         29 non-null     object 
dtypes: float64(1), int64(5), object(9)
memory usage: 3.6+ KB


In [4]:
# Preview the first rows of the table.
dataset.head()

Unnamed: 0,Title,Type,Episodes,Release date,End date,Members,Score,Users,Rank,Popularity,Description,Related,Characters,Voices,Staff
0,Fullmetal Alchemist: Brotherhood,TV,64,2009-04-05,2010-07-04 00:00:00,2676639,9.16,1622384,1,3,"[""After a horrific alchemy experiment goes wro...","['Adaptation: Fullmetal Alchemist', 'Alternati...","['Elric, Edward', 'Elric, Alphonse', 'Mustang,...","['Park, Romi', 'Kugimiya, Rie', 'Miki, Shinich...","[['Cook, Justin', 'Producer'], ['Yonai, Norito..."
1,Gintama: The Final,Movie,1,2021-01-08,-,79486,9.0,29979,10,1924,"['New ', <i>Gintama</i>, ' movie.']","['Adaptation: Gintama', 'Prequel: Gintama.: Sh...","['Sakata, Gintoki', 'Kagura', 'Shimura, Shinpa...","['Sugita, Tomokazu', 'Ishida, Akira', 'Hino, S...","[['Fujita, Youichi', 'Director'], ['Miyawaki, ..."
2,Gintama.,TV,12,2017-01-09,2017-03-27 00:00:00,246290,8.98,108581,11,726,"[""After joining the resistance against the bak...","['Adaptation: Gintama', 'Prequel: Gintama°', '...","['Sakata, Gintoki', 'Kagura', 'Katsura, Kotaro...","['Sugita, Tomokazu', 'Kugimiya, Rie', 'Ishida,...","[['Fujita, Youichi', 'Director'], ['Miyawaki, ..."
3,3-gatsu no Lion 2nd Season,TV,22,2017-10-14,2018-03-31 00:00:00,324393,8.97,155163,12,529,"['Now in his second year of high school, Rei K...","['Adaptation: 3-gatsu no Lion', 'Prequel: 3-ga...","['Kiriyama, Rei', 'Kawamoto, Hinata', 'Kawamot...","['Kawanishi, Kengo', 'Hanazawa, Kana', 'Kayano...","[['Shinbou, Akiyuki', 'Director, Series Compos..."
4,Koe no Katachi,Movie,1,2016-09-17,-,1780070,8.97,1208990,13,23,"['As a wild youth, elementary school student S...","['Adaptation: Koe no Katachi', 'Other: Koe no ...","['Nishimiya, Shouko', 'Ishida, Shouya', 'Nishi...","['Hayami, Saori', 'Irino, Miyu', 'Yuuki, Aoi',...","[['Senami, Riri', 'Assistant Producer'], ['Yam..."


### Cleaning

In [5]:
# Stopword sets keyed by language (None = every language NLTK provides).
# Filled lazily so the corpus files are read once instead of once per token.
_STOPWORD_SETS = {}


def _stopword_set(language=None):
    """Return the cached frozenset of stopwords for `language` (None = all languages)."""
    if language not in _STOPWORD_SETS:
        words = stopwords.words() if language is None else stopwords.words(language)
        _STOPWORD_SETS[language] = frozenset(words)
    return _STOPWORD_SETS[language]


def tokenizeandclean(description, language=None):
    """Turn a raw description (or a search query) into a list of cleaned stems.

    Steps, in order: lowercase, tokenize on runs of word characters (which
    removes punctuation), drop stopwords, Porter-stem, and finally drop
    tokens of length 1.

    Parameters
    ----------
    description : str
        Text to clean. The same function is meant to be applied to queries.
    language : str or None, optional
        Name of a single NLTK stopword list (e.g. ``'english'``). The default
        ``None`` keeps the original behaviour of filtering against the
        stopwords of every language in the NLTK corpus.

    Returns
    -------
    list of str
        Cleaned stems in their original order; duplicates are kept.

    Raises
    ------
    TypeError
        If `description` is not a ``str`` (raised by ``str.lower``).
    """
    low_descr = str.lower(description)

    # Tokenize the description; the regexp tokenizer also removes punctuation.
    # (Alternative: nltk.word_tokenize followed by a str.isalnum filter,
    # which needs the "punkt" resource.)
    tok_descr = tokenizer.tokenize(low_descr)

    # Remove stopwords. The set is built once and cached: calling
    # stopwords.words() inside the comprehension re-read the corpus and did a
    # linear list scan for every single token.
    stop_set = _stopword_set(language)
    no_stop_descr = [word for word in tok_descr if word not in stop_set]

    # Stemming
    stem_descr = [ps.stem(word) for word in no_stop_descr]

    # Remove isolated characters
    final_descr = [word for word in stem_descr if len(word) > 1]

    return final_descr

#### Example: cleaning the first description

In [6]:
# Raw description of the first row, before any cleaning.
dataset.loc[0, 'Description']

'["After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse\'s body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse\'s soul in the physical realm by binding it to a hulking suit of armor.", \'\\n\', \'\\n\', \'\\n\', \'\\r\\nThe brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering prodigy, Winry creates prosthetic limbs for Edward by utilizing "automail," a tough, versatile metal used in robots and combat armor. After years of training, the Elric brothers set off on a quest to restore their bodies by locating the Philosopher\\\'s Stone—a powerful gem that allows an alchemist to defy the traditional laws of Equivalent

In [7]:
# Same description after tokenizing, stopword removal and stemming.
tokenizeandclean(dataset.loc[0, 'Description'])

['horrif',
 'alchemi',
 'experi',
 'goe',
 'wrong',
 'elric',
 'household',
 'brother',
 'edward',
 'alphons',
 'left',
 'catastroph',
 'new',
 'realiti',
 'ignor',
 'alchem',
 'principl',
 'ban',
 'human',
 'transmut',
 'boy',
 'attempt',
 'bring',
 'recent',
 'deceas',
 'mother',
 'back',
 'life',
 'instead',
 'suffer',
 'brutal',
 'person',
 'loss',
 'alphons',
 'bodi',
 'disintegr',
 'edward',
 'lost',
 'leg',
 'sacrif',
 'arm',
 'keep',
 'alphons',
 'soul',
 'physic',
 'realm',
 'bind',
 'hulk',
 'suit',
 'armor',
 'nthe',
 'brother',
 'rescu',
 'neighbor',
 'pinako',
 'rockbel',
 'granddaught',
 'winri',
 'known',
 'bio',
 'mechan',
 'engin',
 'prodigi',
 'winri',
 'creat',
 'prosthet',
 'limb',
 'edward',
 'util',
 'automail',
 'tough',
 'versatil',
 'metal',
 'use',
 'robot',
 'combat',
 'armor',
 'year',
 'train',
 'elric',
 'brother',
 'set',
 'quest',
 'restor',
 'bodi',
 'locat',
 'philosoph',
 'stone',
 'power',
 'gem',
 'allow',
 'alchemist',
 'defi',
 'tradit',
 'law',
 

### Dictionaries

- The first dictionary, <code>word_2_id</code>, maps each cleaned (stemmed) word to an integer word identifier.

- The inverted index, <code>id_2_anime</code>, maps each word identifier to the list of row indexes (in the main dataset) of the anime whose cleaned description contains that word.

#### Inverted Index with occurrences

In [16]:
def dictionaries(dataset):
    """Build the vocabulary and the inverted index *with* occurrences.

    Every token returned by ``tokenizeandclean`` for a description contributes
    one entry, so a row index appears in a posting list once per occurrence of
    the word in that description.

    NOTE: a later cell in this notebook defines another function with the same
    name (the without-occurrences variant); whichever cell ran last is the one
    bound to ``dictionaries``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have a ``'Description'`` column that can be looked up with the
        integers ``0 .. len(dataset) - 1`` (e.g. the default RangeIndex).

    Returns
    -------
    word_2_id : dict
        Maps word -> integer id. It always contains the placeholder
        ``'a' -> 0``, so real words are numbered from 1 in order of first
        appearance.
    id_2_anime : dict
        Maps word id -> list of row indexes, in ascending row order.
    """
    # Placeholder entry kept for compatibility: it reserves id 0, so that
    # len(word_2_id) is always the next free id.
    word_2_id = {'a': 0}
    id_2_anime = {}

    descriptions = dataset['Description']

    for i in range(len(dataset)):
        for word in tokenizeandclean(descriptions[i]):
            if word not in word_2_id:
                # O(1) id allocation; previously the key list was rebuilt for
                # every new word just to read its last element.
                word_2_id[word] = len(word_2_id)

            # setdefault also covers the placeholder word, whose id has no
            # posting list yet (this used to raise KeyError).
            id_2_anime.setdefault(word_2_id[word], []).append(i)

    return word_2_id, id_2_anime

#### Inverted Index without occurrences

In [31]:
def dictionaries(dataset):
    """Build the vocabulary and the inverted index *without* occurrences.

    Each row index appears at most once in a word's posting list, however
    many times the word occurs in that row's description.

    NOTE: this shares its name with the with-occurrences variant defined in an
    earlier cell; running this cell rebinds ``dictionaries``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have a ``'Description'`` column that can be looked up with the
        integers ``0 .. len(dataset) - 1`` (e.g. the default RangeIndex).

    Returns
    -------
    word_2_id : dict
        Maps word -> integer id. It always contains the placeholder
        ``'a' -> 0``, so real words are numbered from 1 in order of first
        appearance in the text.
    id_2_anime : dict
        Maps word id -> list of distinct row indexes, in ascending row order.
    """
    # Placeholder entry kept for compatibility: it reserves id 0, so that
    # len(word_2_id) is always the next free id.
    word_2_id = {'a': 0}
    id_2_anime = {}

    descriptions = dataset['Description']

    for i in range(len(dataset)):
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # Iterating a set of strings follows hash order, which changes between
        # interpreter runs and made the word ids non-reproducible.
        for word in dict.fromkeys(tokenizeandclean(descriptions[i])):
            if word not in word_2_id:
                # O(1) id allocation instead of rebuilding the key list.
                word_2_id[word] = len(word_2_id)

            # setdefault also covers the placeholder word, whose id has no
            # posting list yet (this used to raise KeyError).
            id_2_anime.setdefault(word_2_id[word], []).append(i)

    return word_2_id, id_2_anime