In [86]:
import heapq
import math
import pickle
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm

nltk.download('stopwords')
tokenizer = nltk.RegexpTokenizer(r"\w+")
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the anime table; the 'Description' column is the text indexed in the cells below.
dataset = pd.read_csv('anime_df.csv')

In [None]:
dataset.head()

### Cleaning

In [34]:
def tokenizeandclean(description):
    """Turn a raw description (or a query) into a list of cleaned, stemmed tokens.

    Steps: lowercase, tokenize on runs of word characters (which drops
    punctuation), remove stopwords, stem, and finally drop one-character tokens.
    Relies on the notebook-level `tokenizer` (RegexpTokenizer) and `ps`
    (PorterStemmer) objects.

    Parameters
    ----------
    description : str
        Text to clean. `None` or a float NaN (a missing description) is
        treated as empty text.

    Returns
    -------
    list of str
        Filtered, stemmed tokens in their original order (duplicates kept).
    """
    # A missing description would otherwise crash in str.lower().
    if description is None or (isinstance(description, float) and description != description):
        return []

    # stopwords.words() rebuilds the full word list on every call, so build it
    # once, as a set for O(1) membership, and cache it on the function object.
    # Called without a language it covers every language in the corpus, which
    # is the behaviour the original filter had.
    stop_set = getattr(tokenizeandclean, "_stop_set", None)
    if stop_set is None:
        stop_set = frozenset(stopwords.words())
        tokenizeandclean._stop_set = stop_set

    low_descr = str.lower(description)

    # We tokenize the description and remove punctuation
    tok_descr = tokenizer.tokenize(low_descr)

    # We remove stopwords from the tokenized description
    no_stop_descr = [word for word in tok_descr if word not in stop_set]

    # We carry out stemming
    stem_descr = [ps.stem(word) for word in no_stop_descr]

    # We remove isolated characters
    return [word for word in stem_descr if len(word) > 1]

#### example

In [5]:
dataset['Description'][0]

'["After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse\'s body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse\'s soul in the physical realm by binding it to a hulking suit of armor.", \'\\n\', \'\\n\', \'\\n\', \'\\r\\nThe brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering prodigy, Winry creates prosthetic limbs for Edward by utilizing "automail," a tough, versatile metal used in robots and combat armor. After years of training, the Elric brothers set off on a quest to restore their bodies by locating the Philosopher\\\'s Stone—a powerful gem that allows an alchemist to defy the traditional laws of Equivalent

In [190]:
# Use an explicit row: `i` is not defined at this point on a fresh kernel.
type(tokenizeandclean(dataset['Description'][0]))

list

In [None]:
# Map every row index of the dataset to the cleaned token list of its description.
datasetClean = defaultdict()
for i, raw_description in enumerate(dataset['Description']):
    datasetClean[i] = tokenizeandclean(raw_description)

In [10]:
# We save the cleaned descriptions as pkl; `with` closes the file even if dump fails
with open("descriptionsDataset.pkl", "wb") as descriptionsDataset:
    pickle.dump(datasetClean, descriptionsDataset)

In [5]:
len(datasetClean)

NameError: name 'datasetClean' is not defined

### Dictionaries

- the first dictionary, <code>word_2_id</code>, maps each word to its integer identifier

- the inverted index dictionary, <code>id_2_anime</code>, maps each word identifier to the list of indexes (in the main dataset) of the anime whose cleaned description contains that word

#### Inverted Index without occurrences

If a word appears more than once in a single description, the index of that anime (its anime_df index) is stored only once in the word's list in the <code>id_2_anime</code> dictionary.

In [6]:
def dictionaries(dataset):
    """Build the vocabulary and the inverted index (without occurrences).

    Parameters
    ----------
    dataset : pandas.DataFrame
        The anime_df dataframe; its 'Description' column is cleaned with
        `tokenizeandclean` and indexed.

    Returns
    -------
    word_2_id : dict
        Maps each word to its integer identifier. The entry 'a' -> 0 is a
        placeholder so that real identifiers start at 1.
    id_2_anime : dict
        Inverted index: maps each word identifier to the list of dataset
        indexes whose cleaned description contains the word, each index
        appearing once per word.

    Side effects
    ------------
    Writes both dictionaries to "word_2_id.pkl" and "id_2_anime.pkl" in the
    working directory.
    """
    word_2_id = defaultdict()
    # Placeholder for id 0. tokenizeandclean drops one-character tokens, so
    # 'a' never comes back as a real word.
    word_2_id['a'] = 0

    id_2_anime = defaultdict()

    for i in tqdm(range(len(dataset))):

        tok_list = tokenizeandclean(dataset['Description'][i])

        # dict.fromkeys removes repeated words but keeps first-seen order, so
        # identifiers are the same on every run (a set's order is not).
        for word in dict.fromkeys(tok_list):

            if word not in word_2_id:
                # Identifiers are dense (0 .. len-1), so the next free one is
                # simply the current size; no need to look up the last key.
                word_2_id[word] = len(word_2_id)

            id_2_anime.setdefault(word_2_id[word], []).append(i)

    # We save dictionaries as pkl; `with` closes the files even if dump fails
    with open("word_2_id.pkl", "wb") as word_2_id_file:
        pickle.dump(word_2_id, word_2_id_file)

    with open("id_2_anime.pkl", "wb") as id_2_anime_file:
        pickle.dump(id_2_anime, id_2_anime_file)

    return word_2_id, id_2_anime

In [7]:
# Builds both dictionaries and pickles them; the recorded run below took almost two hours.
word_2_id0, id_2_anime0 = dictionaries(dataset)

100%|██████████████████████████████████████████████████████████████████████████| 19053/19053 [1:56:43<00:00,  2.72it/s]


In [8]:
len(word_2_id0)

39334

In [11]:
# word_2_id has one extra entry: the 'a' placeholder (id 0), which has no posting list.
len(id_2_anime0) == len(word_2_id0) - 1

True

### Search engine

In [5]:
def search_engine(query):
    """Conjunctive search: find the anime whose description contains every query word.

    Parameters
    ----------
    query : str
        Free-text query; it is cleaned with `tokenizeandclean`, exactly like
        the descriptions.

    Returns
    -------
    list of int
        Sorted indexes (anime_df dataframe) of the matching anime. Empty when
        the cleaned query has no words or when any query word is not in the
        vocabulary.
    """
    # We load dictionaries. These pickles are the ones written by
    # dictionaries(); never point this at pickle files from an untrusted source.
    with open("word_2_id.pkl", "rb") as word_2_id_file:
        word_2_id = pickle.load(word_2_id_file)
    with open("id_2_anime.pkl", "rb") as id_2_anime_file:
        id_2_anime = pickle.load(id_2_anime_file)

    # We filter the query (apply tokenizeandclean and remove duplicates)
    cleaned_query = set(tokenizeandclean(query))

    # set.intersection() needs at least one set, so stop early on an empty query.
    if not cleaned_query:
        return []

    posting_sets = []
    for word in cleaned_query:
        word_id = word_2_id.get(word)
        # A word that was never indexed cannot appear in any description,
        # so the conjunction is empty.
        if word_id is None or word_id not in id_2_anime:
            return []
        posting_sets.append(set(id_2_anime[word_id]))

    return sorted(set.intersection(*posting_sets))

In [9]:
# Example query; q holds the dataset indexes of the matching anime.
q = search_engine("saiyan race")

In [11]:
# Show each matching description followed by one blank line.
for i in q:
    print(dataset['Description'][i], end='\n\n')

["Bardock, Son Goku's father, is a low-ranking Saiyan soldier who was given the power to see into the future by the last remaining alien on a planet he just destroyed. He witnesses the destruction of his race and must now do his best to stop Frieza's impending massacre.", '\n', '\n', '\n', '\r\n(Source: ANN)']

["Five years after the events of Dragon Ball, martial arts expert Gokuu is now a grown man married to his wife Chi-Chi, with a four-year old son named Gohan. While attending a reunion on Turtle Island with his old friends Master Roshi, Krillin, Bulma and others, the festivities are interrupted when a humanoid alien named Raditz not only reveals the truth behind Gokuu's past, but kidnaps Gohan as well.", '\n', '\n', '\n', '\r\nWith Raditz displaying power beyond anything Gokuu has seen before, he is forced to team up with his old nemesis, Piccolo, in order to rescue his son. But when Gokuu and Piccolo reveal the secret of the seven mystical wish-granting Dragon Balls to Raditz, h

In [111]:
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# NOTE(review): dictionaries() writes word_2_id.pkl and id_2_anime.pkl to the
# working directory, while this cell reads them from dictionaries/ - confirm
# the files were moved there.
firstDictionary = _read_pickle('dictionaries/id_2_anime.pkl')   # word id -> dataset indexes
dataWord = _read_pickle('dictionaries/word_2_id.pkl')           # word -> word id
dataDescriptionClean = _read_pickle('descriptionsDataset.pkl')  # dataset index -> cleaned tokens

19053

### 2.2

For the second search engine, given a query, we want to get the top-k documents related to the query (the choice of k is up to you!). In particular:

- Find all the documents that contain all the words in the query.
- Sort them by their similarity with the query.
- Return in output k documents, or all the documents with non-zero similarity with the query when the results are less than k. You must use a heap data structure (you can use Python libraries) for maintaining the top-k documents.
- To solve this task, you will have to use the tf-idf score and the cosine similarity. The field to consider is still the synopsis.

In [112]:
def calculate_TfIdf(lenghtDictionary, lenghtTerm, numberOfOccurence, wordsDocument):
    """Return the tf-idf weight of a term in one document, rounded to 2 decimals.

    Parameters
    ----------
    lenghtDictionary : int
        Numerator of the IDF ratio (the corpus size).
    lenghtTerm : int
        Number of documents that contain the term.
    numberOfOccurence : int
        Number of times the term occurs in the document.
    wordsDocument : int
        Total number of words in the document.

    Returns
    -------
    float
        round(TF * IDF, 2), with TF = numberOfOccurence / wordsDocument and
        IDF = log10(lenghtDictionary / lenghtTerm). Returns 0.0 when any of
        the three sizes is not positive.
    """
    # An empty document or a term found in no document has no weight; without
    # this guard those inputs end in a division by zero or a log domain error.
    if wordsDocument <= 0 or lenghtTerm <= 0 or lenghtDictionary <= 0:
        return 0.0

    TF = numberOfOccurence / wordsDocument
    IDF = math.log10(lenghtDictionary / lenghtTerm)
    return round(TF * IDF, 2)  # just two decimals

In [117]:
def number_occurence(document, word):
    """Count how many tokens of `document` are exactly equal to `word`.

    Parameters
    ----------
    document : iterable of str
        The tokens of one document.
    word : str
        The token to count.

    Returns
    -------
    int
        Number of exact matches (0 for an empty document).
    """
    # Compare with ==, not `in`: `word in token` is a substring test and would
    # also count e.g. "trace" when looking for "race".
    return sum(1 for token in document if token == word)

In [114]:
# IDF numerator: the number of documents in the corpus. len(firstDictionary)
# would be the number of distinct words, which is not what IDF divides.
lengthDic = len(dataDescriptionClean)

In [115]:
# Vocabulary as a list of words in the dictionary's key order, so a word can be
# looked up by position (assumes position i holds the word with id i - that is
# how dictionaries() numbers words).
dataWordList = list(dataWord)

# Copy every posting list, not just the outer dictionary: a plain .copy() shares
# the lists with firstDictionary, so writing (document, tf-idf) pairs into
# tfidf_invIndex would overwrite firstDictionary's entries as well.
tfidf_invIndex = {term_id: list(postings) for term_id, postings in firstDictionary.items()}

dataDescription = dataDescriptionClean.copy()

In [118]:
# Replace every posting list with (document id, tf-idf) pairs.
# Iterate over the dictionary itself: term ids run from 1 to len(firstDictionary)
# inclusive, so range(1, len(firstDictionary)) would leave the last term unscored.
for term_id, postings in tqdm(firstDictionary.items()):
    term = dataWordList[term_id]
    documentFrequency = len(postings)  # number of documents containing the term

    scoredPostings = []
    for doc_id in postings:
        synopsis = dataDescription[doc_id]  # cleaned tokens of the document
        scoredPostings.append((
            doc_id,
            calculate_TfIdf(lengthDic, documentFrequency,
                            number_occurence(synopsis, term), len(synopsis)),
        ))

    # Bind a new list rather than writing element by element, so the lists
    # held by firstDictionary are never modified and the cell can be re-run.
    tfidf_invIndex[term_id] = scoredPostings

  
100%|██████████| 39332/39332 [00:06<00:00, 6077.23it/s] 


In [89]:
from numpy import inner
from numpy.linalg import norm
searchWord = tokenizeandclean('saiyan race')
scoreHeap, k = [], 10

# Query vector: q[term id] = number of times that term appears in the query
q = np.zeros((len(dataWord),), dtype=int)

for word in searchWord:
    # Exact dictionary lookup. A substring test against every vocabulary word
    # would also hit unrelated words (e.g. "race" inside "trace") and scans the
    # whole vocabulary for each query word.
    term_id = dataWord.get(word)
    if term_id is not None:
        q[term_id] += 1
normq = norm(q)

In [None]:
# Step 1 - candidate documents: those that contain every query word.
candidateDocs = None
for word in set(searchWord):
    termId = dataWord.get(word)
    postings = tfidf_invIndex.get(termId, []) if termId is not None else []
    # A posting is a (document id, tf-idf) pair once scored, a bare id otherwise.
    docsWithWord = {p[0] if isinstance(p, tuple) else p for p in postings}
    candidateDocs = docsWithWord if candidateDocs is None else candidateDocs & docsWithWord
candidateDocs = candidateDocs or set()

# Step 2 - one pass over the inverted index, accumulating for each candidate
# the dot product with the query and the squared norm of its tf-idf vector.
dotProducts = defaultdict(float)
squaredNorms = defaultdict(float)
if candidateDocs:
    for termId, postings in tfidf_invIndex.items():
        queryWeight = q[termId]  # indexed by term id, not by document id
        for posting in postings:
            if not isinstance(posting, tuple):
                continue  # not scored yet: contributes no weight
            docId, tfidf = posting
            if docId in candidateDocs:
                squaredNorms[docId] += tfidf * tfidf
                if queryWeight:
                    dotProducts[docId] += queryWeight * tfidf

# Step 3 - keep the k best cosine similarities in a min-heap: the root is always
# the weakest of the current top k, so a better document replaces it.
scoreHeap = []
for docId, dotProduct in dotProducts.items():
    denominator = normq * math.sqrt(squaredNorms[docId])
    if denominator == 0:
        continue  # zero vector: cosine similarity is undefined
    similarity = dotProduct / denominator
    if similarity <= 0:
        continue  # only documents with non-zero similarity are returned
    if len(scoreHeap) < k:
        heapq.heappush(scoreHeap, (similarity, docId))
    else:
        heapq.heappushpop(scoreHeap, (similarity, docId))

# Best match first
for similarity, docId in sorted(scoreHeap, reverse=True):
    print(docId, similarity)

In [41]:
#dataWord
#print([x for x in q])