In [3]:
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os
import re

# Crawl synopsis for trying to do search

In [4]:
def scrabbing_search_engine(soup):
    anime_info = []
    #title
    title = str(soup.find("h1", attrs = {"class": "title-name h1_bold_none"}).string)
    anime_info.append(title)
    #synopsis
    synopsis = str(soup.find("p", attrs = {"itemprop": "description"}).text)
    anime_info.append(synopsis)
    
    return anime_info

In [81]:
attrs = ["animeTitle","animeDescription","animeUrl"]
list_of_anime = []

# take info
for page in tqdm(range(1,130)):
    folder = "./Folder_with_page/page"+str(page)
    for anime in os.listdir(folder):
        with open(folder + "/" + anime, "r",  encoding='utf-8') as fp:
            soup = BeautifulSoup(fp, "html.parser")
        anime_info = scrabbing_search_engine(soup)
        anime_info.append(soup.find("meta",  property="og:url")["content"])
        list_of_anime.append(anime_info)
        
# Creating the DataFrame
df = pd.DataFrame(list_of_anime, columns = attrs)

# Creating the tsv file
# Index True, in this way = ID of the anime
df.to_csv('prova_search_engine.tsv', index = False, sep = '\t')

100%|████████████████████████████████████████████████████████████████████████████████| 129/129 [18:55<00:00,  8.80s/it]


# Creating the document

In [82]:
documents = []
for doc in df["animeDescription"]:
    documents.append(doc)

### clean the documents

In [83]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

In [105]:
def pre_processing(documents):
    stop = stopwords.words("english")
    snowball_stemmer = SnowballStemmer("english")
    remove = ["Written", "MAL", "Rewrite"]+["'s","``",'" "',"''","“"]

    # remove number
    document_tmp = re.sub(r'[0-9]', '', documents)
    # Tokenizing + Normalization
    document_tmp =  word_tokenize(document_tmp.lower()) 
    # removing stopwords
    document_tmp = [ word for word in document_tmp if word not in stop]
    # removing punctuation
    document_tmp = [ word for word in document_tmp if word not in string.punctuation]
    # removing "Written MAL Rewrite" and other stuff
    document_tmp = [ word for word in document_tmp if word not in remove]
    # stemming 
    document_tmp = [ snowball_stemmer.stem(word) for word in document_tmp]
    
    return document_tmp

In [106]:
# cleaning the documents
documents_clean = []
for d in documents:
    documents_clean.append(pre_processing(d))

In [139]:
# try some for view exception:

documents_clean[0]

['horrif',
 'alchemi',
 'experi',
 'goe',
 'wrong',
 'elric',
 'household',
 'brother',
 'edward',
 'alphons',
 'left',
 'catastroph',
 'new',
 'realiti',
 'ignor',
 'alchem',
 'principl',
 'ban',
 'human',
 'transmut',
 'boy',
 'attempt',
 'bring',
 'recent',
 'deceas',
 'mother',
 'back',
 'life',
 'instead',
 'suffer',
 'brutal',
 'person',
 'loss',
 'alphons',
 'bodi',
 'disintegr',
 'edward',
 'lost',
 'leg',
 'sacrif',
 'arm',
 'keep',
 'alphons',
 'soul',
 'physic',
 'realm',
 'bind',
 'hulk',
 'suit',
 'armor',
 'brother',
 'rescu',
 'neighbor',
 'pinako',
 'rockbel',
 'granddaught',
 'winri',
 'known',
 'bio-mechan',
 'engin',
 'prodigi',
 'winri',
 'creat',
 'prosthet',
 'limb',
 'edward',
 'util',
 'automail',
 'tough',
 'versatil',
 'metal',
 'use',
 'robot',
 'combat',
 'armor',
 'year',
 'train',
 'elric',
 'brother',
 'set',
 'quest',
 'restor',
 'bodi',
 'locat',
 'philosoph',
 'stone—a',
 'power',
 'gem',
 'allow',
 'alchemist',
 'defi',
 'tradit',
 'law',
 'equival',


# Creating vocabulary

In [112]:
import itertools

In [113]:
# the list of all words
word_list = list(set(list(itertools.chain.from_iterable(documents_clean))))

In [114]:
vocabolary = dict(zip(word_list, range(len(word_list))))

In [None]:
vocabolary

In [116]:
# function that map word to integer

def word_to_int(document, vocabolary):
    int_doc = []
    
    for word in document:
        int_doc.append(vocabolary[word])
        
    return int_doc

Mapp every documents word to integer

In [123]:
documents_mapped = []
for d in documents_clean:
    documents_mapped.append(word_to_int(d,vocabolary))

# Search engine

In [124]:
# search engine
from collections import defaultdict  

In [125]:
search_dict = defaultdict(list)

In [131]:
for number in vocabolary.values():
    for idd, d in enumerate(documents_mapped):
        if number in d:
            search_dict[number].append(idd)

In [132]:
import json

file = open("search_engine_2.json", "w")
json.dump(search_dict, file)
file.close()

# Searching

In [143]:
# input query
query_text = input()

query_clean = pre_processing(query_text)
query_int = word_to_int(query_clean, vocabolary)

 alchemy titan


In [148]:
query_int

[21829, 18648]

In [146]:
vocabolary["alchemi"]

21829

In [151]:
# return the document witch match the query

indexx = []
for query in query_int:
    indexx += search_dict[query]

In [155]:
df[df["animeID"].isin(indexx)]

Unnamed: 0,animeTitle,animeDescription,animeUrl,animeID
0,Fullmetal Alchemist: Brotherhood,After a horrific alchemy experiment goes wrong...,https://myanimelist.net/anime/5114/Fullmetal_A...,0
10,Shingeki no Kyojin: The Final Season,Gabi Braun and Falco Grice have been training ...,https://myanimelist.net/anime/40028/Shingeki_n...,10
22,Shingeki no Kyojin Season 3 Part 2,Seeking to restore humanity's diminishing hope...,https://myanimelist.net/anime/38524/Shingeki_n...,22
50,Shingeki no Kyojin,"Centuries ago, mankind was slaughtered to near...",https://myanimelist.net/anime/16498/Shingeki_n...,50
68,Shingeki no Kyojin Season 3,"Still threatened by the ""Titans"" that rob them...",https://myanimelist.net/anime/35760/Shingeki_n...,68
118,Shingeki no Kyojin Season 2,"For centuries, humanity has been hunted by gia...",https://myanimelist.net/anime/25777/Shingeki_n...,118
167,Baccano!,"During the early 1930s in Chicago, the transco...",https://myanimelist.net/anime/2251/Baccano,167
393,Fullmetal Alchemist,"Edward Elric, a young, brilliant alchemist, ha...",https://myanimelist.net/anime/121/Fullmetal_Al...,393
525,Fullmetal Alchemist: Brotherhood Specials,Amazing secrets and startling facts are expose...,https://myanimelist.net/anime/6421/Fullmetal_A...,525
695,Mobile Suit Zeta Gundam,"It is Universal Century 0087, and the One Year...",https://myanimelist.net/anime/85/Mobile_Suit_Z...,695


In [159]:
df["animeDescription"].iloc[1579]

'A girl named Olga is pursued by both the World Government and a man named Mad Treasure, as she is the only one who knows the location of the Pure Gold, a substance that can buy the entire world. Olga ends up sailing with the Straw Hat Pirates, and they journey to find the Pure Gold on the lost island of Alchemi, which is located inside the stomach of a large angler fish named Bonbori.\n\n\n(Source: IMDb)'