## Cargando datos y paquetes

#### Librerías 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import NearestNeighbors
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from scrapy import Selector
import requests
from scrapy.crawler import CrawlerProcess
import scrapy
from scrapy.utils.project import get_project_settings

#### Cargando Dataset

In [98]:
anime = pd.read_csv("Data/anime.csv")
rating = pd.read_csv("Data/rating.csv")

## Preprocesamiento data anime

### Explorando datos

En primera instancia se explora la existencia de casos duplicados, también la dimensión de la data y el tipo de dato de cada variable. Se observa que la variable "episodes" es de tipo object y no numérico dado que contiene una categoría "Unknown" que indica que no se conoce la cantidad de episodios del anime, es por esto, que se procede a cambiar "Unknown" por NaN y luego el tipo de dato a numérico.



In [99]:
print(anime.shape)
print(anime.drop_duplicates().shape)
print(anime.info())
anime.tail(15)

(12294, 7)
(12294, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
anime_id    12294 non-null int64
name        12294 non-null object
genre       12232 non-null object
type        12269 non-null object
episodes    12294 non-null object
rating      12064 non-null float64
members     12294 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 672.4+ KB
None


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
12279,34491,Sagurare Otome The Animation,Hentai,OVA,1,,79
12280,34312,Saimin Class,Hentai,OVA,Unknown,,240
12281,9504,Sakura no Mori,Hentai,OVA,2,4.53,221
12282,34388,Shikkoku no Shaga The Animation,Hentai,OVA,Unknown,,195
12283,29992,Silent Chaser Kagami,Hentai,OVA,1,4.95,112
12284,26031,Super Erotic Anime,Hentai,OVA,2,4.45,118
12285,34399,Taimanin Asagi 3,"Demons, Hentai, Supernatural",OVA,Unknown,,485
12286,10368,Teleclub no Himitsu,Hentai,OVA,2,4.67,148
12287,9352,Tenshi no Habataki Jun,Hentai,OVA,1,4.33,201
12288,5541,The Satisfaction,Hentai,OVA,1,4.37,166


In [100]:
anime.replace("Unknown", np.nan, inplace=True)
anime["episodes"] = anime["episodes"].astype(float)

### Indentificando NaN 

Podemos detectar que las variables "genre", "type", "episodes" y "rating" poseen datos faltantes, donde "episodes" es la variable con más datos faltantes, seguida por "rating". Por otro lado, "genre" y "type" tienen mucho menos de missing. En total la data tiene 464 NaN.

In [101]:
print(anime.isnull().sum())
print(anime[anime.isnull().any(axis=1)].shape)
anime[anime.isnull().any(axis=1)].head()



anime_id      0
name          0
genre        62
type         25
episodes    340
rating      230
members       0
dtype: int64
(464, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,,8.58,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,,8.25,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,,7.94,533578
991,966,Crayon Shin-chan,"Comedy, Ecchi, Kids, School, Shounen, Slice of...",TV,,7.73,26267
1021,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,,7.72,5400


## Web Scraping para rellenar valores nulos

In [7]:
nombres= anime[anime.isnull().any(axis=1)]
nombres = nombres["name"].values.tolist()

tipoAnime=pd.get_dummies(anime["type"]).columns
tipoAnime=tipoAnime.str.strip().unique().tolist()

genero=anime["genre"].str.get_dummies(sep=",").columns
genero=genero.str.strip().unique().tolist()

In [8]:
buscarURL = 'https://myanimelist.net/search/all?q='
urlAnime = []
for i in nombres:
    urlAnime.append(buscarURL + i)


class AnimeFcSpider(scrapy.Spider):
    name = 'anime_fc'

    def start_requests(self):  # start_requests method
        for url2 in urlAnime:
            yield scrapy.Request(url=url2,
                                 callback=self.parse_front)

    def parse_front(self, response):  # First parsing method
        course_links = response.xpath('//div[@class="picSurround di-tc thumb"]/a/@href')
        yield response.follow(url=course_links[0],
                              callback=self.parse_pages)

    def parse_pages(self, response):  # Second parsing method
        crs_name = response.xpath('//h1[@class="h1"]/span/text()').extract_first()
        crs_episodes = response.xpath('//td[@class="spaceit"]/span[@id="curEps"]/text()').extract_first()
        crs_rating = response.xpath('//span[@itemprop="ratingValue"]/text()').extract_first()
        crs_id = response.xpath('//input[@name="aid"]/@value').extract_first()

        crs_genre = response.xpath('//div/a/@title').extract()
        crs_genre = np.intersect1d(crs_genre, genero)
        crs_genre = ','.join(map(str, crs_genre))

        crs_type = response.xpath('//div/a/text()').extract()
        crs_type = np.intersect1d(crs_type,tipoAnime)
        crs_type = ','.join(map(str, crs_type))
       

        list_name.append(crs_name)
        list_genre.append(crs_genre)
        list_type.append(crs_type)
        list_episodes.append(crs_episodes)
        list_rating.append(crs_rating)
        list_id.append(crs_id)



list_name = list()
list_genre = list()
list_type = list()
list_episodes = list()
list_rating = list()
list_id = list()

s = get_project_settings()
s['CONCURRENT_REQUESTS_PER_IP'] = 16
s['CONCURRENT_REQUESTS_PER_DOMAIN '] = 16
s['DOWNLOAD_DELAY'] = 2.5
s['CONCURRENT_REQUESTS'] = 32
s['CONCURRENT_REQUESTS'] = 32


process = CrawlerProcess(s)  # Run the Spider
process.crawl(AnimeFcSpider)
process.start()


2019-05-17 21:15:55 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-05-17 21:15:55 [scrapy.utils.log] INFO: Versions: lxml 4.3.2.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.17134-SP0
2019-05-17 21:15:55 [scrapy.crawler] INFO: Overridden settings: {'CONCURRENT_REQUESTS': 32, 'CONCURRENT_REQUESTS_PER_IP': 16, 'DOWNLOAD_DELAY': 2.5}
2019-05-17 21:15:55 [scrapy.extensions.telnet] INFO: Telnet Password: b32f15637d797352
2019-05-17 21:15:55 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2019-05-17 21:15:56 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewar

2019-05-17 21:16:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Choujuu%20Giga> (referer: None)
2019-05-17 21:16:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=IS:%20Infinite%20Stratos%202%20-%20Infinite%20Wedding> (referer: None)
2019-05-17 21:16:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Metropolis%20(2009)> (referer: None)
2019-05-17 21:16:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Kaitou%20Joker%204th%20Season> (referer: None)
2019-05-17 21:16:56 [scrapy.extensions.logstats] INFO: Crawled 40 pages (at 40 pages/min), scraped 0 items (at 0 items/min)
2019-05-17 21:16:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/36904/Aggressive_Retsuko_ONA> (referer: https://myanimelist.net/search/all?q=Aggressive%20Retsuko)
2019-05-17 21:16:57 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://my

2019-05-17 21:18:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Kamakura> (referer: None)
2019-05-17 21:18:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/1960/Sore_Ike_Anpanman> (referer: https://myanimelist.net/search/all?q=Sore%20Ike!%20Anpanman)
2019-05-17 21:18:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Modern%20No.2> (referer: None)
2019-05-17 21:18:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/32248/Masamune_Datenicle> (referer: https://myanimelist.net/search/all?q=Masamune%20Datenicle)
2019-05-17 21:18:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/4459/Ojarumaru> (referer: https://myanimelist.net/search/all?q=Ojarumaru)
2019-05-17 21:18:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/25049/Sushi_Ninja> (referer: https://myanimelist.net/search/all?q=Sushi%20Ninja)
20

2019-05-17 21:21:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/31834/Mormorando> (referer: https://myanimelist.net/search/all?q=Mormorando)
2019-05-17 21:21:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/31507/Ari_Ningen_Monogatari> (referer: https://myanimelist.net/search/all?q=Ari%20Ningen%20Monogatari)
2019-05-17 21:21:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/31760/Tsuru_Shitae_Waka_Kan> (referer: https://myanimelist.net/search/all?q=Tsuru%20Shitae%20Waka%20Kan)
2019-05-17 21:21:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/28587/Modern> (referer: https://myanimelist.net/search/all?q=Modern)
2019-05-17 21:21:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/408/Final_Fantasy_VII__Last_Order> (referer: https://myanimelist.net/search/all?q=Fantasy)
2019-05-17 21:21:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET htt

2019-05-17 21:23:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33863/Cakes> (referer: https://myanimelist.net/search/all?q=Cakes)
2019-05-17 21:23:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Guitar> (referer: None)
2019-05-17 21:23:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/29655/Chanda_Gou> (referer: https://myanimelist.net/search/all?q=Chanda%20Gou)
2019-05-17 21:23:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Hanakappa> (referer: None)
2019-05-17 21:23:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/18007/Dalam-iwa_Goseumdochi> (referer: https://myanimelist.net/search/all?q=Dalam-iwa%20Goseumdochi)
2019-05-17 21:23:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Happy%20Bogeys> (referer: None)
2019-05-17 21:23:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET 

2019-05-17 21:25:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Jinxiu%20Shenzhou%20Zhi%20Qi%20You%20Ji> (referer: None)
2019-05-17 21:25:56 [scrapy.extensions.logstats] INFO: Crawled 228 pages (at 21 pages/min), scraped 0 items (at 0 items/min)
2019-05-17 21:25:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Kabi%20Usagi> (referer: None)
2019-05-17 21:26:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Kaette%20Kite%20yoo%20Toyama%20kara> (referer: None)
2019-05-17 21:26:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/28487/Ikite_Iru> (referer: https://myanimelist.net/search/all?q=Ikite%20Iru)
2019-05-17 21:26:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34502/Inazma_Delivery> (referer: https://myanimelist.net/search/all?q=Inazma%20Delivery)
2019-05-17 21:26:09 [scrapy.core.engine] DEBUG: Crawled (200) <

2019-05-17 21:28:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/31506/Kappa_no_Ude> (referer: https://myanimelist.net/search/all?q=Kappa%20no%20Ude)
2019-05-17 21:28:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Mim%20Mam%20Mint> (referer: None)
2019-05-17 21:28:21 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/29489/Karasu_no_Puuta> (referer: https://myanimelist.net/search/all?q=Karasu%20no%20Puuta)
2019-05-17 21:28:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/29361/Kareshi_wa_Hammerhead_Shark> (referer: https://myanimelist.net/search/all?q=Kareshi%20wa%20Hammerhead%20Shark)
2019-05-17 21:28:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/10797/Kayoe_Chuugaku> (referer: https://myanimelist.net/search/all?q=Kayoe!%20Chuugaku)
2019-05-17 21:28:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/an

2019-05-17 21:30:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=PikkaPika%20Summer> (referer: None)
2019-05-17 21:30:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Pittanko!%20Nekozakana> (referer: None)
2019-05-17 21:30:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Ponkotsu%20Quest:%20Maou%20to%20Haken%20no%20Mamono-tachi> (referer: None)
2019-05-17 21:30:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/29427/Mori_no_Ratio> (referer: https://myanimelist.net/search/all?q=Mori%20no%20Ratio)
2019-05-17 21:30:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33479/Muzumuzu_Eighteen> (referer: https://myanimelist.net/search/all?q=Muzumuzu%20Eighteen)
2019-05-17 21:30:56 [scrapy.extensions.logstats] INFO: Crawled 328 pages (at 19 pages/min), scraped 0 items (at 0 items/min)
2019-05-17 21:30:57 [scrapy.core.engin

2019-05-17 21:32:56 [scrapy.extensions.logstats] INFO: Crawled 367 pages (at 20 pages/min), scraped 0 items (at 0 items/min)
2019-05-17 21:32:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Tokyo%20SOS> (referer: None)
2019-05-17 21:32:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Tomodachi%208-nin> (referer: None)
2019-05-17 21:33:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Toshi%20Souzou%20Gakubu%20Shoukai> (referer: None)
2019-05-17 21:33:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Wareware%20no%20Heya> (referer: None)
2019-05-17 21:33:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33196/Shi_Wan_Ge_Leng_Xiaohua_Di_3rd_Season> (referer: https://myanimelist.net/search/all?q=Shi%20Wan%20Ge%20Leng%20Xiaohua%20Di%203rd%20Season)
2019-05-17 21:33:13 [scrapy.core.engine] DEBUG: Crawled (200) <G

2019-05-17 21:35:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/30119/Yowamushi_Monsters> (referer: https://myanimelist.net/search/all?q=Yowamushi%20Monsters)
2019-05-17 21:35:20 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Sword%20Art%20Online%20Movie:%20Ordinal%20Scale> (referer: None)
2019-05-17 21:35:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34613/Garo__Honoo_no_Kokuin_-_Yurugaro> (referer: https://myanimelist.net/search/all?q=Yurugaro)
2019-05-17 21:35:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Dungeon%20ni%20Deai%20wo%20Motomeru%20no%20wa%20Machigatteiru%20Darou%20ka%20Gaiden:%20Sword%20Oratoria> (referer: None)
2019-05-17 21:35:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/30309/Yuuyake_Dandan> (referer: https://myanimelist.net/search/all?q=Yuuyake%20Dandan)
2019-05-17 21:35:36 [scrapy.core.eng

2019-05-17 21:37:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Akiba&#039;s%20Trip%20The%20Animation> (referer: None)
2019-05-17 21:37:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33487/Masamune-kun_no_Revenge> (referer: https://myanimelist.net/search/all?q=Masamune-kun%20no%20Revenge)
2019-05-17 21:37:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/30455/Kantai_Collection__KanColle_Zoku-hen> (referer: https://myanimelist.net/search/all?q=Kantai%20Collection:%20KanColle%20Zoku-hen)
2019-05-17 21:37:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33047/Fate_Extra__Last_Encore> (referer: https://myanimelist.net/search/all?q=Fate/Extra%20Last%20Encore)
2019-05-17 21:37:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Alice%20in%20Deadly%20School> (referer: None)
2019-05-17 21:37:54 [scrapy.core.engine] DEBUG: Craw

2019-05-17 21:39:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/3470/Special_A> (referer: https://myanimelist.net/search/all?q=Amanchu!%20Special)
2019-05-17 21:39:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/32105/Sousei_no_Onmyouji> (referer: https://myanimelist.net/search/all?q=Ao%20no%20Exorcist%20OVA)
2019-05-17 21:39:56 [scrapy.extensions.logstats] INFO: Crawled 504 pages (at 21 pages/min), scraped 0 items (at 0 items/min)
2019-05-17 21:39:58 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Chain%20Chronicle:%20Haecceitas%20no%20Hikari> (referer: None)
2019-05-17 21:40:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Chain%20Chronicle:%20Haecceitas%20no%20Hikari%20Part%202> (referer: None)
2019-05-17 21:40:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Chain%20Chronicle:%20Haecceitas%20no%20Hikari%20Pa

2019-05-17 21:42:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Escha%20Chron> (referer: None)
2019-05-17 21:42:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/manga/55887/Clockwork_Planet> (referer: https://myanimelist.net/search/all?q=Clockwork%20Planet)
2019-05-17 21:42:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Evangelion:%203.0+1.0> (referer: None)
2019-05-17 21:42:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/32152/Closers__Side_Blacklambs> (referer: https://myanimelist.net/search/all?q=Closers:%20Side%20Blacklambs)
2019-05-17 21:42:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/31456/Code_Realize_-_Sousei_no_Himegimi> (referer: https://myanimelist.net/search/all?q=Code:Realize:%20Sousei%20no%20Himegimi)
2019-05-17 21:42:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34244/

2019-05-17 21:44:29 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/character/143685/Finis> (referer: https://myanimelist.net/search/all?q=Finis)
2019-05-17 21:44:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=High%20Score%20Girl> (referer: None)
2019-05-17 21:44:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Hinako%20Note> (referer: None)
2019-05-17 21:44:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/31376/Flying_Witch> (referer: https://myanimelist.net/search/all?q=Flying%20Babies)
2019-05-17 21:44:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34427/Frame_Arms_Girl> (referer: https://myanimelist.net/search/all?q=Frame%20Arms%20Girl)
2019-05-17 21:44:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33203/Fukumenkei_Noise> (referer: https://myanimelist.net/search/all?q=Fukumenkei%20Noise

2019-05-17 21:46:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Knight&#039;s%20&amp;%20Magic> (referer: None)
2019-05-17 21:46:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Kobayashi-san%20Chi%20no%20Maid%20Dragon> (referer: None)
2019-05-17 21:46:46 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Koutetsujou%20no%20Kabaneri%20Soushuuhen%201:%20Tsudou%20Hikari> (referer: None)
2019-05-17 21:46:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Koutetsujou%20no%20Kabaneri%20Soushuuhen%202:%20Moeru%20Inochi> (referer: None)
2019-05-17 21:46:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Kuroko%20no%20Basket:%20Last%20Game> (referer: None)
2019-05-17 21:46:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33899/IDOLiSH7> (referer: https://myanimelist.net/search/all?q=I

2019-05-17 21:48:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34151/Landreaall> (referer: https://myanimelist.net/search/all?q=Landreaall)
2019-05-17 21:48:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/2990/Mini_Skirt_Gakuen> (referer: https://myanimelist.net/search/all?q=Locker%20Room)
2019-05-17 21:48:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34021/Lupin_the_IIIrd__Chikemuri_no_Ishikawa_Goemon> (referer: https://myanimelist.net/search/all?q=Lupin%20the%20IIIrd:%20Chikemuri%20no%20Ishikawa%20Goemon)
2019-05-17 21:48:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/17947/Mahou_Shoujo_Lyrical_Nanoha__Reflection> (referer: https://myanimelist.net/search/all?q=Mahou%20Shoujo%20Lyrical%20Nanoha%20Reflection)
2019-05-17 21:48:56 [scrapy.extensions.logstats] INFO: Crawled 685 pages (at 20 pages/min), scraped 0 items (at 0 items/min)
2019-05-17 21:48:57 [sc

2019-05-17 21:50:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Senki%20Zesshou%20Symphogear%204th%20Season> (referer: None)
2019-05-17 21:51:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33983/Onihei> (referer: https://myanimelist.net/search/all?q=Onihei)
2019-05-17 21:51:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/32013/Oshiete_Galko-chan> (referer: https://myanimelist.net/search/all?q=Oshiete!%20Galko-chan%20(OVA))
2019-05-17 21:51:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34262/Oushitsu_Kyoushi_Heine> (referer: https://myanimelist.net/search/all?q=Oushitsu%20Kyoushi%20Haine)
2019-05-17 21:51:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34161/Overlord_Movie_1__Fushisha_no_Ou> (referer: https://myanimelist.net/search/all?q=Overlord%20Movie)
2019-05-17 21:51:12 [scrapy.core.engine] DEBUG: Crawled (200) <G

2019-05-17 21:53:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Trick%20or%20Alice> (referer: None)
2019-05-17 21:53:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Trinity%20Seven%20Movie:%20Eternity%20Library%20to%20Alchemic%20Girl> (referer: None)
2019-05-17 21:53:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Tsubu%E2%98%85Doll> (referer: None)
2019-05-17 21:53:20 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/30736/Shingeki_no_Bahamut__Virgin_Soul> (referer: https://myanimelist.net/search/all?q=Shingeki%20no%20Bahamut:%20Virgin%20Soul)
2019-05-17 21:53:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Tsugumomo> (referer: None)
2019-05-17 21:53:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/30223/Shishou_Series> (referer: https://myanimelist.net/search/all?q=Shishou

2019-05-17 21:55:25 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Yuyushiki%20Special> (referer: None)
2019-05-17 21:55:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Zunda%20Horizon> (referer: None)
2019-05-17 21:55:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34076/Uchouten_Kazoku_2> (referer: https://myanimelist.net/search/all?q=Uchouten%20Kazoku%202)
2019-05-17 21:55:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33064/Uchuu_Senkan_Yamato_2202__Ai_no_Senshi-tachi> (referer: https://myanimelist.net/search/all?q=Uchuu%20Senkan%20Yamato%202202:%20Ai%20no%20Senshi-tachi)
2019-05-17 21:55:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/33478/UQ_Holder__Mahou_Sensei_Negima_2> (referer: https://myanimelist.net/search/all?q=UQ%20Holder!)
2019-05-17 21:55:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mya

2019-05-17 21:57:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Houkago%20Initiation> (referer: None)
2019-05-17 21:57:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/30891/Oni_Chichi__Refresh%E2%99%A5> (referer: https://myanimelist.net/search/all?q=Oni%20Chichi:%20Refresh%E2%99%A5)
2019-05-17 21:57:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/search/all?q=Secret%20Journey> (referer: None)
2019-05-17 21:57:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/32872/Energy_Kyouka> (referer: https://myanimelist.net/search/all?q=Energy%20Kyouka!!)
2019-05-17 21:57:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/manga/84593/Fella_Hame_Lips> (referer: https://myanimelist.net/search/all?q=Fella%20Hame%20Lips)
2019-05-17 21:57:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/31788/Ero_Manga_H_mo_Manga_mo_Step-up

2019-05-17 21:59:46 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/34311/Ecchi_na_Shintai_Sokutei_Anime_Edition> (referer: https://myanimelist.net/search/all?q=Ecchi%20na%20Shintai%20Sokutei%20Anime%20Edition)
2019-05-17 21:59:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/32238/Watashi_wa_Kairaku_Izonshou> (referer: https://myanimelist.net/search/all?q=Watashi%20wa,%20Kairaku%20Izonshou)
2019-05-17 21:59:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://myanimelist.net/anime/28169/Buta_no_Gotoki_Sanzoku_ni_Torawarete_Shojo_wo_Ubawareru_Kyonyuu_Himekishi___Onna_Senshi> (referer: https://myanimelist.net/search/all?q=Buta%20no%20Gotoki%20Sanzoku%20ni%20Torawarete%20Shojo%20wo%20Ubawareru%20Kyonyuu%20Himekishi%20&amp;%20Onna%20Senshi)
2019-05-17 21:59:56 [scrapy.extensions.logstats] INFO: Crawled 902 pages (at 18 pages/min), scraped 0 items (at 0 items/min)
2019-05-17 21:59:56 [scrapy.core.engine] DEBUG: Crawled (200) 

In [102]:
DataNa = pd.DataFrame({"anime_id":list_id, "name":list_name,"genre":list_genre,
                       "type":list_type, "episodes":list_episodes, "rating":list_rating})

DataNa.replace("", np.nan, inplace=True)
DataNa.replace('?', np.nan, inplace=True)

print(DataNa.shape)
print(DataNa.isnull().sum())
DataNa.head(10)

(453, 6)
anime_id     17
name          1
genre        31
type         21
episodes     91
rating      127
dtype: int64


Unnamed: 0,anime_id,name,genre,type,episodes,rating
0,21,One Piece,"Action,Adventure,Comedy,Drama,Fantasy,Shounen,...",TV,,8.53
1,19067,Future Card Buddyfight,Game,TV,64.0,7.07
2,235,Detective Conan,"Adventure,Comedy,Mystery,Police,Shounen",TV,,8.25
3,1735,Naruto: Shippuuden,"Action,Adventure,Comedy,Martial Arts,Shounen,S...",TV,500.0,8.2
4,966,Crayon Shin-chan,"Comedy,Ecchi,School,Seinen,Slice of Life",TV,,7.74
5,7505,Knyacki!,"Comedy,Drama,Fantasy,Kids",TV,,6.26
6,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy,School,Slice of Life",Special,7.0,7.46
7,34754,Rilu Rilu Fairilu: Yousei no Door,"Fantasy,Magic,Slice of Life",TV,59.0,6.79
8,21639,Yu☆Gi☆Oh! Arc-V,"Action,Fantasy,Game,Music,Shounen","Music,TV",148.0,7.12
9,8687,Doraemon (2005),"Comedy,Kids,Sci-Fi,Shounen",TV,,7.54


#### Preprocesando data obtenida con web scraping

Cambiamos el tipo de datos al mismo que la data original

In [103]:
DataNa["anime_id"] = DataNa["anime_id"].astype(float)
DataNa["episodes"] = DataNa["episodes"].astype(float)
DataNa["rating"] = DataNa["rating"].astype(float)
DataNa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 6 columns):
anime_id    436 non-null float64
name        452 non-null object
genre       422 non-null object
type        432 non-null object
episodes    362 non-null float64
rating      326 non-null float64
dtypes: float64(3), object(3)
memory usage: 21.3+ KB


Cruzamos la data obtenida con la origianal para luego reemplazar los nulos con la información nueva

In [104]:
dataNueva= pd.merge(anime, DataNa,left_on="anime_id",right_on="anime_id", how="left")
dataNueva.info()
print(anime.isnull().sum())
print(anime[anime.isnull().any(axis=1)].shape)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 12294 entries, 0 to 12293
Data columns (total 12 columns):
anime_id      12294 non-null int64
name_x        12294 non-null object
genre_x       12232 non-null object
type_x        12269 non-null object
episodes_x    11954 non-null float64
rating_x      12064 non-null float64
members       12294 non-null int64
name_y        424 non-null object
genre_y       394 non-null object
type_y        419 non-null object
episodes_y    352 non-null float64
rating_y      304 non-null float64
dtypes: float64(4), int64(2), object(6)
memory usage: 1.2+ MB
anime_id      0
name          0
genre        62
type         25
episodes    340
rating      230
members       0
dtype: int64
(464, 7)


In [105]:
dataNueva.loc[dataNueva["genre_x"].isna(),"genre_x"] = dataNueva["genre_y"]
dataNueva.loc[dataNueva["type_x"].isna(),"type_x"] = dataNueva["type_y"]
dataNueva.loc[dataNueva["episodes_x"].isna(),"episodes_x"] = dataNueva["episodes_y"]
dataNueva.loc[dataNueva["rating_x"].isna(),"rating_x"] = dataNueva["rating_y"]


Eliminamos las varibles nuevas dado que ya utilizamos sus valores. 

In [106]:
dataNueva.drop(["name_y", "genre_y", "type_y", "episodes_y", "rating_y"],axis=1,inplace=True)
dataNueva.columns = dataNueva.columns.str.replace('_x', '')

print(dataNueva.isnull().sum())
print(dataNueva[dataNueva.isnull().any(axis=1)].shape)

anime_id      0
name          0
genre        45
type          9
episodes    125
rating       80
members       0
dtype: int64
(196, 7)


## Imputando datos nulos

#### Variable "episodes"

La primera variable a imputar es "episodes", dado que es la con mayor cantidad de NaN, para esto agruparemos por "type" y utilizaremos la mediana() de la cantidad de episodios de cada grupo.

In [107]:
anime=dataNueva.copy()
print(anime.groupby("type")["episodes"].describe())

anime.loc[(anime["type"]=="OVA") & (anime["episodes"].isna()),"episodes"] = anime.loc[(anime["type"]=="OVA") ,"episodes"].median()
anime.loc[(anime["type"]=="Movie") & (anime["episodes"].isna()),"episodes"] = anime.loc[(anime["type"]=="Movie") ,"episodes"].median()
anime.loc[(anime["type"]=="Music") & (anime["episodes"].isna()),"episodes"] = anime.loc[(anime["type"]=="Music") ,"episodes"].median()
anime.loc[(anime["type"]=="ONA") & (anime["episodes"].isna()),"episodes"] = anime.loc[(anime["type"]=="ONA") ,"episodes"].median()
anime.loc[(anime["type"]=="Special") & (anime["episodes"].isna()),"episodes"] = anime.loc[(anime["type"]=="Special") ,"episodes"].median()
anime.loc[(anime["type"]=="TV") & (anime["episodes"].isna()),"episodes"] = anime.loc[(anime["type"]=="TV") ,"episodes"].median()
anime.loc[(anime["type"].isna()) & (anime["episodes"].isna()),"episodes"] = anime["episodes"].median()

print(anime[anime.isnull().any(axis=1)].shape)
print(anime.isnull().sum())

           count       mean        std   min   25%   50%   75%     max
type                                                                  
Movie     2349.0   1.102171   2.147012   1.0   1.0   1.0   1.0   100.0
Music      488.0   1.190574   1.838040   1.0   1.0   1.0   1.0    30.0
Music,TV     1.0  17.000000        NaN  17.0  17.0  17.0  17.0    17.0
ONA        645.0   7.277519  11.782008   1.0   1.0   2.0  10.0    84.0
OVA       3298.0   2.428441   3.225129   1.0   1.0   2.0   3.0   110.0
Special   1675.0   2.568955   3.699171   1.0   1.0   1.0   2.0    51.0
TV        3713.0  37.161056  99.886573   2.0  12.0  24.0  39.0  3057.0
(116, 7)
anime_id     0
name         0
genre       45
type         9
episodes     0
rating      80
members      0
dtype: int64


#### Variable "type"

Esta variable tiene 9 nulos, podríamos inferir el tipo por la cantidad de capítulos del animé, pero justamente estos 9 animé no tienen esa información por lo que reemplazaremos el dato nulo por "notype" para no eliminar la observación y así perder información valiosa. 


In [108]:
anime["type"].replace(np.nan, "notype", inplace=True)
print(anime.isnull().sum())

anime_id     0
name         0
genre       45
type         0
episodes     0
rating      80
members      0
dtype: int64


#### Variable "genre"

Tenemos 45 animé con nulos en genero. pero dado que esta variable es muy importante en la elección del animé (por conocimiento propio) una imputación errónea sería grabe, por lo tanto haremos lo mismo que con "type" y crearemos una categoría para los nulos "nogenre"

In [109]:
anime["genre"].replace(np.nan, "nogenre", inplace=True)
print(anime.isnull().sum())

anime_id     0
name         0
genre        0
type         0
episodes     0
rating      80
members      0
dtype: int64


#### Variable "rating"

Para esta variable haremos una imputación un poco más dirigida, se agrupará por "type" y "epidodes" y se calculará la mediana de rating con esa agrupación para imputar rating. En caso que los grupos "type" y "episodes" no tengan una mediana para "rating" se agrupará por "genre" y "epidodes" y si aún así no hay una mediana para "rating", entoces los datos nulos se reemplazarán por la mediana global. 

In [110]:
def impute_median(series):
    return series.fillna(series.median())

anime.rating = anime.groupby(['type', 'episodes'])[["rating"]].transform(impute_median)
anime.rating = anime.groupby(['genre', 'episodes'])[["rating"]].transform(impute_median)
anime["rating"]=anime["rating"].fillna(anime["rating"].median())
print(anime.isnull().sum())


anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


Se resetea el índice de la data para no tener problemas en el futuro para buscar filas especificas

In [42]:
anime=anime.reset_index()

### Construyendo data con variables para análisis

#### Re-codificando variables

No era conveniente tener los géneros apilados como categoría separadas por comas en una única casilla, por lo que se separaron y pasaros a variables dicotómicas al igual que "type". Las variables restantes serán escaladas para no tener problemas con los algoritmos futuros, dado que utilizan distancias.

In [111]:
anime_data = pd.concat([anime["genre"].str.get_dummies(sep=","),
                           anime["type"].str.get_dummies(sep=","),anime[["rating"]],
                            anime[["members"]],anime["episodes"]],axis=1)

anime_data.head()


Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Movie,Music,ONA,OVA,Special,TV,notype,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,9.37,200630,1.0
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,9.26,793665,64.0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,9.25,114262,51.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,9.17,673572,24.0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,9.16,151266,51.0


In [112]:
anime_data = MaxAbsScaler().fit_transform(anime_data)
anime_data

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        9.37000000e-01, 1.97876158e-01, 3.27118090e-04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        9.26000000e-01, 7.82771174e-01, 2.09355577e-02],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        9.25000000e-01, 1.12693643e-01, 1.66830226e-02],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.88000000e-01, 2.15994011e-04, 1.30847236e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.98000000e-01, 1.72597954e-04, 3.27118090e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        5.46000000e-01, 1.40050911e-04, 3.27118090e-04]])

## Algoritmo no supervisado para encontrar elementos similares 
## Parte 1: Animes similares a un anime especifico

### K vecino más cercano (KNN)

El K-Vecino más cercano es la opción que me pareció más aceptada, dado que es un algoritmo jerárquico por lo cual no tenemos que elegir grupos a priori y es exactamente lo que estamos buscamos, explico: 

Lo que necesitamos es un algoritmo que tome un anime (cada fila representa un anime diferente) y de acuerdo a sus características (calificación, géneros, tipo y cantidad de episodios) pueda encontrar animes similares. KNN toma la distancia de una observación con cada observación de la data (importante tener los datos en la misma escala) y es exactamente lo que nos interesa rescatar, dado que es un claro indicador de similitud entre animes, además nos da la opción de elegir los k vecinos más próximos al animé buscado. 

(El objetivo no es encontrar clúster o grupos de animé, si no desde un punto establecido en el espacio rescatar los puntos más próximos)

Parámetros KNN

n_neighbors:
El número de vecinos solo nos agrega más elementos en la salida, es decir, n_neighbors=k sólo me indicará que "índices" tendrá un vector de k-1 elementos correspondiente los índices de los vecinos más cercano del anime consultado.  




In [113]:
KNNanime = NearestNeighbors(n_neighbors=7, algorithm='ball_tree').fit(anime_data)
distances, indices = KNNanime.kneighbors(anime_data)

In [52]:
def nombres_indices(name):  # Toma el nombre del anime y devuelve su indice correspondiente
    return anime[anime["name"]==name].index.tolist()[0] 


In [53]:
def recomendados_por_anime(nombre):  # Muestra el grupo de animes más cercanos al consultado
     found_id = nombres_indices(nombre)
     for id in indices[found_id][1:]:
            print(anime.loc[id]["name"])
            
recomendados_por_anime("Naruto")
        
       

Naruto: Shippuuden
Katekyo Hitman Reborn!
Bleach
Dragon Ball Z
Boku no Hero Academia
Ben-To


## Parte 2: Animes recomendados para cada usuario

En la parte 1 sólo conseguimos encontrar animes similares a otros animes, pero no estamos recomendando nada al usuario, es por esto, que utilizaremos la data riting.csv que contiene información del usuario para crear un recomendado de anime según preferencias del usuario utilizando las distancias de similitud obtenidas en la parte 1.

### Explorando data riting

Esta data contiene un id del usuario (user_id), el id del anime (anime_id) y la calificación que da el usuario al anime (rating).

No contiene NaN, pero la variable rating contiene el valor -1 que significa que el usuario no calificó el anime, esto puede ser considerado como un dato faltando.


In [78]:
print(rating.shape)
print(rating.isnull().sum())
rating.head()

(7813737, 3)
user_id     0
anime_id    0
rating      0
dtype: int64


Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


El siguiente paso es cruzar las datas anime y rating por la izquierda, dado que la data "rating" puede contener
animes que no se encuentran en la data "anime" y esto puede ser un problema en el futuro. 

In [80]:
merge = pd.merge(anime, rating, on="anime_id", how="left")
merge.head()

Unnamed: 0,index,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,99.0,5.0
1,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,152.0,10.0
2,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,244.0,10.0
3,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,271.0,10.0
4,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630,278.0,-1.0


### Construyendo recomendador

Necesitamos obtener todos los animes vistos por un usuario especifico, dado que según esto podemos capturar sus preferencias, luego de obtenidos los animes vistos por el usuario se procede a guardar en una lista con todos los animes similares a los que a visto el usuario (vecinos del algoritmo KNN ) excluyendo los que ha vistos (para no recomendar un anime que el usuario ya vio). Por último, se toma esta lista y se calcula la frecuencia de los animes que más se repiten en la lista y se ordenan de mayor a menor. 

La función ecomendados_usuario() devuelve los animes recomendados para el usuario.


In [142]:
def similar_animes(id_anime):  # Trae todos los id_anime relacionados con un id_anime dado
    
    id_list=[]
    found_id = anime[anime["anime_id"]==id_anime].index.tolist()[0]  # Indice del id ingresado
    for id in indices[found_id][1:]:
            id_list.append(anime.loc[id]["anime_id"])
            
    return id_list  
        
            
def similar_animes_usuarios(id_user):  # Crea una lista con todos los animes relacionados con los animes visto por el usuario
    
    a = merge[merge["user_id"]==id_user].anime_id.values
    lista = []
    for i in range(len(a)):
        lista.append(similar_animes(a[i]))
    return lista
            
        
def similar_animes_usuarios_freq(id_user): # Crea una lista con los 6 anime más recomendados del usuario
    a=similar_animes_usuarios(id_user)
    r= np.array([])
    for i in range(5):
        f1 = pd.Series( (v[i] for v in a))
        r = np.append(r,f1)
        
    gh = merge[merge["user_id"]==id_user].anime_id.values
    rdiff=np.setdiff1d(r, gh)
    kk = pd.DataFrame({'Column1':rdiff})
    pda = pd.crosstab(index=kk["Column1"].astype(int), columns= "count")
    pda2 = pda.sort_values("count", ascending=False).head(6).index.tolist() 
    
    return pda2
        
    
def recomendados_usuario(id_user):  # Pasa de anime_id a los nombres de los animé
    
    a=similar_animes_usuarios_freq(id_user)
    for id in a:
        print(anime[anime["anime_id"]==id]["name"].values)
        


### Utilizando funciones de recomendación

#### Animes recomendados por usuario

In [159]:
recomendados_usuario(3454)

['School Rumble']
['Gochuumon wa Usagi Desu ka?']
['Ao Haru Ride']
['Sword Art Online II']
['Sword Art Online: Sword Art Offline - Extra Edition']
['High Score Girl']


In [167]:
recomendados_usuario(8765)

['One Piece']
['Naruto: Shippuuden']
['Hajime no Ippo: New Challenger']
['Dragon Ball Kai']
['Diamond no Ace']
['Haikyuu!!']


#### Animes recomendados por anime

In [106]:
recomendados_por_anime("Dragon Ball Z")

Dragon Ball Kai
Dragon Ball Super
Dragon Ball Kai (2014)
One Piece
Naruto: Shippuuden
Naruto


In [123]:
recomendados_por_anime("Pokemon")

Digimon Adventure
Pokemon Advanced Generation
Pokemon Diamond &amp; Pearl
Pokemon XY
Pokemon Best Wishes!
Pokemon XY&amp;Z
