# In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
import import_ipynb
import ETL 

import numpy as np
import pandas as pd
import json


class Recomention:
    def __init__(self):
        """Load the film tables through ETL and build the filtered catalogue.

        The raw frames returned by ``ETL.Films`` are kept in
        ``self.credits_final`` and ``self.movies_final``. The working set
        ``self.movie_cleaned`` is derived from the movies frame in two drop
        passes:

        1. rows whose vote_average, popularity or vote_count equals 0, whose
           vote_count is 100 or more, or whose popularity is 10 or more;
        2. rows whose popularity is 2.5 or less.

        Missing overviews are replaced by empty strings between both passes.
        """
        source = ETL.Films()
        self.credits_final = source.get_credits_final()
        self.movies_final = source.get_movies_final()

        catalogue = self.movies_final

        # First pass: zero-valued statistics or values above the ceilings.
        has_zero_stat = (
            (catalogue.vote_average == 0)
            | (catalogue.popularity == 0)
            | (catalogue.vote_count == 0)
        )
        above_ceiling = (catalogue.vote_count >= 100) | (catalogue['popularity'] >= 10)
        rejected = catalogue[has_zero_stat | above_ceiling].index
        catalogue = catalogue.drop(rejected)

        # Replace missing overviews with empty strings.
        catalogue['overview'] = catalogue['overview'].fillna('')

        # Second pass: popularity at or below the floor.
        below_floor = catalogue[catalogue['popularity'] <= 2.5].index
        self.movie_cleaned = catalogue.drop(below_floor)

    def get_recommendations_by_title(self, title):
        """Recommend up to five movies whose overview is closest to *title*'s.

        The overviews in ``self.movie_cleaned`` are converted to TF-IDF
        vectors (English stop words removed) and the requested movie is
        compared with every movie through a linear kernel.

        Args:
            title: Movie title. It is lower-cased before being looked up in
                the ``key_title`` column. When several rows share the same
                ``key_title`` the first one is used.

        Returns:
            numpy array with the ``key_title`` of the most similar movies,
            best match first. The requested row itself is never included.

        Raises:
            KeyError: if no row has that ``key_title``.
        """
        print("Recomendacion por titulo")
        movies = self.movie_cleaned
        print(movies.head(5))
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(movies['overview'])

        # Map each title to its ROW POSITION (not its index label): the
        # TF-IDF matrix and .iloc are positional, while the frame's labels
        # have gaps after the drop() calls done in __init__.
        positions = {}
        for pos, key in enumerate(movies['key_title']):
            positions.setdefault(key, pos)  # first occurrence wins

        idx = positions[title.lower()]
        print("idx is: ",idx)

        # Only the requested row is needed, so avoid building the full
        # N x N similarity matrix.
        sim_row = linear_kernel(tfidf_matrix[idx], tfidf_matrix)[0]

        # Exclude the movie itself explicitly instead of assuming it sorts
        # first (ties or an empty overview would break that assumption).
        candidates = [(pos, score) for pos, score in enumerate(sim_row) if pos != idx]
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        movie_indices = [pos for pos, _ in candidates[:5]]

        result = movies['key_title'].iloc[movie_indices].values
        print(result)
        return result

        
    def weighted_rating(self, x):
        v = x['vote_count']
        R = x['vote_average']
        
        C = self.movie_cleaned['vote_average'].mean()
        m = self.movie_cleaned['vote_count'].quantile(0.90)

        return (v/(v+m) * R) + (m/(m+v) * C)

    
    def recommendations(self, title):

        print("Starting with: ",title)
        metadata = self.movie_cleaned
        
        metadata['score'] = metadata.apply(self.weighted_rating, axis=1)

        
        metadata = metadata.sort_values('score', ascending=False)
        movieRatings = metadata.pivot_table(index=['vote_count', 'vote_average'], columns='key_title',values='score', aggfunc='sum', fill_value=0)  
        #print("movieRatings: ",movieRatings)
        movieRatings = movieRatings.applymap(lambda x: 1 if x > 0 else 0)
        index_movieRatings = pd.DataFrame(cosine_similarity(movieRatings))

        #print("index_movieRatings: ",index_movieRatings)
        index_movieRatings.columns = movieRatings.index
        index_movieRatings['index'] = movieRatings.index
        index_movieRatings = index_movieRatings.set_index('index')

        movie_name_ratings = movieRatings[title.lower()]
       
        similarMovies = movieRatings.corrwith(movie_name_ratings)  
        
        similarMovies = similarMovies.dropna()  
  
        similarMovies.sort_values(ascending=False)  
        print("Type ", type(similarMovies))
        index_list = similarMovies.index.to_list()
        
        #print(type(index_list))
        #print(index_list[1:6])

        return str(index_list[1:6])
        #return ""
    
    def get_recommendations(self, title, cosine_sim, metadata):

        indices = pd.Series(metadata.index, index=metadata['key_title'])
        idx = indices[title]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:6]
        movie_indices = [i[0] for i in sim_scores]

        print( "la lista: ",metadata['title'].iloc[movie_indices].to_list())
        return metadata['title'].iloc[movie_indices].to_list()

    def get_list(self, x):    
        if isinstance(x, list): 
            names = [i['name'] for i in x]        
            if len(names) > 3:
                
                names = names[:3]
            return names
        else:
            x=x.replace("'","\"")
            data = json.loads(x)
            names = [i['name'] for i in data]
            return names
        return []
    
    def create_mix(self,x):
        spoken_languages = ' '.join(x['spoken_languages'])
        genres = ' '.join(x['genres'])
        return spoken_languages + ' '+genres+' '+x['overview'] + ' '

        
    def get_recommendations_by_words(self, title):
        """Recommend movies that share languages, genres and overview words.

        For every movie in ``self.movie_cleaned`` the spoken languages,
        genres and overview are merged into one text, the texts are turned
        into word-count vectors (English stop words removed) and compared
        pairwise with cosine similarity. The ranking itself is delegated to
        ``get_recommendations``.

        Args:
            title: Movie title; lower-cased before it is passed on as the
                ``key_title`` to look up.

        Returns:
            Whatever ``get_recommendations`` returns for the lower-cased
            title (a list of ``title`` values).
        """
        # Columns that play no part in the text mix. errors='ignore' keeps
        # the method working when the ETL output lacks one of them, since
        # none of these columns is read below.
        unused_columns = [
            'budget', 'production_companies', 'production_countries',
            'release_date_clean', 'revenue', 'runtime', 'status', 'tagline',
            'release_date_clean_format', 'retorno', 'month_movies',
            'day_movies', 'year_movies', 'popularity', 'release_date',
            'vote_average', 'vote_count', 'belongs_to_collection_json',
            'production_companies_json', 'belongs_to_collection',
            'original_language',
        ]
        metadata = self.movie_cleaned.drop(unused_columns, axis=1, errors='ignore')

        # Turn the serialized language/genre cells into plain name lists.
        features = ['spoken_languages', 'genres']
        for feature in features:
            metadata[feature] = metadata[feature].apply(self.get_list)

        metadata['mix'] = metadata.apply(self.create_mix, axis=1)

        count = CountVectorizer(stop_words='english')
        count_matrix = count.fit_transform(metadata['mix'])
        cosine_sim = cosine_similarity(count_matrix, count_matrix)

        # Re-number the rows so they line up with the similarity matrix.
        metadata = metadata.reset_index()

        print("title: ",title)
        output = self.get_recommendations(title.lower(),cosine_sim, metadata)
        print(output)
        return output

