In [99]:
import pandas as pd
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from numpy import linalg as LA
import json
import math
%matplotlib inline
import matplotlib.pyplot as plt

# Load the CSV file into a Pandas dataframe
df = pd.read_csv(''data/output.csv'')

# Rename the "sypnopsis" column to "synopsis"
df = df.rename(columns={'sypnopsis': 'synopsis'})

# Drop Duplicate names
df.drop_duplicates(subset='Name', inplace=True)


In [100]:
anime_id_to_index = {anime_id:index for index, anime_id in enumerate(df['MAL_ID'])}
anime_name_to_id = {name:mid for name, mid in zip(df['Name'], df['MAL_ID'])}
anime_id_to_name = {v:k for k,v in anime_name_to_id.items()}
anime_name_to_index = {name:anime_id_to_index[anime_name_to_id[name]] for name in df['Name']}
anime_index_to_name = {v:k for k,v in anime_name_to_index.items()}


In [101]:
n_feats = 5000
doc_by_vocab = np.empty([len(df), n_feats])

def build_vectorizer(max_features, stop_words, max_df=0.8, min_df=10, norm='l2'):
    """Returns a TfidfVectorizer object with the above preprocessing properties.
    
    Note: This function may log a deprecation warning. This is normal, and you
    can simply ignore it.
    
    Parameters
    ----------
    max_features : int
        Corresponds to 'max_features' parameter of the sklearn TfidfVectorizer 
        constructer.
    stop_words : str
        Corresponds to 'stop_words' parameter of the sklearn TfidfVectorizer constructer. 
    max_df : float
        Corresponds to 'max_df' parameter of the sklearn TfidfVectorizer constructer. 
    min_df : float
        Corresponds to 'min_df' parameter of the sklearn TfidfVectorizer constructer. 
    norm : str
        Corresponds to 'norm' parameter of the sklearn TfidfVectorizer constructer. 

    Returns
    -------
    TfidfVectorizer
        A TfidfVectorizer object with the given parameters as its preprocessing properties.
    """
    # YOUR CODE HERE
    vectorizer = TfidfVectorizer(max_features = max_features, stop_words=stop_words, max_df=max_df, min_df=min_df, norm=norm)
    return vectorizer

In [102]:
tfidf_vec = build_vectorizer(n_feats, "english")
doc_by_vocab = tfidf_vec.fit_transform(df['synopsis'].values.astype('U')).toarray()
index_to_vocab = {i:v for i, v in enumerate(tfidf_vec.get_feature_names())}

In [103]:
print(doc_by_vocab)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.39281937 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [104]:
def get_sim(mov1, mov2, input_doc_mat, input_movie_name_to_index):
    """Returns a float giving the cosine similarity of 
       the two movie transcripts.
    
    Params: {mov1 (str): Name of the first movie.
             mov2 (str): Name of the second movie.
             input_doc_mat (numpy.ndarray): Term-document matrix of movie transcripts, where 
                    each row represents a document (movie transcript) and each column represents a term.
             movie_name_to_index (dict): Dictionary that maps movie names to the corresponding row index 
                    in the term-document matrix.}
    Returns: Float (Cosine similarity of the two movie transcripts.)
    """
    # YOUR CODE HERE
    index1 = input_movie_name_to_index[mov1]
    index2 = input_movie_name_to_index[mov2]
    arr1 = input_doc_mat[index1]
    arr2 = input_doc_mat[index2]
    numerator = np.dot(arr1,arr2)
    denomenator = LA.norm(arr1)*LA.norm(arr2)
    
    return numerator/denomenator

In [105]:
test1 = get_sim('Trigun', 'Cowboy Bebop', doc_by_vocab, anime_name_to_index)

In [106]:
print(test1)

0.036221562027022834


In [107]:
def build_movie_sims_cos(n_mov, movie_index_to_name, input_doc_mat, movie_name_to_index, input_get_sim_method):
    """Returns a movie_sims matrix of size (num_movies,num_movies) where for (i,j):
        [i,j] should be the cosine similarity between the movie with index i and the movie with index j
        
    Note: You should set values on the diagonal to 1
    to indicate that all movies are trivially perfectly similar to themselves.
    
    Params: {n_mov: Integer, the number of movies
             movie_index_to_name: Dictionary, a dictionary that maps movie index to name
             input_doc_mat: Numpy Array, a numpy array that represents the document-term matrix
             movie_name_to_index: Dictionary, a dictionary that maps movie names to index
             input_get_sim_method: Function, a function to compute cosine similarity}
    Returns: Numpy Array 
    """
    # YOUR CODE HERE
    movie_sims_matrix = np.zeros((n_mov, n_mov))
    
    for i in range(0, n_mov):
        for j in range(0, n_mov):
            sim_score = input_get_sim_method(movie_index_to_name[i], movie_index_to_name[j], input_doc_mat, movie_name_to_index)
            movie_sims_matrix[i][j] = sim_score
    
    return movie_sims_matrix

In [108]:
movie_sims_cos = build_movie_sims_cos(1000, anime_index_to_name, doc_by_vocab, anime_name_to_index, get_sim)

In [109]:
def build_movie_sims_jac(n_mov, input_data):
    """Returns a movie_sims_jac matrix of size (num_movies,num_movies) where for (i,j) :
        [i,j] should be the jaccard similarity between the category sets for movies i and j
        such that movie_sims_jac[i,j] = movie_sims_jac[j,i]. 
        
    Note: 
        Movies sometimes contain *duplicate* categories! You should only count a category once
        
        A movie should have a jaccard similarity of 1.0 with itself.
    
    Params: {n_mov: Integer, the number of movies,
            input_data: List<Dictionary>, a list of dictionaries where each dictionary 
                     represents the movie_script_data including the script and the metadata of each movie script}
    Returns: Numpy Array 
    """
    genre_sims = np.zeros((n_mov, n_mov))
    
    # YOUR CODE HERE
    for i in range (0, n_mov):
        for j in range (0, n_mov):
            Al = input_data[i].split(',')
            Al = [s.strip() for s in Al]
            A = set(Al)
            
            Bl = input_data[j].split(',')
            Bl = [s.strip() for s in Bl]
            B = set(Bl)
            if(len(A.union(B)) > 0):
                jac_sim = len(A.intersection(B))/len(A.union(B))
            genre_sims[i][j] = jac_sim
            genre_sims[j][i] = jac_sim
            
    
    return genre_sims

In [110]:
movie_sims_jac = build_movie_sims_jac(1000,df['Genres'])

In [111]:
test1 = movie_sims_jac[anime_name_to_index['Cowboy Bebop'] , anime_name_to_index['Cowboy Bebop']]
test2 = movie_sims_jac[anime_name_to_index['Cowboy Bebop'] , anime_name_to_index['Cowboy Bebop: Tengoku no Tobira']]
test3 = movie_sims_jac[anime_name_to_index['Hungry Heart: Wild Striker'] , anime_name_to_index['Cowboy Bebop: Tengoku no Tobira']]

In [112]:
print(test1) #equals 1
print(test2)
print(test3) #equal 0

1.0
0.5714285714285714
0.0


In [113]:
def get_ranked_movies(mov, matrix):
    """
    Return sorted rankings (most to least similar) of movies as 
    a list of two-element tuples, where the first element is the 
    movie name and the second element is the similarity score
    
    Params: {mov: String,
             matrix: np.ndarray}
    Returns: List<Tuple>
    """
    
    # Get movie index from movie name
    mov_idx = anime_name_to_index[mov]
    
    # Get list of similarity scores for movie
    score_lst = matrix[mov_idx]
    mov_score_lst = [(anime_index_to_name[i], s) for i,s in enumerate(score_lst)]
    
    # Do not account for movie itself in ranking
    mov_score_lst = mov_score_lst[:mov_idx] + mov_score_lst[mov_idx+1:]
    
    # Sort rankings by score
    mov_score_lst = sorted(mov_score_lst, key=lambda x: -x[1])
    
    return mov_score_lst

In [135]:
def multiply_jac_sim(anime, genres, arr):
    # Get movie index from movie name
    anime_idx = anime_name_to_index[anime]
    score_lst = []
    
    for i,tup in enumerate(arr):
        
        A = set(genres)
        l = df['Genres'][anime_name_to_index[tup[0]]].split(',')
        l = [s.strip() for s in l]
        B = set(l)
        jac_sim = 0
        if(len(A.union(B)) > 0):
            jac_sim = len(A.intersection(B))/len(A.union(B))        
        arr[i] = (tup[0], tup[1]*jac_sim)
        
    arr = sorted(arr, key=lambda x: -x[1])

    return arr
    
    

In [133]:
def multiply_ratings(arr):
    for i, tup in enumerate(arr):
        score = df['Score'][i]
        score = 'hi'
        try:
            score = float(score)
        except:
            score = 5
        arr[i] = (tup[0], tup[1]*score)
        
    arr = sorted(arr, key=lambda x: -x[1])
    return arr

In [139]:
###################
## Recommendation for Cowboy Bepop
test_anime = 'Cowboy Bebop'
initial_ranking = get_ranked_movies(test_anime , movie_sims_cos)
ranking_jac = multiply_jac_sim(test_anime, ["Action", "Drama"] ,initial_ranking)
ranking_score = multiply_ratings(ranking_jac)

for (anime, score) in ranking_score[:10]:
        print("%.3f %s" % (score, anime))

0.402 Cowboy Bebop: Tengoku no Tobira
0.188 Youjuu Toshi
0.167 Uchuu Kaizoku Captain Herlock
0.165 One: Kagayaku Kisetsu e
0.159 Noir
0.143 WeiÃŸ Kreuz OVA
0.131 Fullmetal Alchemist
0.130 Seihou Bukyou Outlaw Star
0.127 Mobile Suit Gundam SEED
0.121 Uchuu no Stellvia


To-Do:

##### Changes movie -> Anime
##### Only on 1000
##### Can we use global variables?
##### Tests? 
##### Anime has no Transcript?
##### I sorted after each adjustment so I could look at it, we could only do it once tho in the final product
#####  Relevancy score are low rn because of the transcript matching, if they get higher we may want to log ratings