In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#load training dataset
ratings = pd.read_csv('train_dt/ratings.csv')

movies_metadata = pd.read_csv('train_dt/movies_metadata.csv')

credits = pd.read_csv('train_dt/credits.csv')

keywords = pd.read_csv('train_dt/keywords.csv')

links = pd.read_csv('train_dt/links.csv')

  movies_metadata = pd.read_csv('train_dt/movies_metadata.csv')


In [3]:
#preprocessing
#movies metadata preprocessing
movies_metadata.dropna(inplace = True)

#delete worng 'id' format data
#ex) ValueError: Unable to parse string "2012-09-29" at position 29502
movies_metadata['isIdRight'] = movies_metadata['id'].str.contains('|'.join('-'))
movies_metadata = movies_metadata[movies_metadata['isIdRight'] == False]
movies_metadata.drop(['isIdRight'], axis = 1, inplace = True)

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies_metadata['id'] = movies_metadata['id'].astype('int')

In [4]:
#function : content_based_recommendation_user
#receives information about genres, actors, directors, and keywords from users and returns 5 recommended movies
#input : kewords, credits, movies_metadata
#output : dataframe with recommendation movie_name with imdb site link
def content_based_recommendation_user(keywords, credits, movies_metadata):
    #content based dataset
    cb_trained_dt = movies_metadata
    cb_trained_dt = cb_trained_dt.merge(credits, on='id')
    cb_trained_dt = cb_trained_dt.merge(keywords, on='id')
    
    from ast import literal_eval
    #Parse the stringified features into their corresponding python objects
    features = ['cast', 'crew', 'keywords', 'genres']
    for feature in features:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(literal_eval)
        
    #apply func : apply functions in parenthese throughout the dataframe
    #related link : https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=wideeyed&logNo=221559041280
    cb_trained_dt['director'] = cb_trained_dt['crew'].apply(get_director)

    features = ['cast', 'keywords', 'genres']
    for feature in features:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(get_list)

    features = ['cast', 'keywords', 'director', 'genres']
    for feature in features:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(clean_data)
        
    #In a column called soup, keyword, cast, director, and genres are all tied together and put values that can measure cosine similarity
    cb_trained_dt['soup'] = cb_trained_dt.apply(create_soup, axis=1)
    
    recommend_movie_list = make_recommendation_user_input(cb_trained_dt)
    
    result = pd.DataFrame(recommend_movie_list, columns = ['movie_name', 'link'])
    for idx in range(0, len(result)):
        print(result['movie_name'][idx])
        temp_link = result['link'][idx].replace(result['link'][idx], "<a href = \"" + result['link'][idx] + "\" >" + result['link'][idx] + "</a>")
        print(temp_link)
    return result
#function : get_director
#A function that only actors (parsing) among movie-related people are kept separately
#input : corpus of movie_related people name
#output : movie_director's name
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
#function : get_list
#If it is complicated with #id, etc., it is a function that parses it and returns it to the list
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        return names

    #Return empty list in case of missing/malformed data
    return []
#fnction : clean_data
#Blankout function
def clean_data(x):
    #Remove blanks inside the list -> [cast, keywords, genres]
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x] #uppercase -> change lowercase + remove blanks
    else:
        #Remove blanks inside string -> [director]
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
#function : create_soup
#Creating a corpus for cosine severity function
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
#get input and return output
def get_searchTerms():
    searchTerms = [] 
    
    print('Write down about movie(Genre, actors, directors, keywords)')
    user_input = input("(if multiple, please separate them with a comma)[Type 'null' if you don't want to type]")
    user_input = " ".join(["".join(n.split()) for n in user_input.lower().split(',')])
    if user_input != 'null':
        searchTerms.append(user_input)

    return searchTerms
#function : make_recommendation_user_input
#receives information about genres, actors, directors, and keywords from users and returns 5 recommended movies
#input : cb_trained_dt
#output : dataframe with recommendation movie_name with imdb site link
def make_recommendation_user_input(cb_trained_dt):
    metadata = cb_trained_dt
    new_row = metadata.iloc[-1,:].copy() #creating a copy of the last row of the 
  #dataset, which we will use to input the user's input
  #grabbing the new wordsoup from the user
    searchTerms = get_searchTerms()  
    new_row.iloc[-1] = " ".join(searchTerms) #adding the input to our new row
  
  #adding the new row to the dataset
    metadata = metadata.append(new_row)
  
  #Vectorizing the entire matrix as described above!
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(metadata['soup'])

  #running pairwise cosine similarity 
    cosine_sim2 = cosine_similarity(count_matrix, count_matrix) #getting a similarity matrix
  
  #sorting cosine similarities by highest to lowest
    sim_scores = list(enumerate(cosine_sim2[-1,:]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  #matching the similarities to the movie titles and ids
    ranked_titles = []
    for i in range(1, 5 + 1):
        indx = sim_scores[i][0] 
        ranked_titles.append([metadata['title'].iloc[indx], 'https://imdb.com/title/' + metadata['imdb_id'].iloc[indx]]) #대신 사이트 주소 출력
  
    return ranked_titles
#main
recommend_movie_list = content_based_recommendation_user(keywords, credits, movies_metadata)

Write down about movie(Genre, actors, directors, keywords)
(if multiple, please separate them with a comma)[Type 'null' if you don't want to type]horror,romance
Love Hina Christmas Special: Silent Eve
<a href = "https://imdb.com/title/tt0414243" >https://imdb.com/title/tt0414243</a>
Hatchet III
<a href = "https://imdb.com/title/tt1870419" >https://imdb.com/title/tt1870419</a>
Macross Frontier: The Wings of Goodbye
<a href = "https://imdb.com/title/tt1566528" >https://imdb.com/title/tt1566528</a>
The Blair Witch Project
<a href = "https://imdb.com/title/tt0185937" >https://imdb.com/title/tt0185937</a>
Hostel
<a href = "https://imdb.com/title/tt0450278" >https://imdb.com/title/tt0450278</a>


  metadata = metadata.append(new_row)


In [5]:
#load test dataset
test_dt = np.load('test_dt/Dataset.npy')
test_dt = pd.DataFrame(test_dt)

test_dt = test_dt[0].str.split(",", expand=True).rename(columns={0:'user_id', 1:"imdb_id", 2:"rating", 3:'rating_date'})

In [6]:
#preprocessing test dataset
movie_info_with_test_dt = pd.merge(test_dt, movies_metadata, on="imdb_id", how="left")
movie_info_with_test_dt = movie_info_with_test_dt.dropna()
movie_info_with_test_dt = movie_info_with_test_dt[['user_id','imdb_id','rating']]

In [8]:
def content_based_recommendation_measure(keywords, credits, movies_metadata, test_dt):
    
    #make test_dataset_different for test
    temp = pd.DataFrame(test_dt['user_id'].value_counts())
    row_list = list(temp.index.values)
    temp.reset_index(inplace = True)
    temp.columns = ['user_id','rating_cnt']
    temp = temp.drop(temp[temp['rating_cnt'] < 11].index)
    temp2 = pd.merge(test_dt, temp)
    temp2 = temp2[['user_id','imdb_id','rating']]
    groups = temp2.groupby('user_id')
    result = dict(list(groups))

    result_top5 = {} #store test set result
    for key, value in result.items():
        value['rating'] = value['rating'].astype(int)
        value.sort_values(by = 'rating', ascending = False, inplace = True)
        result_top5[key] = value.head(6)#5+1
    
    import itertools
    final_test_dt_top500 = dict(itertools.islice(result_top5.items(), 500))
    
    #content based dataset
    cb_trained_dt = movies_metadata
    cb_trained_dt = cb_trained_dt.merge(credits, on='id')
    cb_trained_dt = cb_trained_dt.merge(keywords, on='id')
    
    from ast import literal_eval
    #Parse the stringified features into their corresponding python objects
    features = ['cast', 'crew', 'keywords', 'genres']
    for feature in features:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(literal_eval)
        
    cb_trained_dt['director'] = cb_trained_dt['crew'].apply(get_director)

    features = ['cast', 'keywords', 'genres']
    for feature in features:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(get_list)

    features = ['cast', 'keywords', 'director', 'genres']
    for feature in features:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(clean_data)
        
    cb_trained_dt['soup'] = cb_trained_dt.apply(create_soup, axis=1)
    
    accuracy_list = []#list for measuring accuracy of each test cases
    for key,value in final_test_dt_top500.items():
        #print(key)
        input_imdb_id = value.iloc[0]['imdb_id']
        searchTerms = cb_trained_dt[cb_trained_dt['imdb_id'] == input_imdb_id]['soup']
        
        real_result = list(value['imdb_id'][1:6])
        accuracy_list.append(make_recommendation_for_accuracy(cb_trained_dt, searchTerms, real_result))
    return sum(accuracy_list) / len(accuracy_list)
def make_recommendation_for_accuracy(cb_trained_dt, searchTerms, real_result):
    
    metadata = cb_trained_dt
    new_row = metadata.iloc[-1,:].copy() #creating a copy of the last row of the 
  #dataset, which we will use to input the user's input

  #grabbing the new wordsoup from the user
    new_row.iloc[-1] = " ".join(searchTerms) #adding the input to our new row
  
  #adding the new row to the dataset
    metadata = metadata.append(new_row) #-> replace because of FutureWarning
    #metadata = pd.concat([metadata, new_row])
  
  #Vectorizing the entire matrix as described above!
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(metadata['soup'])

  #running pairwise cosine similarity 
    cosine_sim2 = cosine_similarity(count_matrix, count_matrix) #getting a similarity matrix
  
  #sorting cosine similarities by highest to lowest
    sim_scores = list(enumerate(cosine_sim2[-1,:]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  #matching the similarities to the movie titles and ids
    predict_val = []
    #doing just TOP 5
    for i in range(1, 10 + 1):
        indx = sim_scores[i][0]
        predict_val.append(metadata['imdb_id'].iloc[indx]) #대신 사이트 주소 출력

    intersection_list = list(set(predict_val) & set(real_result))
    accuracy = len(intersection_list) / 5
    return accuracy
accuracy = content_based_recommendation_measure(keywords, credits, movies_metadata, movie_info_with_test_dt)

































In [9]:
accuracy

0.0808