# Imports

In [1]:
import pandas as pd
import numpy as np

from itertools import combinations

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Mounting Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading Data Frames

In [3]:
movies = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/one-m-capstone/data/processed_movies.csv').drop(columns=['Unnamed: 0'])
combined = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/one-m-capstone/data/combined.csv')

# Content Engine

## Get a user's favortie genres
For the purposes of this notebook we will simulate genre choices from our combined data frame. In the simulation we will assume that if a user rates a movie 5 stars they will like that genre. This is a naive approach and there is room for improvement. However, the goal of the simulation is to emulate what the user will be guided through in the app.

In [None]:
# Simulating for user_id == 1
user_one = combined[combined['user_id'] == 2]

user_one = user_one[['title','genres','rating']]
user_one['genres'] = user_one['genres'].str.split('|')
user_one['genres'] = user_one['genres'].str.join(' ').astype(str)

# Only look at movies with 5 star rating
fav_genres = user_one[user_one['rating'] == 5]['genres'].unique()
fav_genres

array(['Action Drama War', 'Drama', 'Comedy Romance War',
       'Action Adventure Comedy Romance', 'Action Adventure Sci-Fi',
       'Adventure Drama Western', 'Drama Thriller', 'Drama Romance War',
       'Action Adventure Drama Sci-Fi War', 'Drama Romance',
       'Adventure Comedy Drama', 'Action Thriller', 'Comedy Drama',
       'Crime Drama', 'Action Drama', 'Action Comedy Crime Drama',
       'Drama Sci-Fi'], dtype=object)

In [None]:
type(fav_genres)

numpy.ndarray

## Generate Random Movie Suggestions based on a user's most viewed content

Since our Collaborative Engine will suffer from the cold-start problem we need to find videos a user hasn't watched yet and create a system to begin to generate their preferences. To achieve this we can create a function to randomly select a title from each genre the user likes. In the app we will ask the user to watch or rate thes movies. If a user says I don't want to watch that movie, we can fall back to a function that randomly selects another movie from that genre.

In [None]:
def get_random_movies(fav_genres):
  # takes in favorite genres and returns a randomly selected choice of movies
  # from the user's favorite genres
  rand_movies = []

  # Find one random movie from each of the favorite genres that shares the genres 
  for genre in fav_genres:
    genre_movies = movies[movies['genres'] == genre]['title']
    rand_movies.append(genre_movies.sample().values[0])

  return rand_movies

def get_movie_from_genre(genre):
    '''
      Just like get_random_movies but for a singular genre. This will be 
      used to replace movies that are not enjoyed.

      Takes in a genre, randomly selects another movie
    '''

    genre_movies = movies[movies['genres'] == genre]['title']
    return genre_movies.sample()


starter_movies = get_random_movies(fav_genres)
starter_movies

['Thin Red Line, The (1998)',
 'Official Story, The (La Historia Oficial) (1985)',
 'Forrest Gump (1994)',
 'Romancing the Stone (1984)',
 'Time Tracers (1995)',
 'Dances with Wolves (1990)',
 'Gingerbread Man, The (1998)',
 'Gone with the Wind (1939)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Phenomenon (1996)',
 'Stand by Me (1986)',
 'Natural Born Killers (1994)',
 'Patch Adams (1998)',
 'Impact (1949)',
 'Death Wish 4: The Crackdown (1987)',
 "Man Bites Dog (C'est arriv� pr�s de chez vous) (1992)",
 'Day the Earth Stood Still, The (1951)']

## Content Based Filtering

This is a simple content based filtering algorithm that will return the 5 most similar titles to a given movie based on their genre similarity. Once again there is room for imporvement here, but the naive approach is intentional as it is meant to simply solve the cold-start problem. This code is a modified version of [khanhnamle1994's](https://github.com/khanhnamle1994/movielens) code.
Special thanks.

In [None]:
# Thank you to https://github.com/khanhnamle1994 with genre_functions

# Create a tf-idf matrix of genres found
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(movies['genres'])

# Cosine similarity is done implicitly by the l2 normalization applied from the tf-idf 
# matrix, we can just use linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def genre_recommendations(title):
  # Looks at a title and returns the top 5 most similar titles
  titles = movies['title']
  indices = pd.Series(movies.index, index=movies['title'])
  idx = indices[title]
  sim_scores = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  top_scores = sim_scores[1:6]
  movie_indices = [i[0] for i in top_scores]
  return titles.iloc[movie_indices]

In [None]:
genre_recommendations(starter_movies[0])

461           Heaven & Earth (1993)
1204       Full Metal Jacket (1987)
1214    Boat, The (Das Boot) (1981)
1222                   Glory (1989)
1545               G.I. Jane (1997)
Name: title, dtype: object