MOVIE RECOMMENDATION SYSTEM


DATA COLLECTION AND VISUALIZATION 

In [1]:
import pandas as pd

# Read the movie dataset from data.csv into a DataFrame named `movies`
movies = pd.read_csv(
    'data.csv',
)
movies.head(n=5)  # Preview the first five rows

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [2]:
# List the column labels available in the loaded dataset
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [3]:
# FEATURE SELECTION --> keep ['id','title','genre','overview'] because these help in matching with the user's preferences.
# .copy() makes the subset an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
movies = movies[['id', 'title', 'genre', 'overview']].copy()
movies.head()

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...


In [4]:
# CREATE A NEW COLUMN CALLED 'content' TO BE USED TO MATCH WITH THE USER'S PREFERENCES
# Missing genre/overview values are replaced with '' before concatenating:
# otherwise a single NaN makes the whole 'content' value NaN, throwing away the
# field that *was* present (and NaN later becomes the literal word "nan" when
# the column is cast to str for vectorization).
# .assign() returns a new frame, so no SettingWithCopyWarning is raised.
movies = (
    movies
    .assign(
        content=lambda frame: (
            frame['genre'].fillna('') + ' ' + frame['overview'].fillna('')
        ).str.strip()
    )
    # 'genre' and 'overview' are now folded into 'content', so drop them
    .drop(columns=['genre', 'overview'])
)

movies.head()

Unnamed: 0,id,title,content
0,278,The Shawshank Redemption,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime In the continuing saga of the Corl..."


BUILDING TF-IDF VECTORS USING THE SCIKIT-LEARN LIBRARY

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Turn each movie's 'content' text into a TF-IDF feature vector: every column
# is a vocabulary term and every value reflects how important that term is to
# the document. English stop words are ignored and the vocabulary is capped
# at 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)

# Cast the column to unicode strings, fit the vectorizer on it, and densify
# the resulting sparse matrix into a NumPy array.
vector = vectorizer.fit_transform(
    movies['content'].values.astype('U')
).toarray()

vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

COMPUTE COSINE SIMILARITY

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Movie-to-movie similarity: each movie is compared with every other movie in
# the dataset based on its content vector. recommend() does not read this
# matrix (it scores the query against the movie vectors directly), but it can
# be used to suggest movies similar to a given movie.
similarity = cosine_similarity(vector)

# Sanity check: the matrix must be square with one row/column per movie vector.
# (A bare `similarity.shape` in the middle of a cell is evaluated and discarded,
# so it verifies nothing.)
assert similarity.shape == (vector.shape[0], vector.shape[0]), similarity.shape
similarity

array([[1.        , 0.04866835, 0.07810835, ..., 0.13338915, 0.10728665,
        0.05613979],
       [0.04866835, 1.        , 0.04454462, ..., 0.        , 0.01072361,
        0.        ],
       [0.07810835, 0.04454462, 1.        , ..., 0.03501509, 0.06540598,
        0.05426955],
       ...,
       [0.13338915, 0.        , 0.03501509, ..., 1.        , 0.01450114,
        0.00989724],
       [0.10728665, 0.01072361, 0.06540598, ..., 0.01450114, 1.        ,
        0.01756019],
       [0.05613979, 0.        , 0.05426955, ..., 0.00989724, 0.01756019,
        1.        ]])

DEFINE RECOMMENDATION FUNCTION

In [22]:
def recommend(user_input, movies, vectorizer, similarity, top_n=5, movie_vectors=None):
    """Return the movies whose content best matches a free-text query.

    Parameters
    ----------
    user_input : str
        Free-text description of what the user wants to watch.
    movies : pandas.DataFrame
        Frame with at least 'title' and 'content' columns. Row ``i`` must
        correspond to row ``i`` of the movie vectors (rows are selected
        positionally with ``iloc``).
    vectorizer : object
        Fitted vectorizer exposing ``transform``; used to embed the query in
        the same feature space as the movie vectors.
    similarity : object
        Not used by this function. Kept only so existing calls that pass the
        movie-to-movie similarity matrix keep working.
    top_n : int, default 5
        Maximum number of movies to return. Values <= 0 yield an empty frame.
    movie_vectors : array-like, optional
        Feature matrix of the movies (one row per movie). When omitted, the
        notebook-level ``vector`` is used, which is what this function has
        always read; pass it explicitly to avoid relying on kernel state.

    Returns
    -------
    pandas.DataFrame
        The 'title' and 'content' columns of the best matches, most similar
        first.
    """
    if movie_vectors is None:
        # Backward-compatible default: fall back to the notebook global.
        movie_vectors = vector

    # Guard: with top_n == 0 the slice [-0:] would select *every* movie, and a
    # negative top_n would select an arbitrary tail; return nothing instead.
    if top_n <= 0:
        return movies.iloc[:0][['title', 'content']]

    # Convert the user's response to a vector
    user_vector = vectorizer.transform([user_input]).toarray()
    # Compute the query-to-movie similarity and flatten the (1, n) result to 1D
    user_similarity = cosine_similarity(user_vector, movie_vectors).flatten()
    # argsort is ascending, so take the last top_n indices and reverse them to
    # put the most similar movie first
    top_indices = user_similarity.argsort()[-top_n:][::-1]
    return movies.iloc[top_indices][['title', 'content']]
    

TESTING THE RECOMMENDATION FUNCTION

In [23]:
# Try the recommender on an example free-text query
user_query = "I like action movies set in space"
recommendations = recommend(
    user_input=user_query,
    movies=movies,
    vectorizer=vectorizer,
    similarity=similarity,
    top_n=5,
)
recommendations

Unnamed: 0,title,content
9498,Lost in Space,"Science Fiction,Adventure The prospects for co..."
1162,Gattaca,"Thriller,Science Fiction,Mystery,Romance In a ..."
4973,Space Pirate Captain Harlock,"Animation,Science Fiction Space Pirate Captain..."
5440,Batman: Return of the Caped Crusaders,"Action,Animation,Adventure,Crime,Science Ficti..."
9920,Space Chimps,"Animation,Family,Adventure,Comedy,Science Fict..."


SAVE MODELS AND BUILD A FRONT-END USING STREAMLIT

In [9]:
import pickle
# Persist the artifacts to disk as pickle files.
# Each file is opened in a `with` block so its handle is flushed and closed
# deterministically; passing a bare open(...) to pickle.dump leaves the handle
# open until garbage collection.
# NOTE(review): `similarity` is a movies x movies matrix and recommend() does
# not read it -- confirm the front-end actually needs similarity.pkl.
for filename, artifact in (
    ('movies_list.pkl', movies),
    ('similarity.pkl', similarity),
    ('vectorizer.pkl', vectorizer),
    ('vector.pkl', vector),
):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)
