# Movies reccomendation System

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Reading csv file

df=pd.read_csv("movies.csv")

In [3]:
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [4]:
#Checking size of dataframe

df.shape

(9999, 9)

In [5]:
# Checking for Missing Values

df.isna().sum()

MOVIES         0
YEAR         644
GENRE         80
RATING      1820
ONE-LINE       0
STARS          0
VOTES       1820
RunTime     2958
Gross       9539
dtype: int64

In [6]:
print("Missing Values:\n")
for col in df.columns:
    missing = df[col].isna().sum()
    percent = missing / df.shape[0] * 100
    print("%s:%.2f%% (%d)" % (col,percent,missing))



Missing Values:

MOVIES:0.00% (0)
YEAR:6.44% (644)
GENRE:0.80% (80)
RATING:18.20% (1820)
ONE-LINE:0.00% (0)
STARS:0.00% (0)
VOTES:18.20% (1820)
RunTime:29.58% (2958)
Gross:95.40% (9539)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


In [8]:
# Removing "\n" from GENRE, ONE-LINE, and STARS columns
for col in ['GENRE','ONE-LINE','STARS']:
    df[col] = df[col].str.replace("\n","").str.strip()






In [9]:
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,Director:Peter Thorwarth| Stars:Peri Baume...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,"Stars:Chris Wood, Sarah Michelle Gellar, Lena ...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:Andrew Lincoln, Norman Reedus, Melissa M...",885805.0,44.0,
3,Rick and Morty,(2013– ),"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,"Stars:Justin Roiland, Chris Parnell, Spencer G...",414849.0,23.0,
4,Army of Thieves,(2021),"Action, Crime, Horror",,"A prequel, set before the events of Army of th...",Director:Matthias Schweighöfer| Stars:Matt...,,,


In [10]:
# Creating New Column Director and Stars by extracting Director(s) and Stars from orignial STARS column
def extract_director(direc):
    if 'Director' in direc or 'Directors' in direc:
        director = direc.strip().split("|")[0] # The Second Half is the stars
        return director.split(":")[1] # Return the Director name
    else:
        return ''

def extract_stars(stars):
    if 'Star' not in stars or 'Stars' not in stars:
        return ''
    else:
        return stars.split(":")[-1] # last value in this list will be the stars

df['Director'] = df['STARS'].apply(lambda d: extract_director(d))
df['Stars'] = df['STARS'].apply(lambda s: extract_stars(s))

# View head of these columns
df[['STARS','Director','Stars']].head()



Unnamed: 0,STARS,Director,Stars
0,Director:Peter Thorwarth| Stars:Peri Baume...,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc..."
1,"Stars:Chris Wood, Sarah Michelle Gellar, Lena ...",,"Chris Wood, Sarah Michelle Gellar, Lena Headey..."
2,"Stars:Andrew Lincoln, Norman Reedus, Melissa M...",,"Andrew Lincoln, Norman Reedus, Melissa McBride..."
3,"Stars:Justin Roiland, Chris Parnell, Spencer G...",,"Justin Roiland, Chris Parnell, Spencer Grammer..."
4,Director:Matthias Schweighöfer| Stars:Matt...,Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby..."


# Data Visualization

In [11]:
# Extracting Year from original YEARS column

df['Year'] = df['YEAR'].str.extract(r'([0-9]{4}–.*|[0-9]{4})')
df['Year'] = df['Year'].str.strip().replace(")","")

def extract_year(year):
    if year[-3:] == '– )':
        return year.replace('– )',"–")
    else:
        return year.replace(')',"")

df['Year'] = df['Year'].fillna('Unknown')
df['Year'] = df['Year'].apply(lambda y: extract_year(y))
    
year_count = df[df['Year'] != 'Unknown']['Year'].value_counts().reset_index().rename(columns = {'Year':'Count','index':'Year'})
year_count.head()

Unnamed: 0,Year,Count
0,2020–,898
1,2020,742
2,2021–,661
3,2019,657
4,2019–,553


In [12]:
colors = ['paleturquoise'] * 10
colors[0],colors[2],colors[4],colors[-1] = 'darkcyan','darkcyan','darkcyan','darkcyan'

fig = px.bar(data_frame = year_count.head(10),
             x = 'Year', y = 'Count')

fig.update_traces(marker_color = colors)

fig.update_layout(title = 'Year(s) Distribution')

fig.show()

In [15]:
print("Statistical value of [{}]".format('Rating'))

# Average Rating 
print("Mean:", round(df['RATING'].mean(),2))

# Median Rating
print("Median:", df['RATING'].median())

# Max Rating
print("Max:", df['RATING'].max())

Statistical value of [Rating]
Mean: 6.92
Median: 7.1
Max: 9.9


In [18]:
fig = px.bar(data_frame = df['RATING'].value_counts().reset_index().head(10),
             x = 'index', y = 'RATING',
             title = 'Rating Distribution')

fig.update_yaxes(title = 'Count')

fig.update_xaxes(type ='category',
                 title = 'Rating (out of 10)')

fig.show()

In [19]:
fig = px.bar(data_frame = df['RunTime'].value_counts().reset_index().head(10),
             x = 'index', y = 'RunTime',
             title = 'Runtime Distribution')

fig.update_yaxes(title = 'Count')

fig.update_xaxes(type ='category',
                 title = 'Runtime (mins)')

fig.show()

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
 9   Director  9999 non-null   object 
 10  Stars     9999 non-null   object 
 11  Year      9999 non-null   object 
dtypes: float64(2), object(10)
memory usage: 937.5+ KB


In [21]:
df['VOTES'] = df['VOTES'].str.replace(",","")
df['VOTES'] 

0        21062
1        17870
2       885805
3       414849
4          NaN
         ...  
9994       NaN
9995       NaN
9996       NaN
9997       NaN
9998       NaN
Name: VOTES, Length: 9999, dtype: object

In [22]:
df['VOTES'] = df['VOTES'].fillna(0)
df['VOTES'] = df['VOTES'].astype(int)
df['VOTES'].sort_values(ascending = False)



111     1713028
193     1691777
16      1552311
260     1529752
132     1227522
         ...   
7039          0
7040          0
7041          0
7042          0
9998          0
Name: VOTES, Length: 9999, dtype: int32

In [25]:
movie_genre = df['GENRE'].value_counts().reset_index().rename(columns={'GENRE':'Count','index':'Genre'})

fig = px.bar(data_frame = movie_genre.sort_values(by='Count',ascending = False).head(10),
             x = 'Genre', y = 'Count')

fig.update_layout(title = 'Top 10 Genre Combination')

fig.show()

In [27]:
# Count number of Genre
from collections import Counter

genre_raw = df['GENRE'].dropna().to_list()
genre_list = list()

for genres in genre_raw:
    genres = genres.split(", ")
    for g in genres:
        genre_list.append(g)
        
genre_df = pd.DataFrame.from_dict(Counter(genre_list), orient = 'index').rename(columns = {0:'Count'})
genre_df.head()

Unnamed: 0,Count
Action,2258
Horror,553
Thriller,910
Animation,1732
Adventure,1792


In [28]:
# Genre Count Ditribution
fig = px.pie(data_frame = genre_df,
             values = 'Count',
             names = genre_df.index,
             color_discrete_sequence = px.colors.qualitative.Safe)

fig.update_traces(textposition = 'inside',
                  textinfo = 'label+percent',
                  pull = [0.05] * len(genre_df.index.to_list()))

fig.update_layout(title = {'text':'Genre Distribution'},
                  legend_title = 'Gender',
                  uniformtext_minsize=13,
                  uniformtext_mode='hide',
                  font = dict(
                      family = "Courier New, monospace",
                      size = 18,
                      color = 'black'
                  ))


fig.show()

# Content based filtering

In [53]:
# Features  using GENRE, RATING??, ONE-LINE, RunTime??, Director, Stars

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

features = ['GENRE','ONE-LINE','Director','Stars']

# Filling in missing values with Blank String
for feature in features:
    df[feature] = df[feature].fillna("")

df['combined_features'] = df['GENRE'] + " " + df['ONE-LINE'] + " " + df['Director'] + " " + df['Stars'] 
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(count_matrix)

In [69]:
# Function for movie recommendation
def movie_recommendation(mov,sim_num = 5):

    user_choice = mov
    
    try:
        ref_index = df[df['MOVIES'].str.contains(user_choice, case = False)].index[0]

        similar_movies = list(enumerate(cosine_sim[ref_index]))

        sorted_simmilar_movies = sorted(similar_movies, key = lambda x: x[1], reverse = True)[1:]

        print('\nRecomended Movies for [{}]'.format(user_choice))
        print('-'*(24 + len(user_choice)))

        for i, element in enumerate(sorted_simmilar_movies):
            similar_movie_id = element[0]
            similar_movie_title = df['MOVIES'].iloc[similar_movie_id]
            s_score = element[1]
            print('{:40} -> {:.3f}'.format(similar_movie_title, s_score))

            if i > sim_num:
                break
    except IndexError:
        print("\n[{}] is not in our database!".format(user_choice))
        print("We couldn't recommend anyting...Sorry...")



In [70]:
# Search for movie with the keyword
def movie_available(key):
    
    keyword = key
    
    print("Movie with keyword: [{}]".format(keyword))
    
    for i, mov in enumerate(df[df['MOVIES'].str.contains(keyword)]['MOVIES'].to_list()):
        print("{}) {} ".format(i+1,mov))

In [71]:
# Running the Function
movie_available("Spider")

Movie with keyword: [Spider]
1) Spider-Man: Far from Home 
2) Spider-Man 
3) The Girl in the Spider's Web 
4) Escape From Spiderhead 
5) The Spectacular Spider-Man 


In [72]:
# Running the Function 
movie_recommendation("Lucifer")


Recomended Movies for [Lucifer]
-------------------------------
The Upper World                          -> 0.454
 Lucifer                                 -> 0.439
Daniel Sloss: Live Shows                 -> 0.408
Kaamyaab                                 -> 0.401
Strange Weather                          -> 0.394
Rake                                     -> 0.394
Mercenaire                               -> 0.392


In [73]:
# Running the Function with argument
movie_recommendation("Spider-Man: Far from home",10)


Recomended Movies for [Spider-Man: Far from home]
-------------------------------------------------
Batman v Superman: Dawn of Justice       -> 0.313
Quantum Quest: A Cassini Space Odyssey   -> 0.309
Kong: Skull Island                       -> 0.305
Liu lang di qiu                          -> 0.300
Jumanji: The Next Level                  -> 0.293
Avatar: The Last Airbender               -> 0.291
Jupiter's Legacy                         -> 0.290
 Avatar: The Last Airbender              -> 0.289
The Division                             -> 0.288
This Perfect Day                         -> 0.287
The Hitman's Bodyguard                   -> 0.286
Mercenaire                               -> 0.281


# Thank You