# **Movie Recommender on the basis of Release Date of Movie(s)**

In [1]:
import numpy as np
import pandas as pd

# **1. create 2 dataframes netflixMovie_df and imdbMovie_df**

**preprocessing netflix dataframe**

In [2]:
netflixMovie_df = pd.read_csv('Netflix_Dataset_Movie.csv')
netflixRating_df = pd.read_csv('Netflix_Dataset_Rating.csv')

In [3]:
# we don't need rating dataframe as there is no year of launch of movies
netflixRating_df.head(1)

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3


In [4]:
netflixMovie_df

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [5]:
# we need movie dataFrame of only netflixMovie_df = ['movie_names', 'release date']

netflixMovie_df = netflixMovie_df[['Name', 'Year']]
netflixMovie_df

Unnamed: 0,Name,Year
0,Dinosaur Planet,2003
1,Isle of Man TT 2004 Review,2004
2,Character,1997
3,Paula Abdul's Get Up & Dance,1994
4,The Rise and Fall of ECW,2004
...,...,...
17765,Where the Wild Things Are and Other Maurice Se...,2002
17766,Fidel Castro: American Experience,2004
17767,Epoch,2000
17768,The Company,2003


In [6]:
# convert 'Name' -> 'Movie_Title'
# convert 'Year' -> 'Released_Year'

netflixMovie_df = netflixMovie_df.rename(columns={'Name': 'Movie_Title'})
netflixMovie_df = netflixMovie_df.rename(columns={'Year': 'Released_Year'})

In [7]:
netflixMovie_df.head(1)

Unnamed: 0,Movie_Title,Released_Year
0,Dinosaur Planet,2003


**preprocessing imdb dataframe**

In [8]:
imdbMovie_df = pd.read_csv('imdb_top_1000.csv')

In [9]:
imdbMovie_df.head(1)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469


In [10]:
# we need movie dataFrame of only imdbMovie_df = ['movie_names', 'release date']

imdbMovie_df = imdbMovie_df[['Series_Title', 'Released_Year']]

In [11]:
imdbMovie_df.head(1)

Unnamed: 0,Series_Title,Released_Year
0,The Shawshank Redemption,1994


In [12]:
# convert 'Series_Title' -> 'Movie_Title'

imdbMovie_df = imdbMovie_df.rename(columns={'Series_Title': 'Movie_Title'})

In [13]:
imdbMovie_df.head(1)

Unnamed: 0,Movie_Title,Released_Year
0,The Shawshank Redemption,1994


In [14]:
# check if there is some incorrect string value in imdbMovie_df & store it in movie_idx

def checkIncorrectValues():
    movie_idx = -1
    for i in range(len(imdbMovie_df['Released_Year'])):
        if(imdbMovie_df['Released_Year'].iloc[i] != 'PG'):
            imdbMovie_df['Released_Year'].iloc[i] = int(imdbMovie_df['Released_Year'].iloc[i])
        else:
            movie_idx = i
    return movie_idx

In [15]:
obtained_movie_idx = checkIncorrectValues()

In [16]:
# check the incorrect 'Movie_Title' value in imdbMovie_df

imdbMovie_df['Movie_Title'][obtained_movie_idx]

'Apollo 13'

In [17]:
# fill the movie's 'Released_Year' w/ movie's release date

imdbMovie_df['Released_Year'][obtained_movie_idx] = 1970

# **2. Review of netflixMovie_df and imdbMovie_df**

In [18]:
netflixMovie_df

Unnamed: 0,Movie_Title,Released_Year
0,Dinosaur Planet,2003
1,Isle of Man TT 2004 Review,2004
2,Character,1997
3,Paula Abdul's Get Up & Dance,1994
4,The Rise and Fall of ECW,2004
...,...,...
17765,Where the Wild Things Are and Other Maurice Se...,2002
17766,Fidel Castro: American Experience,2004
17767,Epoch,2000
17768,The Company,2003


In [19]:
imdbMovie_df

Unnamed: 0,Movie_Title,Released_Year
0,The Shawshank Redemption,1994
1,The Godfather,1972
2,The Dark Knight,2008
3,The Godfather: Part II,1974
4,12 Angry Men,1957
...,...,...
995,Breakfast at Tiffany's,1961
996,Giant,1956
997,From Here to Eternity,1953
998,Lifeboat,1944


In [20]:
print('shape of netflixMovie_df  : ', netflixMovie_df.shape)
print('shape of imdbMovie_df     : ', imdbMovie_df.shape)

shape of netflixMovie_df  :  (17770, 2)
shape of imdbMovie_df     :  (1000, 2)


# **3. merge netflixMovie_df & imdbMovie_df to form yearMovie_df** 

In [21]:
# now merge the two dataframes into yearMovie_df

yearMovie_df = pd.concat([netflixMovie_df, imdbMovie_df], axis=0)

In [22]:
yearMovie_df

Unnamed: 0,Movie_Title,Released_Year
0,Dinosaur Planet,2003
1,Isle of Man TT 2004 Review,2004
2,Character,1997
3,Paula Abdul's Get Up & Dance,1994
4,The Rise and Fall of ECW,2004
...,...,...
995,Breakfast at Tiffany's,1961
996,Giant,1956
997,From Here to Eternity,1953
998,Lifeboat,1944


In [23]:
# sort yearMovie_df on the basis of 'Released_Year'

yearMovie_df = yearMovie_df.sort_values(by=["Released_Year"])

In [24]:
yearMovie_df

Unnamed: 0,Movie_Title,Released_Year
17666,Eros Dance Dhamaka,1915
7653,Lumiere Brothers' First Films,1915
13146,Chaplin's Essanay Comedies: Vol. 1,1915
8820,The Birth of a Nation,1915
14686,Chaplin's Essanay Comedies: Vol. 2,1915
...,...,...
612,The Trial of the Chicago 7,2020
205,Soul,2020
20,Soorarai Pottru,2020
18,Hamilton,2020


In [25]:
# add 'User_id' to yearMovie_df

yearMovie_df['User_Id'] = [i for i in range(len(yearMovie_df.index))]

In [26]:
yearMovie_df

Unnamed: 0,Movie_Title,Released_Year,User_Id
17666,Eros Dance Dhamaka,1915,0
7653,Lumiere Brothers' First Films,1915,1
13146,Chaplin's Essanay Comedies: Vol. 1,1915,2
8820,The Birth of a Nation,1915,3
14686,Chaplin's Essanay Comedies: Vol. 2,1915,4
...,...,...,...
612,The Trial of the Chicago 7,2020,18765
205,Soul,2020,18766
20,Soorarai Pottru,2020,18767
18,Hamilton,2020,18768


In [27]:
arr = yearMovie_df.to_numpy()

In [28]:
arr

array([['Eros Dance Dhamaka', 1915, 0],
       ["Lumiere Brothers' First Films", 1915, 1],
       ["Chaplin's Essanay Comedies: Vol. 1", 1915, 2],
       ...,
       ['Soorarai Pottru', 2020, 18767],
       ['Hamilton', 2020, 18768],
       ['Druk', 2020, 18769]], dtype=object)

# **4. Create a pivot table movieUser_df**

In [29]:
yearMovie_df.drop_duplicates(subset='Movie_Title', keep = 'first', inplace = True)

In [30]:
yearMovie_df

Unnamed: 0,Movie_Title,Released_Year,User_Id
17666,Eros Dance Dhamaka,1915,0
7653,Lumiere Brothers' First Films,1915,1
13146,Chaplin's Essanay Comedies: Vol. 1,1915,2
8820,The Birth of a Nation,1915,3
14686,Chaplin's Essanay Comedies: Vol. 2,1915,4
...,...,...,...
612,The Trial of the Chicago 7,2020,18765
205,Soul,2020,18766
20,Soorarai Pottru,2020,18767
18,Hamilton,2020,18768


In [31]:
# drop first 12867 rows
N = 12867
yearMovie_df = yearMovie_df.iloc[N: , :]

In [32]:
yearMovie_df

Unnamed: 0,Movie_Title,Released_Year,User_Id
3591,The North Face Expeditions: Everest and Bonus ...,2001,13543
2292,Gaudi Afternoon,2001,13544
15523,A Woman's a Helluva Thing,2001,13545
12669,Absolutely Fabulous: Series 4,2001,13546
5176,Abandoned,2001,13547
...,...,...,...
612,The Trial of the Chicago 7,2020,18765
205,Soul,2020,18766
20,Soorarai Pottru,2020,18767
18,Hamilton,2020,18768


In [33]:
yearMovie_df['Movie_Title'][0]

'Dinosaur Planet'

In [34]:
# reserialize 'User_Id'

yearMovie_df['User_Id'] = [i for i in range(len(yearMovie_df.index))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yearMovie_df['User_Id'] = [i for i in range(len(yearMovie_df.index))]


In [35]:
movieUser_df = pd.pivot_table(yearMovie_df, index='Movie_Title', columns='User_Id', values='Released_Year')
# yearMovie_df

In [36]:
movieUser_df

User_Id,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
Movie_Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'N Sync: PopOdyssey Live,,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer,,,,,,,,,,,...,,,,,,,,,,
.Com for Murder,,,,,,,,,,,...,,,,,,,,,,
.hack//Legend of the Twilight,,,,,,,,,,,...,,,,,,,,,,
.hack//SIGN,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zus & Zo,,,,,,,,,,,...,,,,,,,,,,
Zwartboek,,,,,,,,,,,...,,,,,,,,,,
e-Dreams,,,,,,,,,,,...,,,,,,,,,,
s-Cry-ed,,,,,,,,,,,...,,,,,,,,,,


In [37]:
# drop last 
N = 3000
movieUser_df = movieUser_df.iloc[: , :-N]

In [38]:
movieUser_df

User_Id,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
Movie_Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'N Sync: PopOdyssey Live,,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer,,,,,,,,,,,...,,,,,,,,,,
.Com for Murder,,,,,,,,,,,...,,,,,,,,,,
.hack//Legend of the Twilight,,,,,,,,,,,...,,,,,,,,,,
.hack//SIGN,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zus & Zo,,,,,,,,,,,...,,,,,,,,,,
Zwartboek,,,,,,,,,,,...,,,,,,,,,,
e-Dreams,,,,,,,,,,,...,,,,,,,,,,
s-Cry-ed,,,,,,,,,,,...,,,,,,,,,,


# **5. make movieUser_df sparse**

In [39]:
m = np.random.randint(low=-300, high=6, size=(5000, 2000), dtype=int)

In [40]:
movieUser_df.index[1]

'(500) Days of Summer'

In [41]:
# create a list containing all movie names

movieList=[]
for i in range(len(movieUser_df.index)):
    movieList.append(movieUser_df.index[i])

In [42]:
# store the values in userMovie_df

movieUser_df = pd.DataFrame(m, index=movieList)

In [43]:
# make it sparse
movieUser_df[movieUser_df < 1] = 0

In [44]:
# fill all 'nan' values with 0

movieUser_df.fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
'N Sync: PopOdyssey Live,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(500) Days of Summer,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
.Com for Murder,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
.hack//Legend of the Twilight,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
.hack//SIGN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zus & Zo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Zwartboek,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e-Dreams,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s-Cry-ed,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# *6. Core Logic of Recommender System using Binary Search*

In [45]:
# this function returns the last index of highest valued rating, and the correponding movie name 

def lastIndexOfTopRatedMoviesByUserX(user_series_of_movies, rating, l, h):
    ans_idx = -1
    while l <= h:
        mid = l + (h-l)//2
        if user_series_of_movies[mid] >= rating:
            ans_idx = mid
            last_coordinated_movie = user_series_of_movies.index[mid]
            l = mid + 1
        else:
            h = mid - 1 
            
    return ans_idx, last_coordinated_movie

In [46]:
# n = number of movies per top ratings of user
# u = 'User_id'
# rating = lowest best rating -> [1, 5]
# last_coordinated_movie -> name of last highly rated movie by the user-X
n = 5
u = 0
rating = 4
# last_coordinated_movie
movieUser_df = movieUser_df.sort_values(by=[0], axis=0, ascending=False)

In [47]:
movieUser_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
Step Into Liquid,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Art of Revenge,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(500) Days of Summer,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lord of War,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Super Sucker,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gone Girl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gone Dark,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gone But Not Forgotten,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gone Baby Gone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,2,0


In [48]:
if 'Avatar' in movieUser_df.index:
    print(True)

True


In [49]:
movieUser_df[u].fillna(value=0, inplace=True)

In [50]:
# call the lastIndexOfTopRatedMoviesByUserX
l = 0
h = len(movieUser_df.columns)-1
idx, last_coordinated_movie = lastIndexOfTopRatedMoviesByUserX(movieUser_df[u], rating, l, h)

In [51]:
print("the index of last highly rated movie by the user-X: ", idx)
print("the name of last highly rated movie by the user-X: ", last_coordinated_movie)

the index of last highly rated movie by the user-X:  35
the name of last highly rated movie by the user-X:  Tom and Jerry: The Fast and the Furry


In [52]:
# verify the name of the movie received

movieUser_df[0].index[idx]

'Tom and Jerry: The Fast and the Furry'

In [53]:
movieUser_df[0][idx]

4

In [54]:
movieUser_df[0][idx+1]

3

In [55]:
# Hence the calculation is correct

# ***7. Recommendation code returning a list of movies***

In [56]:
# Helper function to return whether the movie is rated or not to avoid recommending already rated movie

def isRated(movie_name):
    if movieUser_df[u][movie_name] > 0:
        return True
    return False

In [57]:
# This function returns:-
# 1. the recommended movie list
# 2. the year of the movie in year sorted yearMovie_df OR, the year of last_coordinated_movie for verification

def recommendMovies(u, n, idx, last_coordinated_movie, yearMovie_df_array):
    movie_list = []
    pivot_movie_idx = -1
    pivot_movie_year = -1
    for i in range(len(yearMovie_df_array)):
        j = len(yearMovie_df_array) - i - 1
        if(yearMovie_df_array[i][0] == last_coordinated_movie):
            pivot_movie_idx = i
            pivot_movie_year = yearMovie_df_array[i][1]
            print(yearMovie_df_array[i][0])
            break
        if(yearMovie_df_array[j][0] == last_coordinated_movie):
            pivot_movie_idx = j
            pivot_movie_year = yearMovie_df_array[j][1]
            print(yearMovie_df_array[j][0])
            break
            
    
    # store closest movies greater than or equal to current year 
    right_movie_cnt = 0
    right_starter_idx = pivot_movie_idx + 1
    while right_movie_cnt < n:
        if(right_starter_idx > len(yearMovie_df_array)-1):
            break
            
        if(isRated(yearMovie_df_array[right_starter_idx][0]) == False):
            
            movie_list.append( (yearMovie_df_array[right_starter_idx][0], yearMovie_df_array[right_starter_idx][1]) )
            right_movie_cnt += 1
            
        right_starter_idx += 1
            
            
    
    # store closest movies less than or equal to current year 
    left_movie_cnt = 0
    left_starter_idx = pivot_movie_idx - 1
    while left_movie_cnt < n:
        if(left_starter_idx == 0):
            break
            
        if(isRated(yearMovie_df_array[left_starter_idx][0]) == False):
            
            movie_list.append( (yearMovie_df_array[left_starter_idx][0], yearMovie_df_array[left_starter_idx][1]) )
            left_movie_cnt += 1
        
        left_starter_idx -= 1
        
            
    return movie_list, pivot_movie_year

In [58]:
# convert yearMovie_df -> numpy array

yearMovie_df_array = yearMovie_df.to_numpy()

In [59]:
len(yearMovie_df_array)

5000

In [60]:
yearMovie_df_array

array([['The North Face Expeditions: Everest and Bonus Footage', 2001, 0],
       ['Gaudi Afternoon', 2001, 1],
       ["A Woman's a Helluva Thing", 2001, 2],
       ...,
       ['Soorarai Pottru', 2020, 4997],
       ['Hamilton', 2020, 4998],
       ['Druk', 2020, 4999]], dtype=object)

In [61]:
# movieUser_df[0]['Hamilton']

In [62]:
# idx variable contains the last index of highest valued rating
# last_coordinated_movie contains the name of the last highly rated movie by user-X
# pivot_movie_year = the year of movie obtained as last_coordinated_movie

recommendations, pivot_movie_year = recommendMovies(u, n, idx, last_coordinated_movie, yearMovie_df_array)

Tom and Jerry: The Fast and the Furry


In [63]:
recommendations[:3]

[('The Perfect Neighbor', 2005),
 ("Who's Your Daddy", 2005),
 ('The Fallen Ones', 2005)]

In [64]:
# validate the year of obtained movie with obve recommended movies
pivot_movie_year

# the movie year which was predicted for the selected user it is recommending the movies closest to this year 
# and here it is verified

2005

In [65]:
# Here is the important part
# we can verify that all the recommendations above are "NON-RATED"
# if the rating value in movieUser_df = 0 of any user 'u' (for any movie above) 
# then that must mean the movie is of course NON-RATED

movieUser_df[u]['Jay-Z: Fade to Black']

0

In [66]:
# verify the recommendations are unrater

movieUser_df[u]['Jay-Z: Fade to Black']

0