In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
import re
from wordcloud import STOPWORDS
from nltk.corpus import stopwords


%matplotlib inline

In [3]:
df = pd.read_csv('imdb_data/all_movies_data.csv')
df.head()

Unnamed: 0,movie,year,category,duration,genres,rating,tagline,director,stars,votes
0,Range 15,2016,TV-MA,89,"Action, Comedy, Horror",4.3,Veterans wake up after a night of partying to ...,Ross Patterson,"Sean Astin, Keith David, Danny Trejo, William ...",5010
1,Snake in the Eagle's Shadow,1978,PG,90,"Action, Comedy",7.3,An orphan who has been raised at a kung fu sch...,Woo-Ping Yuen,"Jackie Chan, Siu-Tin Yuen, Jeong-lee Hwang, De...",12191
2,Twinkle Twinkle Lucky Stars,1985,TV-14,105,"Action, Comedy",6.2,5 HK cops (4 horny males) on vacation in Patta...,Sammo Kam-Bo Hung,"Sammo Kam-Bo Hung, Richard Ng, Eric Tsang, Kiu...",4164
3,McHale's Navy,1997,PG,108,"Action, Comedy",4.5,"A retired Navy officer returns to active duty,...",Bryan Spicer,"Tom Arnold, Dean Stockwell, Ernest Borgnine, D...",6891
4,Fastlane,2002,TV-14,60,"Action, Comedy, Crime",7.4,Two hotshot undercover cops and their equally ...,,"Peter Facinelli, Bill Bellamy, Tiffani Thiesse...",3600


In [4]:
# number of rows od data

df.shape

(23096, 10)

In [5]:
# number of duplicated rows

df.duplicated().sum().sum()

0

In [6]:
# selected features

cat_features = ['category', 'genres', 'tagline', 'director', 'stars']
num_features = ['duration', 'rating', 'votes']

In [7]:
df.isnull().sum()

movie       0
year        0
category    0
duration    0
genres      0
rating      0
tagline     0
director    0
stars       0
votes       0
dtype: int64

### Preprocessing

In [8]:
df_proc = df.copy()

In [9]:
df_proc['genres'] = df_proc['genres'].str.replace(',', '')
df_proc['stars'] = df_proc['stars'].str.replace(',', '')

In [10]:
# finding all non-alphanumeric characters in the taglines

non_alpha = set()

for row in df_proc['tagline']:
    for char in row:
        if (re.match(r'[^\sA-Za-z]', char)):
               non_alpha.add(char)
                
print(non_alpha)

{'"', 'ô', 'í', '3', 'Á', '間', '%', 'ū', 'å', 'ú', 'Æ', '*', '6', 'û', '–', '°', '(', '女', ':', '人', '‘', 'æ', '½', 'à', 'Í', '-', '#', '?', '—', 'â', 'Ö', '4', ')', 'á', '8', '!', 'è', '1', 'ø', '2', '7', '&', '魂', 'ö', 'ō', 'ñ', '9', '5', '’', 'ä', 'ã', '幽', '.', '/', ';', '道', '®', 'é', '0', 'ó', ',', 'ï', '$', 'ç', '£', 'ë', 'Ó', '´', '+', '倩', 'ò', 'É', "'", 'ü'}


In [11]:
def clean_tagline(text):
    text = re.sub(r'[^\sA-Za-z\d]', '', text)
    return text

In [12]:
df_proc['tagline'] = df_proc['tagline'].apply(clean_tagline)

In [13]:
df_proc['cat_feat'] = df_proc['category'] + ' ' + df_proc['genres'] + ' ' + df_proc['tagline'] + ' ' + df_proc['director'] + ' ' + df_proc['stars']

In [14]:
df_proc.drop(['movie', 'year', 'category', 'duration', 'genres', 'tagline', 'director', 'stars', 'rating', 'votes'], axis=1, inplace=True)

In [15]:
df_proc.head()

Unnamed: 0,cat_feat
0,TV-MA Action Comedy Horror Veterans wake up af...
1,PG Action Comedy An orphan who has been raised...
2,TV-14 Action Comedy 5 HK cops 4 horny males on...
3,PG Action Comedy A retired Navy officer return...
4,TV-14 Action Comedy Crime Two hotshot undercov...


In [16]:
STOP_WORDS = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS).union(STOPWORDS)

In [17]:
def remove_stop_words(text):
    words = text.split(' ')
    return ' '.join([word for word in words if word not in STOP_WORDS])

lem = WordNetLemmatizer()
def lemmatize_sentence(text):
    return ' '.join([lem.lemmatize(word) for word in text.split(' ')])

def clean_text(text):
    return lemmatize_sentence(remove_stop_words(text))

In [18]:
df_proc['cat_feat'] = df_proc['cat_feat'].apply(clean_text)

In [19]:
df_proc['cat_feat'].head()

0    TV-MA Action Comedy Horror Veterans wake night...
1    PG Action Comedy An orphan raised kung fu scho...
2    TV-14 Action Comedy 5 HK cop 4 horny male vaca...
3    PG Action Comedy A retired Navy officer return...
4    TV-14 Action Comedy Crime Two hotshot undercov...
Name: cat_feat, dtype: object

In [20]:
tfidf = TfidfVectorizer(stop_words='english')
feature_vectors = tfidf.fit_transform(df_proc['cat_feat'])
feature_df = pd.DataFrame(feature_vectors.toarray(), columns=tfidf.get_feature_names())
feature_df.shape



(23096, 57041)

In [21]:
feature_df.head()

Unnamed: 0,00,007,007wannabe,10,100,1000,10000,100000,1000000,100000000,...,ørjan,øverli,øverlie,øvredal,øyvind,úrsula,ünel,þorsteinn,þrúður,þóra
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Using Nearest Neighbors (Content-Based Filtering)

In [25]:
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.00696448 0.00914768 ... 0.00755292 0.01946594 0.02027316]
 [0.00696448 1.         0.00419869 ... 0.         0.         0.00683128]
 [0.00914768 0.00419869 1.         ... 0.         0.01181813 0.09668541]
 ...
 [0.00755292 0.         0.         ... 1.         0.0119423  0.        ]
 [0.01946594 0.         0.01181813 ... 0.0119423  1.         0.        ]
 [0.02027316 0.00683128 0.09668541 ... 0.         0.         1.        ]]


In [26]:
print(similarity.shape)

(23096, 23096)


In [27]:
# getting the movie name from the user

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : Spider Man


In [28]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = df['movie'].tolist()

In [29]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

In [30]:
find_close_match
close_match = find_close_match[0]

In [31]:
# finding the index of the movie with title

index_of_the_movie = df[df.movie == close_match].index[0]
print(index_of_the_movie)

16366


In [32]:
# getting a list of similar movies 

similarity_score = list(enumerate(similarity[index_of_the_movie]))

In [33]:
list(enumerate(similarity[index_of_the_movie]))

[(0, 0.0046888764405129905),
 (1, 0.007107140200275038),
 (2, 0.0028267966677638645),
 (3, 0.009327778804944481),
 (4, 0.006764919627569588),
 (5, 0.002978224987552228),
 (6, 0.003151543615036277),
 (7, 0.01710290771560664),
 (8, 0.057316856177394494),
 (9, 0.003674445556491991),
 (10, 0.025750272727504412),
 (11, 0.0033731355766900228),
 (12, 0.014425918553599898),
 (13, 0.003480184968604899),
 (14, 0.003538156294568451),
 (15, 0.0042582385783745975),
 (16, 0.008683734546365677),
 (17, 0.016128054855692374),
 (18, 0.03510989098588953),
 (19, 0.014100898638021908),
 (20, 0.00362942871859371),
 (21, 0.0029822262622873222),
 (22, 0.004112553433789656),
 (23, 0.013271040321132799),
 (24, 0.005012113882382569),
 (25, 0.003969566667753031),
 (26, 0.00438239251265896),
 (27, 0.00812326201074267),
 (28, 0.005278075346795226),
 (29, 0.003724746865225794),
 (30, 0.006272188281770635),
 (31, 0.00483387374327308),
 (32, 0.006140300797934695),
 (33, 0.003839787628404136),
 (34, 0.00794603206694276

In [34]:
len(similarity_score)

23096

In [35]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x: x[1], reverse=True)
print(sorted_similar_movies)

[(16366, 1.0000000000000002), (16539, 0.3577164354699953), (16507, 0.2962236640984863), (16476, 0.22964517874535145), (2903, 0.1904978367050918), (16821, 0.18286679685427865), (17433, 0.1616050613902402), (21099, 0.1493935208793076), (8705, 0.14691552428970248), (19462, 0.12788424430572243), (17303, 0.12647138424705984), (13614, 0.12471199473625232), (2423, 0.12461693435349736), (8813, 0.1234151875588043), (8963, 0.12110390046807219), (10170, 0.12090161462338338), (8114, 0.11934045483198771), (16537, 0.1189348594649427), (16425, 0.11853572011735916), (19231, 0.11607056338896787), (16659, 0.11582147923159995), (7977, 0.1150018485878974), (18622, 0.1136768595516951), (21657, 0.11338556336965831), (17427, 0.11311239258599284), (12837, 0.11288051129741977), (18224, 0.11242487551210004), (16707, 0.11195154010868087), (18992, 0.11148763650780032), (20172, 0.11064104058748116), (13422, 0.10988616820229476), (7282, 0.10931138048795935), (9695, 0.10762089007096932), (14296, 0.10754491288008962)

In [36]:
# print the name of similar movies on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = df[df.index==index]['movie'].values[0]
    if (i<30):
        print(i, '.', title_from_index)
        i+=1

Movies suggested for you : 

1 . Spider-Man
2 . Spider-Man 2
3 . Spider-Man 3
4 . The Amazing Spider-Man
5 . Spider-Woman
6 . Spider-Man: The Animated Series
7 . Spider-Man
8 . Midnight Special
9 . Oz the Great and Powerful
10 . Brothers (I)
11 . The Amazing Spider-Man
12 . The Cider House Rules
13 . Full-Time Magister
14 . Pleasantville
15 . Cats & Dogs
16 . Big Wolf on Campus
17 . Seabiscuit
18 . Melancholia
19 . Aquaman
20 . Dead for a Dollar
21 . Superhero Movie
22 . The Last Temptation of Christ
23 . Superman
24 . Pawn Sacrifice
25 . New Rose Hotel
26 . The Great Gatsby
27 . 4:44 Last Day on Earth
28 . Small Soldiers
29 . The Dark Knight


### Engine

In [37]:
def recommend_movies():
    movie_name = input("Enter your favourite movie name : ")

    list_of_all_titles = df['movie'].tolist()
    find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
    close_match = find_close_match[0]
    index_of_the_movie = df[df['movie'] == close_match].index.values[0]
    similarity_score = list(enumerate(similarity[index_of_the_movie]))
    sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse=True)

    print('Movies suggested for you : \n')

    i = 1

    for movie in sorted_similar_movies:
        index = movie[0]
        title_from_index = df[df.index==index]['movie'].values[0]
        movie_rating = df[df.index==index]['rating'].values[0]
        year = df[df.index==index]['year'].values[0]
        genres = df[df.index==index]['genres'].values[0]
        if (i<30):
            print(i, '.', title_from_index, f'({year})', f'{movie_rating}', genres)
            i+=1

In [39]:
recommend_movies()

Enter your favourite movie name : Aqua man
Movies suggested for you : 

1 . Aquaman (1967) 6.6 Animation, Action, Adventure
2 . Oggy and the Cockroaches: Next Generation (2022) 6.3 Animation, Short, Adventure
3 . Fantastic Four (1967) 6.6 Animation, Action, Adventure
4 . Easy Come, Easy Go (1967) 5.2 Adventure, Comedy, Music
5 . Toopy and Binoo (2005) 7.0 Animation, Adventure, Comedy
6 . SheZow (2012) 5.4 Animation, Action, Adventure
7 . The Idaten Deities Know Only Peace (2021) 7.2 Animation, Action, Adventure
8 . Sonic Mania Adventures (2018) 8.1 Animation, Short, Action
9 . The Pink Panther (1969) 7.8 Animation, Short, Comedy
10 . One Day at a Time (1975) 6.6 Comedy
11 . Full-Time Magister (2016) 7.4 Animation, Action, Adventure
12 . Space Ghost (1966) 7.3 Animation, Action, Adventure
13 . The Angry Red Planet (1959) 5.2 Adventure, Horror, Sci-Fi
14 . Mighty Little Bheem (2019) 6.6 Animation, Short, Adventure
15 . Krypto the Superdog (2005) 5.5 Animation, Action, Adventure
16 . Maxi