In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# load the movie data from the CSV file into a pandas dataframe
movies_data = pd.read_csv(filepath_or_buffer='movies_telugu.csv')

In [3]:
# Display the loaded dataframe to inspect its columns and a sample of its rows.
movies_data

Unnamed: 0,Index,Movie,Year,Certificate,genres,Overview,Runtime,Rating,No.of.Ratings
0,0,Bahubali: The Beginning,2015.0,UA,"Action, Drama","In ancient India, an adventurous and darin...",159,8.1,99114
1,1,Baahubali 2: The Conclusion,2017.0,UA,"Action, Drama","When Shiva, the son of Bahubali, learns ab...",167,8.2,71458
2,2,1 - Nenokkadine,2014.0,UA,"Action, Thriller",A rock star must overcome his psychologica...,170,8.1,42372
3,3,Dhoom:3,2013.0,UA,"Action, Thriller","When Sahir, a circus entertainer trained i...",172,5.4,42112
4,4,Ra.One,2011.0,U,"Action, Adventure, Sci-Fi",When the titular antagonist of an action g...,156,4.6,37211
...,...,...,...,...,...,...,...,...,...
1395,1395,Maro Monagadu,1985.0,,,,0,8.6,49
1396,1396,Jakkanna,2016.0,,"Comedy, Drama",The movie is about an attempt by Sunil the...,0,6.3,49
1397,1397,Muvva Gopaludu,1987.0,,"Drama, Romance","Muvva Gopaludu is a 1987 Indian Telugu film, D...",137,7.8,49
1398,1398,Ninney Ishta Paddaanu,2003.0,U,,Hero Charan (Tarun) a middle class family ...,0,5.9,49


In [4]:
# (number of rows, number of columns) of the loaded dataframe.
movies_data.shape

(1400, 9)

In [5]:
# Columns whose contents are merged into the text that is vectorized below.
selected_features = [
  'genres',
  'Overview',
  'Rating',
  'No.of.Ratings',
]
print(selected_features)

['genres', 'Overview', 'Rating', 'No.of.Ratings']


In [6]:
# replace missing values in the selected columns with an empty string

for column_name in selected_features:
  filled_column = movies_data[column_name].fillna('')
  movies_data[column_name] = filled_column

In [7]:
# Build one text string per movie: the selected columns joined by single
# spaces, with the numeric columns converted to text first.
text_parts = [
  movies_data['genres'],
  movies_data['Overview'],
  movies_data['Rating'].astype(str),
  movies_data['No.of.Ratings'].astype(str),
]
combined_features = text_parts[0]
for text_part in text_parts[1:]:
  combined_features = combined_features + ' ' + text_part


In [8]:
# Create a TF-IDF vectorizer; no arguments are passed, so the library defaults apply.
vectorizer = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the combined text and transform that same text into
# TF-IDF feature vectors in a single step.
feature_vectors = vectorizer.fit_transform(combined_features)

In [10]:
# Print the sparse feature matrix: each output line is a (row, column) pair
# followed by the weight stored at that position.
print(feature_vectors)

  (0, 821)	0.3277804286675048
  (0, 4198)	0.20232583497450396
  (0, 6057)	0.29769485311168353
  (0, 5783)	0.16606927374019823
  (0, 1332)	0.18472690146173543
  (0, 2484)	0.2734051156037553
  (0, 4043)	0.2237014373247953
  (0, 1966)	0.28800946108940817
  (0, 3135)	0.243319540047934
  (0, 1292)	0.2074381264442197
  (0, 3608)	0.13572301085025607
  (0, 1935)	0.3277804286675048
  (0, 1038)	0.07568200769416801
  (0, 924)	0.29769485311168353
  (0, 1027)	0.11176873125593247
  (0, 3047)	0.1916864636571128
  (0, 1037)	0.28800946108940817
  (0, 3034)	0.16610306201447372
  (0, 2183)	0.0726613588372952
  (0, 878)	0.07781926703286678
  (1, 693)	0.2753522886535695
  (1, 3333)	0.22480545083026204
  (1, 3587)	0.2753522886535695
  (1, 5828)	0.26056828632523515
  (1, 5595)	0.11580415318442949
  :	:
  (1399, 5463)	0.42776714657270154
  (1399, 3647)	0.1699973485396792
  (1399, 3928)	0.19859499564069408
  (1399, 2975)	0.15564413301322122
  (1399, 4080)	0.16378287846806575
  (1399, 3781)	0.14496915446679343


In [11]:
# Pairwise cosine similarity between every pair of rows of feature_vectors;
# row/column i of the result refers to row i of feature_vectors.
similarity = cosine_similarity(feature_vectors)

In [12]:
# Sanity check: the matrix should be square, one row and one column per movie.
print(similarity.shape)

(1400, 1400)


In [13]:
# Ask the user which movie the recommendations should be based on.
name_prompt = ' Enter your favourite movie name : '
movie_name = input(name_prompt)

In [14]:
# collect every movie title from the dataset into a plain Python list

title_column = movies_data.loc[:, 'Movie']
list_of_all_titles = title_column.tolist()
print(list_of_all_titles)

['Bahubali: The Beginning', 'Baahubali 2: The Conclusion', '1 - Nenokkadine', 'Dhoom:3', 'Ra.One', 'Dhoom:2', 'Eega', 'Krrish 3', 'Arjun Reddy', 'Rangasthalam', 'Magadheera', 'War', 'Bharat Ane Nenu', 'Saaho', 'Theri', 'Dookudu', 'Pokiri', 'Sarkar', 'Athadu', 'The Ghazi Attack', 'Kabali', 'MSG: The Messenger of God', 'Nanban', 'Srimanthudu', 'Veer - Vivegam', 'Billa 2', 'Manam', '7 Aum Arivu', 'Bigil', 'Business Man', 'Geetha Govindam', 'Mahanati', 'Spyder', 'Nannaku Prematho', 'Dabangg 3', 'MSG 2 the Messenger', 'Manikarnika: The Queen of Jhansi', 'Race Gurram', 'Okkadu', 'Bommarillu', 'Atharintiki Daaredi', 'Khaleja', 'Yennai Arindhaal', 'Thalaivaa', 'Kaala', 'Bairavaa', 'Goodachari', 'Puli', 'Pulimurugan', 'Veeram', 'Vedam', 'Yevadu', 'Aravindha Sametha Veera Raghava', 'Billa', 'Jersey', 'Sye Raa Narasimha Reddy', 'Ala Vaikunthapurramuloo', 'Janatha Garage', 'Gabbar Singh', 'Temper', 'Game Over', 'Singam 2', 'Dhruva', 'Jalsa', 'Maharshi', 'Pelli Choopulu', 'Arya 2', 'Chekka Chivanth

In [15]:
# look up the dataset titles that most resemble what the user typed;
# n and cutoff are spelled out at difflib's default values

find_close_match = difflib.get_close_matches(
  movie_name,
  list_of_all_titles,
  n=3,
  cutoff=0.6,
)
print(find_close_match)

['Athadu', 'Katha', 'Partha']


In [16]:
# Take the best-ranked candidate title. The candidate list is empty when no
# title resembles the input, so fail with a clear message rather than letting
# the [0] lookup raise a bare IndexError.
if not find_close_match:
  raise ValueError(
    'No movie title resembling {!r} was found in the dataset'.format(movie_name)
  )
close_match = find_close_match[0]
print(close_match)

Athadu


In [17]:
# finding the row position of the movie with the matched title
#
# The similarity matrix is built from the dataframe's rows in order, so it must
# be indexed by row position. Looking the position up directly avoids relying on
# the CSV's 'Index' column happening to equal the row position.
# If several rows share the title, the first one is used.

title_mask = (movies_data['Movie'] == close_match).values
index_of_the_movie = np.flatnonzero(title_mask)[0]
print(index_of_the_movie)

18


In [18]:
# pair every movie's row position with its similarity to the chosen movie

scores_for_selected_movie = similarity[index_of_the_movie]
similarity_score = [
  (position, score) for position, score in enumerate(scores_for_selected_movie)
]
print(similarity_score)

[(0, 0.030715243665943512), (1, 0.04722260912980234), (2, 0.06067550754558859), (3, 0.026201507354383917), (4, 0.01799219416038693), (5, 0.05834183045690487), (6, 0.04730017975883599), (7, 0.057582645106984666), (8, 0.01808995054324071), (9, 0.011945916093690907), (10, 0.024173523588840937), (11, 0.02548528473327758), (12, 0.016908583588027784), (13, 0.021468383094676677), (14, 0.015392819234393663), (15, 0.013168164946491034), (16, 0.02872975270743525), (17, 0.014149406065783101), (18, 1.0000000000000002), (19, 0.04381801134185641), (20, 0.046218913518163464), (21, 0.013015097454610835), (22, 0.06887100147272603), (23, 0.018771269597271725), (24, 0.06129556583088189), (25, 0.023288020184477395), (26, 0.005944573789229279), (27, 0.044159465788469895), (28, 0.011557331169324244), (29, 0.018254466035485757), (30, 0.018621287462434882), (31, 0.049890946813050334), (32, 0.041326944944920556), (33, 0.04770444507496738), (34, 0.03553925547166232), (35, 0.04711340065396799), (36, 0.0238288000

In [19]:
# Number of (position, score) pairs; expected to equal the number of movies.
len(similarity_score)

1400

In [20]:
# rank the movies from most to least similar to the chosen one
# (work on a copy so similarity_score keeps its original order)

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(18, 1.0000000000000002), (313, 0.2244685924229764), (340, 0.20058652650161987), (322, 0.15396146444451067), (789, 0.1504691029149579), (46, 0.1362527245949464), (362, 0.13542781448685046), (223, 0.12796702391806325), (711, 0.11960124496764005), (560, 0.1168614803549699), (249, 0.11202578870207157), (1226, 0.11118884335185757), (576, 0.11068718768600347), (719, 0.11052224370619426), (1385, 0.11004268629161605), (53, 0.10619644026885158), (692, 0.10604251506282461), (514, 0.1046231122719496), (1094, 0.10368317656235611), (801, 0.10287183852268271), (1093, 0.09847583732343887), (610, 0.09750025151251439), (828, 0.09658622745823751), (628, 0.09498192286328308), (784, 0.09406624264616327), (1164, 0.09334141580761444), (1209, 0.09321225798647245), (822, 0.09301109276345361), (873, 0.09295827374326691), (271, 0.09200364506417971), (598, 0.09188833946137397), (107, 0.09186377458879809), (281, 0.0914520288281898), (686, 0.08759497671449902), (805, 0.08742667205828845), (1067, 0.08688640716334

In [21]:
# print the names of the most similar movies, best match first

print('Movies suggested for you : \n')

# How many titles to list.
number_of_suggestions = 29

# Only the leading entries are printed, so slice the ranking first instead of
# filtering the whole dataframe once for every ranked movie.
top_matches = sorted_similar_movies[:number_of_suggestions]

for i, movie in enumerate(top_matches, start=1):
  # movie[0] is a row position in the similarity matrix, so the title is
  # fetched by position rather than by matching the CSV's 'Index' column.
  title_from_index = movies_data['Movie'].iloc[movie[0]]
  print(i, '.',title_from_index)

Movies suggested for you : 

1 . Athadu
2 . King
3 . Amar Akbar Anthony
4 . Nenu Meeku Telusa...?
5 . Sindhooram
6 . Goodachari
7 . Disco Raja
8 . Bruce Lee: The Fighter
9 . Alludugaru
10 . Alludu Seenu
11 . Kadaram Kondan
12 . Sandhippoma
13 . Allari Police
14 . Shock
15 . Andha Oru Nimidam
16 . Billa
17 . Rama Rama Krishna Krishna
18 . Yogi
19 . Kondaveeti Simham
20 . Srimannarayana
21 . Mechanic Alludu
22 . Seven
23 . Veerabhadra
24 . Shadow
25 . Vijayendra Varama
26 . Lorry Driver
27 . Brahma
28 . Subha Sankalpam
29 . Gulabi
