In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [87]:
df = pd.read_csv('data/IMDb movies.csv', low_memory = False)

In [88]:
data = pd.DataFrame()

In [89]:
c = df['avg_vote'].mean()

In [90]:
m = df['votes'].quantile(0.60)

In [91]:
data = df[df['votes'] >= m]

In [92]:
score = []
v = data['votes'].values
r = data['avg_vote'].values

In [93]:
def weighted_average(v, r, c, m):
    s = ((v * r) / (v + m)) + ((m * c) / (v + m))
    return s

In [94]:
for i in range(len(v)):
    score.append(weighted_average(v[i], r[i], c, m))

In [95]:
data.loc[:, ('weighted_score')] = score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [96]:
data = data.sort_values('weighted_score', ascending = False)

In [97]:
data.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,description,avg_vote,votes,budget,usa_gross_income,worldwide_gross_income,metascore,reviews_from_users,reviews_from_critics,weighted_score
28453,tt0111161,Le ali della libertà,The Shawshank Redemption,1994,2/10/1995,Drama,142,USA,English,Frank Darabont,...,Two imprisoned men bond over a number of years...,9.3,2278845,"$25,000,000","$28,699,976","$28,815,245",80.0,8232.0,164.0,9.298887
38406,tt0252487,Hababam Sinifi,Hababam Sinifi,1975,4/1/1975,"Comedy, Drama",87,Turkey,Turkish,Ertem Egilmez,...,"Lazy, uneducated students share a very close b...",9.3,36269,,,,,62.0,1.0,9.231449
15528,tt0068646,Il padrino,The Godfather,1972,9/21/1972,"Crime, Drama",175,USA,"English, Italian, Latin",Francis Ford Coppola,...,The aging patriarch of an organized crime dyna...,9.2,1572674,"$6,000,000","$134,966,411","$246,120,974",100.0,3977.0,253.0,9.198435
48078,tt0468569,Il cavaliere oscuro,The Dark Knight,2008,7/23/2008,"Action, Crime, Drama",152,"USA, UK","English, Mandarin",Christopher Nolan,...,When the menace known as the Joker wreaks havo...,9.0,2241615,"$185,000,000","$535,234,033","$1,005,455,211",84.0,6938.0,423.0,8.998968
16556,tt0071562,Il padrino - Parte II,The Godfather: Part II,1974,9/25/1975,"Crime, Drama",202,USA,"English, Italian, Spanish, Latin, Sicilian",Francis Ford Coppola,...,The early life and career of Vito Corleone in ...,9.0,1098714,"$13,000,000","$47,834,595","$48,035,783",90.0,1030.0,178.0,8.997896


In [98]:
# create tfidf
tfidf = TfidfVectorizer(stop_words = 'english')

# replace anything that is not a number with an empty string
data['description'] = data['description'].fillna('')

# fit and transform the data for TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(data['description'])

# output tfidf_matrix shape
tfidf_matrix.shape

(34349, 39532)

In [99]:
tfidf.get_feature_names()[0:20]

['00',
 '000',
 '007',
 '00s',
 '01',
 '09',
 '10',
 '100',
 '1000',
 '10000',
 '1001',
 '100m',
 '100th',
 '101',
 '101st',
 '102',
 '1023',
 '104',
 '108',
 '109']

In [100]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [101]:
cosine_sim.shape

(34349, 34349)

In [103]:
indices = pd.Series(data.index, index  = data['original_title']).drop_duplicates()

In [104]:
indices[:20]

original_title
The Shawshank Redemption                             28453
Hababam Sinifi                                       38406
The Godfather                                        15528
The Dark Knight                                      48078
The Godfather: Part II                               16556
Aynabaji                                             77545
Pulp Fiction                                         28381
The Lord of the Rings: The Return of the King        34127
Schindler's List                                     27629
12 Angry Men                                          8973
Hababam Sinifi Sinifta Kaldi                         38407
Tosun Pasa                                           38490
Maratonci trce pocasni krug                          20502
Peranbu                                              83132
Inception                                            57475
Fight Club                                           32487
Forrest Gump                             

In [129]:
def recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    movie_similarity = [i[1] for i in sim_scores]
    return data['original_title'].iloc[movie_indices], movie_similarity

In [117]:
movie = input('Please enter the movie name:')

Please enter the movie name:Fight Club


In [130]:
recommendation(movie)

(42848                                  Code 46
 17567                              Logan's Run
 21201                 Twilight Zone: The Movie
 36641                                     2046
 41569    The League of Extraordinary Gentlemen
 46706                                The Queen
 22988                 Amazon Women on the Moon
 19925                         Galaxy of Terror
 51892            The Day the Earth Stood Still
 62316                                    Dropa
 Name: original_title, dtype: object,
 [0.3899116963932646,
  0.2611243597598879,
  0.24902963788838267,
  0.23183502965018732,
  0.23137181708994792,
  0.22488170116965023,
  0.21675132857623133,
  0.21635412454006092,
  0.214590327486521,
  0.2054634237353824])