In [1]:
import os,sys
import pandas as pd
import numpy as np
sys.path.append(os.path.normpath(os.getcwd()))

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
metadata = pd.read_csv('metadata_prep.csv')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(45068, 75551)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

metadata['genres'] = metadata['genres'].fillna('')
cv = CountVectorizer()
g_cv_matrix= cv.fit_transform(metadata['genres'])
g_cv_matrix.shape

(45068, 46)

In [6]:
metadata['title'] = metadata['title'].fillna('')
#Construct the required TF-IDF matrix by fitting and transforming the data
title_matrix = tfidf.fit_transform(metadata['title'])
title_matrix.shape

(45068, 22574)

In [7]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
g_cosine_sim = linear_kernel(g_cv_matrix, g_cv_matrix)

In [9]:
t_cosine_sim = linear_kernel(title_matrix, title_matrix)

In [10]:
def get_recommendations(title, df, indices, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 3 most similar movies
    sim_scores = sim_scores[1:30]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 3 most similar movies
    return df['title'].iloc[movie_indices]


In [11]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [12]:
a = get_recommendations('Iron Man 2', metadata, indices, cosine_sim)

영화 설명에 대한 유사도

In [13]:
print(a)

12555                                     Iron Man
20718                                   Iron Man 3
2768                                 The Dark Half
31071                      The Mother Of Invention
1198                                 Touch of Evil
19947                               The Guilt Trip
26406                      Avengers: Age of Ultron
1066                         Rebel Without a Cause
1820                            All the King's Men
43182                            The Kinematograph
26440                                One More Time
37717                                     Sharkman
23147                                   Alter Egos
6433                                       Purpose
31893                     Fracchia The Human Beast
19762                               Excuse My Dust
42559                               The Other Half
37325                               The Flying Man
43482                              Somebody's Hero
24822                          

In [14]:
b = get_recommendations("Iron Man 2", metadata, indices, g_cosine_sim)

장르에 대한 유사도

In [15]:
print(b)

157                                       Congo
169                             Johnny Mnemonic
178     Mighty Morphin Power Rangers: The Movie
256                                   Star Wars
312                                    Stargate
324                      Star Trek: Generations
514                                   RoboCop 3
528                                  The Shadow
675                                        Solo
755                            Independence Day
820                            Escape from L.A.
1087                                  The Abyss
1149                    The Empire Strikes Back
1162                         Return of the Jedi
1250                           Forbidden Planet
1302                   Star Trek: First Contact
1317     Star Trek VI: The Undiscovered Country
1318            Star Trek V: The Final Frontier
1319            Star Trek II: The Wrath of Khan
1320        Star Trek III: The Search for Spock
1412                                  Th

In [16]:
c = get_recommendations("Iron Man 2" ,metadata, indices, t_cosine_sim)

영화 제목으로만 유사도를 비교했을 때

In [17]:
print(c)

12555                         Iron Man
15105                       Iron Man 2
20718                       Iron Man 3
34421                         Iron Man
41456                         Iron Man
5202                         Iron Will
9507                            3-Iron
1690          The Man in the Iron Mask
11882         The Man in the Iron Mask
30876         The Man in the Iron Mask
20683          The Invincible Iron Man
19652      The Man with the Iron Fists
29432    The Man with the Iron Fists 2
4403              Tetsuo: The Iron Man
25242                   Eight Iron Men
18292                    The Iron Lady
11414                      Iron Island
18719                         Iron Sky
18107                    The Iron Rose
12312                   The Iron Horse
727            The Man from Down Under
918                       The Thin Man
1164                     The Third Man
4980                      The Last Man
7086                    Who's the Man?
7248                     

In [18]:
v = metadata['vote_count']
print(v)

0        5415.0
1        2413.0
2          92.0
3          34.0
4         173.0
          ...  
45063       1.0
45064       3.0
45065       6.0
45066       0.0
45067       0.0
Name: vote_count, Length: 45068, dtype: float64


In [19]:
R = metadata['vote_average']
print(R)

0        7.7
1        6.9
2        6.5
3        6.1
4        5.7
        ... 
45063    4.0
45064    9.0
45065    3.8
45066    0.0
45067    0.0
Name: vote_average, Length: 45068, dtype: float64


In [20]:
C = metadata['vote_average'].mean()
print(C)

5.651645401087021


In [21]:
m = metadata['vote_count'].quantile(0.7)
print(m)

25.0


In [22]:
metadata['weighted_average']=((R*v)+ (C*m))/(v+m)

In [23]:
metadata['weighted_average'] = metadata['weighted_average'].fillna(0)

In [24]:
import numpy as np

r_matrix= metadata['weighted_average'].to_numpy()
r_matrix = r_matrix.reshape(-1, 1)

In [25]:
print(r_matrix)

[[7.69058661]
 [6.88719899]
 [6.31872765]
 ...
 [5.29326242]
 [5.6516454 ]
 [5.6516454 ]]


In [26]:
r_cosine_sim = linear_kernel(r_matrix, r_matrix)

In [27]:
c = get_recommendations("Iron Man 2" ,metadata, indices, r_cosine_sim)

평점에 대한 유사도 (비슷한 평점 추천)

In [28]:
print(c)

314             The Shawshank Redemption
829                        The Godfather
39937                         Your Name.
38779                       Planet Earth
12448                    The Dark Knight
2829                          Fight Club
292                         Pulp Fiction
522                     Schindler's List
23539                           Whiplash
5460                       Spirited Away
2198                   Life Is Beautiful
1173              The Godfather: Part II
1147     One Flew Over the Cuckoo's Nest
1171                              Psycho
1179         Once Upon a Time in America
42964                    Planet Earth II
351                         Forrest Gump
1149             The Empire Strikes Back
18398                   The Intouchables
289               Leon: The Professional
3016                      The Green Mile
1165                          GoodFellas
2203                  American History X
1156                        12 Angry Men
9666            

In [54]:
from operator import itemgetter

In [93]:
def combined(title, df, indices, cosine_sim1, cosine_sim2, cosine_sim3, cosine_sim4):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores1 = list(enumerate(cosine_sim1[idx])) # overview
    sim_scores2 = list(enumerate(cosine_sim2[idx])) # title
    sim_scores3 = list(enumerate(cosine_sim3[idx])) # genres
    sim_scores4 = list(enumerate(cosine_sim4[idx])) # ratings
           
    #feauture scaling
    s1_max = max(sim_scores1, key=itemgetter(1))[1]
    s2_max = max(sim_scores2, key=itemgetter(1))[1]
    s3_max = max(sim_scores3, key=itemgetter(1))[1]
    s4_max = max(sim_scores4, key=itemgetter(1))[1]
    
    s1_min = min(sim_scores1, key=itemgetter(1))[1]
    s2_min = min(sim_scores2, key=itemgetter(1))[1]
    s3_min = min(sim_scores3, key=itemgetter(1))[1]
    s4_min = min(sim_scores4, key=itemgetter(1))[1]

    
    
    s1 = []
    for i in range(len(sim_scores1)):
        scaled_s1 = (sim_scores1[i][1] - s1_min) / (s1_max - s1_min)
        #print(scaled_s1)
        s1.append((sim_scores1[i][0], scaled_s1 * 0.6))
    
    s2 = []
    for i in range(len(sim_scores2)):
        scaled_s2 = (sim_scores2[i][1] - s2_min) / (s2_max - s2_min)
        s2.append((sim_scores2[i][0], scaled_s2 * 0.2))
    
    s3 = []
    for i in range(len(sim_scores3)):
        scaled_s3 = (sim_scores3[i][1] - s3_min) / (s3_max - s3_min)
        s3.append((sim_scores3[i][0], scaled_s3 * 0.15))
    
    s4 = []
    for i in range(len(sim_scores4)):
        scaled_s4 = (s4_max  - s4_min) / (s4_max  - s4_min)
        s4.append((sim_scores4[i][0], scaled_s4 * 0.05))
    
    sim_scores = s1 + s2 + s3 + s4
    
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 3 most similar movies
    sim_scores = sim_scores[1:30]
   
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 3 most similar movies
    return df['title'].iloc[movie_indices], sim_scores

In [94]:
combined = combined("Iron Man 2" ,metadata, indices, cosine_sim, t_cosine_sim, g_cosine_sim, r_cosine_sim)

설명, 이름, 장르, 평점 모두 고려한 추천 리스트

In [96]:
print(combined)

5567                                 Man of Iron
12555                                   Iron Man
15105                                 Iron Man 2
20718                                 Iron Man 3
34421                                   Iron Man
41456                                   Iron Man
5202                                   Iron Will
9507                                      3-Iron
96                                      Shopping
157                                        Congo
169                              Johnny Mnemonic
178      Mighty Morphin Power Rangers: The Movie
256                                    Star Wars
312                                     Stargate
324                       Star Trek: Generations
514                                    RoboCop 3
528                                   The Shadow
675                                         Solo
755                             Independence Day
820                             Escape from L.A.
1087                