In [28]:
import pandas as pd

In [29]:
item_cols = [
    'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

In [30]:
movies_full = pd.read_csv('../data/ml-100k/u.item', sep='|', encoding='latin-1', header=None, names=item_cols)
movies_full

Unnamed: 0,item_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
movies_full[['item_id', 'title'] + item_cols[5:8]].head()

Unnamed: 0,item_id,title,unknown,Action,Adventure
0,1,Toy Story (1995),0,0,0
1,2,GoldenEye (1995),0,1,1
2,3,Four Rooms (1995),0,0,0
3,4,Get Shorty (1995),0,1,0
4,5,Copycat (1995),0,0,0


In [32]:
genre_cols = item_cols[5:]
genre_cols

['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [33]:
movies_full['genres'] = movies_full[genre_cols].apply(
        lambda row: ' '.join([genre for genre, val in row.items() if val == 1]), axis=1
)

#Keeping only needed columns
movies_meta = movies_full[['item_id', 'title', 'genres']]
movies_meta.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,GoldenEye (1995),Action Adventure Thriller
2,3,Four Rooms (1995),Thriller
3,4,Get Shorty (1995),Action Comedy Drama
4,5,Copycat (1995),Crime Drama Thriller


In [34]:
#TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

#Initialize and fit TF-IDF on genres
tfidf = TfidfVectorizer(token_pattern=r'(?u)\b[\w\-]+\b')
tfidf_matrix = tfidf.fit_transform(movies_meta['genres'])

print("TF-IDF matrix shape: ", tfidf_matrix.shape)
# tfidf_matrix

TF-IDF matrix shape:  (1682, 20)


In [35]:
print(tfidf.get_feature_names_out())

['action' 'adventure' 'animation' 'children' 'comedy' 'crime'
 'documentary' 'drama' 'fantasy' 'film-noir' 'horror' 'musical' 'mystery'
 'romance' 's' 'sci-fi' 'thriller' 'unknown' 'war' 'western']


In [36]:
#Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Cosine similarity matrix shape: ", cosine_sim.shape)

Cosine similarity matrix shape:  (1682, 1682)


In [37]:
#Recommendation Function
indices = pd.Series(movies_meta.index, index=movies_meta['title']).drop_duplicates()
indices

title
Toy Story (1995)                                0
GoldenEye (1995)                                1
Four Rooms (1995)                               2
Get Shorty (1995)                               3
Copycat (1995)                                  4
                                             ... 
Mat' i syn (1997)                            1677
B. Monkey (1998)                             1678
Sliding Doors (1998)                         1679
You So Crazy (1994)                          1680
Scream of Stone (Schrei aus Stein) (1991)    1681
Length: 1682, dtype: int64

In [38]:
def recommend_movies(title, n=5):
    if title not in indices:
        return "Movie not found in database."
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1] #Skip itself
    movie_indices = [i[0] for i in sim_scores]
    return movies_meta['title'].iloc[movie_indices].tolist()

In [39]:
recommend_movies("Star Wars (1977)", 5)

['Return of the Jedi (1983)',
 'Empire Strikes Back, The (1980)',
 'Starship Troopers (1997)',
 'African Queen, The (1951)',
 'Independence Day (ID4) (1996)']

## Enhancement (Combine Title and Genres)

In [40]:
# Comine Title and Genres into one text column
movies_meta['text'] = movies_meta['title'] + ' ' + movies_meta['genres']
movies_meta[['title', 'genres', 'text']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_meta['text'] = movies_meta['title'] + ' ' + movies_meta['genres']


Unnamed: 0,title,genres,text
0,Toy Story (1995),Animation Children's Comedy,Toy Story (1995) Animation Children's Comedy
1,GoldenEye (1995),Action Adventure Thriller,GoldenEye (1995) Action Adventure Thriller
2,Four Rooms (1995),Thriller,Four Rooms (1995) Thriller
3,Get Shorty (1995),Action Comedy Drama,Get Shorty (1995) Action Comedy Drama
4,Copycat (1995),Crime Drama Thriller,Copycat (1995) Crime Drama Thriller


In [41]:
tfidf2 = TfidfVectorizer(stop_words='english', token_pattern=r'(?u)\b[\w\-]+\b')

In [43]:
#Fit-transform the combined text
tfidf2_matrix = tfidf2.fit_transform(movies_meta['text'])

In [45]:
print("TF-IDF shape: ", tfidf2_matrix.shape)

TF-IDF shape:  (1682, 2325)


In [46]:
cosine_sim2 = cosine_similarity(tfidf2_matrix, tfidf2_matrix)

In [48]:
def recommend_movies2(title, n=5):
    if title not in indices:
        return "Movie not found in database."
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim2[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies_meta['title'].iloc[movie_indices].tolist()

In [49]:
recommend_movies2("Batman Returns (1992)", 5)

['Batman (1989)',
 'Batman & Robin (1997)',
 'Batman Forever (1995)',
 'Lashou shentan (1992)',
 'Sneakers (1992)']