In [54]:
#Importing
import pandas as pd
import numpy as np

### Fetching the prepared data

In [55]:
# Load the prepared dataset and keep only the columns used in this notebook.
# .copy() makes `df` an independent frame, so the later assignment to
# df['tags'] modifies this frame only and does not raise SettingWithCopyWarning.
df = pd.read_csv(r'datasets/final_df.csv')
df = df[['id','title','year','tags']].copy()
df.head()

Unnamed: 0,id,title,year,tags
0,19995,Avatar,2009,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,2007,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,2015,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,2012,following the death of district attorney harve...
4,49529,John Carter,2012,"john carter is a war-weary, former military ca..."


### Stemming of Tags

In [56]:
from nltk.stem.porter import PorterStemmer
# Single stemmer instance, referenced as the global `ps` by stem().
ps = PorterStemmer()

In [57]:
def stem(text):
    """Return `text` with every whitespace-separated token replaced by its stem.

    Tokens come from ``text.split()``, are stemmed with the module-level
    ``ps`` stemmer, and are joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [58]:
# Replace each row's tags with its stemmed form. This overwrites the column,
# so re-running this cell stems the already-stemmed text again.
df['tags'] = df['tags'].apply(stem)
df.head()

Unnamed: 0,id,title,year,tags
0,19995,Avatar,2009,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,2007,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,2015,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,2012,follow the death of district attorney harvey d...
4,49529,John Carter,2012,"john carter is a war-weary, former militari ca..."


### Vectorization

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer limited to 5000 features, with English stop words removed.
# NOTE(review): the tags are stemmed before this vectorizer sees them, so stemmed
# forms of stop words may not match the stop-word list — confirm this is acceptable.
cv = CountVectorizer(max_features = 5000, stop_words='english') # max_features = number of words kept as vector components

In [60]:
# Fit the vectorizer on the tags column and store the resulting count matrix
# as a dense array (one row per row of df, one column per vocabulary term).
vectors = cv.fit_transform(df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [61]:
# Inspect the vocabulary kept by the vectorizer (at most 5000 terms, per max_features)
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [62]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of `vectors`;
# sim_matrix[i][j] is the similarity between row i and row j.
sim_matrix = cosine_similarity(vectors)

In [63]:
def recommend(movie, top_n=10):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # Work with row positions rather than index labels: sim_matrix rows and
    # .iloc are both positional, so this stays correct even if df's index
    # is not the default 0..n-1 range.
    matches = np.flatnonzero((df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in df['title']")
    movie_pos = matches[0]

    distances = np.asarray(sim_matrix[movie_pos])
    # Stable descending sort keeps tied scores in row order.
    ranked = np.argsort(-distances, kind='stable')
    # Drop the queried movie by position instead of assuming it is ranked
    # first; with tied scores (e.g. identical tag vectors) it may not be.
    ranked = ranked[ranked != movie_pos][:max(top_n, 0)]

    for pos in ranked:
        print(df['title'].iloc[pos])

In [64]:
# Example: print the titles most similar to 'The Wolverine'
recommend('The Wolverine')

X2
Iron Man 2
X-Men Origins: Wolverine
Superman Returns
X-Men
X-Men: Days of Future Past
Ant-Man
Krrish
Teenage Mutant Ninja Turtles III
The Truman Show


### Exporting required data as binary files

In [65]:
import pickle

# Persist the movie table and the similarity matrix as pickle files.
# `with` closes each file handle (flushing its buffer) as soon as the dump
# finishes, instead of leaving that to garbage collection.
with open("movies.pkl", 'wb') as movies_file:
    pickle.dump(df, movies_file)
with open("sim_mat_bow.pkl", 'wb') as sim_file:
    pickle.dump(sim_matrix, sim_file)
print('Done')

Done
