In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Read the movie catalogue from the working directory and preview its first
# five rows (head() defaults to n=5).
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Remove the movieId column, which none of the later cells read.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='movieId', errors='ignore')

In [5]:
# Count missing values per column; the recorded output reports zero for both
# title and genres, which the string operations in later cells rely on.
df.isna().sum()

title     0
genres    0
dtype: int64

In [6]:
# Turn the pipe-separated genre list into space-separated words.
# The vectorised .str accessor replaces the per-row Python lambda and lets
# missing values propagate instead of raising AttributeError; regex=False is
# required because '|' is the alternation operator in a regular expression.
df['clean_genres'] = df['genres'].str.replace('|', ' ', regex=False)

In [7]:
# Build the text that gets embedded later: the title followed by its genres,
# joined with a single space.
df['combined'] = df['title'].str.cat(df['clean_genres'], sep=' ')

In [12]:
# NOTE(review): this cell carries execution count 12 but sits before cells 9
# and 11, so the recorded (5000, 3) reflects the frame *after* the row
# truncation and the clean_genres drop, not the state at this position.
# Under Restart & Run All this cell would report a different shape.
df.shape

(5000, 3)

In [9]:
# Keep only the first 5000 rows (presumably to bound preprocessing/encoding
# time -- confirm). The explicit .copy() makes `df` an independent frame:
# later cells add and drop columns on it, which on a bare slice can trigger
# SettingWithCopyWarning.
df = df.iloc[:5000].copy()

In [11]:
# clean_genres was only needed to build `combined`; discard it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='clean_genres', errors='ignore')

In [13]:
# Load the spaCy pipeline 'en_core_web_lg' once; preprocessing() reads this
# module-level `nlp` object on every call.
nlp=spacy.load('en_core_web_lg')

In [14]:
def preprocessing(text):
    """Lower-case ``text`` and return the lemmas of its content tokens.

    The lower-cased text is run through the module-level spaCy pipeline
    ``nlp``. Punctuation, stop words and whitespace-only tokens are discarded;
    every remaining token contributes its ``lemma_``.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    list of str
        Lemmas of the retained tokens, in their original order.
    """
    doc = nlp(text.lower())
    # spaCy emits runs of extra whitespace as tokens of their own; without the
    # is_space check they would land in the result as blank "lemmas" and show
    # up as stray spaces once the list is joined.
    return [
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_space)
    ]

In [15]:
# Run preprocessing() over every combined title+genre string; each cell of the
# new column holds the list of lemmas it returns.
df['clean_text'] = df['combined'].apply(preprocessing)

In [16]:
# Collapse each lemma list back into one space-separated string, the form the
# sentence encoder consumes.
df['clean_text_string'] = df['clean_text'].apply(' '.join)

In [18]:
# The lemma lists are no longer needed once the joined string exists.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='clean_text', errors='ignore')

In [19]:
# Spot-check the first three rows: the recorded output shows the title,
# genres, combined and clean_text_string columns.
df.head(3)

Unnamed: 0,title,genres,combined,clean_text_string
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure Animation Children ...,toy story 1995 adventure animation child comed...
1,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure Children Fantasy,jumanji 1995 adventure child fantasy
2,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy Romance,grumpy old man 1995 comedy romance


In [20]:
# Plain Python list of the cleaned strings, in row order, for the encoder.
lists = list(df['clean_text_string'])

In [21]:
# Show the first five cleaned strings as a sanity check of the preprocessing.
lists[:5]

['toy story 1995 adventure animation child comedy fantasy',
 'jumanji 1995 adventure child fantasy',
 'grumpy old man 1995 comedy romance',
 'wait exhale 1995 comedy drama romance',
 'father bride ii 1995 comedy']

In [22]:
# Sentence-embedding model shared by the catalogue encoding cell and by
# recommendations(), which reads this module-level `model` to embed queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [23]:
# Embed every cleaned catalogue string in one call. recommendations() indexes
# df positionally with row numbers of this array, so it relies on row i here
# corresponding to df.iloc[i].
list_embedding=model.encode(lists)

In [24]:
# Recorded output is (5000, 384): one 384-value vector per catalogue row.
list_embedding.shape

(5000, 384)

In [28]:
# Free-text request to match against the catalogue embeddings.
query='suggest any romantic movies'

In [29]:
def recommendations(query, list_embedding, df, top=10):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    The query is embedded with the module-level ``model`` and compared against
    every row of ``list_embedding`` using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text request to embed.
    list_embedding : array-like, 2-D
        One embedding per catalogue entry. Row ``i`` must describe
        ``df.iloc[i]``; the lookup into ``df`` is purely positional.
    df : pandas.DataFrame
        Catalogue to select rows from.
    top : int, default 10
        Maximum number of rows to return. ``0`` yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``top`` rows of ``df``, ordered from most to least similar.

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    if top < 0:
        raise ValueError("top must be a non-negative integer")

    query_embedding = model.encode([query])
    scores = cosine_similarity(query_embedding, list_embedding)[0]

    # Reverse first, then cut from the front. Slicing the ascending order with
    # [-top:] breaks for top == 0, because [-0:] selects every element.
    ranked = scores.argsort()[::-1][:top]
    return df.iloc[ranked]

In [30]:
# Rank the catalogue against the query using the default cut-off of ten rows,
# then print only the title and genres of the hits.
suggestions = recommendations(query, list_embedding, df)
print(suggestions.loc[:, ['title', 'genres']])

                           title                genres
2802              Romance (1999)         Drama|Romance
3879  You Can Count on Me (2000)         Drama|Romance
1543      She's So Lovely (1997)         Drama|Romance
4711        Born Romantic (2000)  Comedy|Drama|Romance
4766             Intimacy (2000)                 Drama
3782           Love & Sex (2000)  Comedy|Drama|Romance
1518                 Fall (1997)               Romance
4267      Crazy/Beautiful (2001)         Drama|Romance
2178    Indecent Proposal (1993)         Drama|Romance
3358    Whatever It Takes (2000)        Comedy|Romance
