In [1]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import string
import re
import joblib
import os

In [2]:
reviews = pd.read_csv("mergedReviews.csv")

In [3]:
#All lower case
#Remove numbers
#Remove punctuation
translator = str.maketrans('', '', string.punctuation)
#Remove whitespace

for i in range(len(reviews)):
    reviews["reviews"][i] = reviews["reviews"][i].lower()
    reviews["reviews"][i] = re.sub(r'\d+', '', reviews["reviews"][i])
    reviews["reviews"][i] = reviews["reviews"][i].translate(translator)
    reviews["reviews"][i] = " ".join(reviews["reviews"][i].split())
print(reviews)

        imdb_id                                            reviews
0     tt0013427  robert flaherty is one of the more noted docum...
1     tt0014429  in the country boy harold says goodbye to his ...
2     tt0015324  its almost impossible to describe the astoundi...
3     tt0015864  ive seen both version of this filmthe original...
4     tt0017925  probably buster keatons best film and oddly en...
...         ...                                                ...
7623  tt9866072  its official this pandemic has broken me becau...
7624  tt9882084  now i am a year old man from the north of the ...
7625  tt9893250  people who con the elderly are disgusting and ...
7626  tt9896916  unlike most movies this is all about the story...
7627  tt9898858  that kid was the most annoying character ever ...

[7628 rows x 2 columns]


In [4]:
#print(len(reviews["reviews"][0].split()))
#print(round(sum([len(i.split())for i in reviews["reviews"]])/len(reviews["reviews"])))

max_vocab_length = 10000
max_length = 6063

In [5]:
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

text_vectorizer.adapt(reviews["reviews"])

In [6]:
#random_sentence = random.choice(reviews["reviews"])
#print(random_sentence)
#print(text_vectorizer(random_sentence))

#words_in_vocab = text_vectorizer.get_vocabulary()
#top_5_words = words_in_vocab[:5]
#bottom_5_words = words_in_vocab[-5:]
#print(top_5_words,bottom_5_words)

In [7]:
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=64,
                             input_length=max_length)

#print(embedding(text_vectorizer(random_sentence)))

In [8]:
pooling_layer = layers.GlobalAveragePooling1D()

In [9]:
x = text_vectorizer(reviews["reviews"])
x = embedding(x)
x = pooling_layer(x)

In [10]:
nn = NearestNeighbors(n_neighbors=10)
nn.fit(x)

In [11]:
# Save model
directory = "saved_model"

if not os.path.exists(directory):
    os.makedirs(directory)

# Define the file path
file_path = os.path.join(directory, "nearest_neighbors.joblib")
joblib.dump(nn, file_path)

['saved_model\\nearest_neighbors.joblib']

In [12]:
# Load model
if  os.path.exists(directory):
    nn = joblib.load(file_path)

In [13]:
text = embedding(text_vectorizer("family"))
neighbours = nn.kneighbors(text, return_distance=False)

In [14]:
for index in neighbours[0]:
    movie_id = reviews.iloc[index]["imdb_id"]
    print(f"Movie ID: {movie_id}")

Movie ID: tt0308506
Movie ID: tt0085995
Movie ID: tt1620981
Movie ID: tt8976696
Movie ID: tt12673718
Movie ID: tt3416742
Movie ID: tt6866224
Movie ID: tt6317656
Movie ID: tt3876910
Movie ID: tt1801552
