In [0]:
from pyspark.sql.functions import *
from pyspark.ml.feature import *

In [0]:
df = spark.table("netflix_catalog.silver.netflix_titles_with_category") \
    .select("show_id", "title", "type", "listed_in")\
    .filter((col("type") == "Movie") | (col("type") == "TV Show"))
df.display()

show_id,title,type,listed_in
81145628,Norm of the North: King Sized Adventure,Movie,Comedies
80117401,Jandino: Whatever it Takes,Movie,Stand-Up Comedy
70234439,Transformers Prime,TV Show,Kids' TV
80058654,Transformers: Robots in Disguise,TV Show,Kids' TV
80125979,#realityhigh,Movie,Comedies
80163890,Apaches,TV Show,Spanish-Language TV Shows
70304989,Automata,Movie,Thrillers
80164077,Fabrizio Copano: Solo pienso en mi,Movie,Stand-Up Comedy
80117902,Fire Chasers,TV Show,Science & Nature TV
70304990,Good People,Movie,Action & Adventure


### Xử lý, tạo đặc trưng text vector và tạo model LSH

In [0]:
# Gộp các cột text thành cột combined_text
df_features = df.withColumn("combined_text", concat_ws(" ", col("title"), col("listed_in"), col("type")))

# Tokenizer
tokenizer = RegexTokenizer(inputCol="combined_text", outputCol="words", pattern="\\W")
wordsData = tokenizer.transform(df_features)

# Loại bỏ stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredData = remover.transform(wordsData)

# Tạo đặc trưng TF-IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(filteredData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Tạo model LSH để tìm phim gần nhất
brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength=2.0, numHashTables=10.0)
model = brp.fit(rescaledData)

### Tính similarity (cosine similarity) giữa phim

In [0]:
# Chọn phim gốc
movie_title = "LEGO Marvel Super Heroes: Black Panther"
key = rescaledData.filter(col("title") == movie_title).select("features").head().features

# Tìm 6 phim gần nhất (bao gồm cả phim gốc)
neighbors = model.approxNearestNeighbors(rescaledData, key, 6)

# Hiển thị kết quả (bỏ phim gốc đi lấy 5 phim tương tự)
neighbors.filter(col("title") != movie_title).select("title", "listed_in", "distCol").show(truncate=False)

+-------------------------------------------------+------------------------+------------------+
|title                                            |listed_in               |distCol           |
+-------------------------------------------------+------------------------+------------------+
|LEGO Marvel Super Heroes: Guardians of the Galaxy|Children & Family Movies|13.293937391504272|
|LEGO: Marvel Super Heroes: Maximum Overload      |Children & Family Movies|13.379217939660492|
|LEGO Marvel Super Heroes: Avengers Reassembled!  |Children & Family Movies|13.861353807618908|
|Black Panther                                    |Action & Adventure      |13.927540908233937|
|최강전사 미니특공대 : 영웅의 탄생                |Children & Family Movies|14.749201520469283|
+-------------------------------------------------+------------------------+------------------+

