
# Briefing about Word2Vec:

<img src="https://mccormickml.com/assets/word2vec/skip_gram_net_arch.png" alt="Word2Vec" style="max-width:800px">


In [4]:
%pip install -q -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade tensorflow-cpu gensim chdb pandas pyarrow scikit-learn numpy matplotlib
%pip show tensorflow-cpu chdb gensim

Note: you may need to restart the kernel to use updated packages.
Name: tensorflow-cpu
Version: 2.15.0.post1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /home/Clickhouse/.venv/lib/python3.9/site-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, ml-dtypes, numpy, opt-einsum, packaging, protobuf, setuptools, six, tensorboard, tensorflow-estimator, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: 
---
Name: chdb
Version: 1.0.1
Summary: chDB is an in-process SQL OLAP Engine powered by ClickHouse
Home-page: https://github.com/auxten/chdb
Author: auxten
Author-email: auxtenwpc@gmail.com
License: Apache-2.0
Location: /home/Clickhouse/.venv/lib/python3.9/site-packages
Requires: 
Required-by: 
---
Name: gensim
Version: 4.3.2
Summary: Python framework fo

In [5]:
import pandas as pd
import zipfile
import urllib.request
import os
import chdb
from chdb import session

# Download and extract the dataset
if not os.path.exists("ml-25m/ratings.csv"):
    url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    filehandle, _ = urllib.request.urlretrieve(url)
    zip_file_object = zipfile.ZipFile(filehandle, "r")
    zip_file_object.extractall()

!ls -l ml-25m

total 1129584
-rw-r--r-- 1 root root     10460 Dec 11 08:24 README.txt
-rw-r--r-- 1 root root 435164157 Dec 11 08:24 genome-scores.csv
-rw-r--r-- 1 root root     18103 Dec 11 08:24 genome-tags.csv
-rw-r--r-- 1 root root   1368578 Dec 11 08:24 links.csv
-rw-r--r-- 1 root root   3038099 Dec 11 08:24 movies.csv
-rw-r--r-- 1 root root 678260987 Dec 11 08:24 ratings.csv
-rw-r--r-- 1 root root  38810332 Dec 11 08:24 tags.csv


In [6]:
# Peek at the data
print(chdb.query("SELECT * FROM file('ml-25m/ratings.csv') LIMIT 5"))

1,296,5,1147880044
1,306,3.5,1147868817
1,307,5,1147868828
1,665,5,1147878820
1,899,3.5,1147868510



In [7]:
# Create tables for the tables of movieLens dataset
chs = session.Session()
chs.query("CREATE DATABASE IF NOT EXISTS movielens ENGINE = Atomic")
chs.query("USE movielens")
chs.query(
    "CREATE VIEW movies AS SELECT movieId, title, genres FROM file('ml-25m/movies.csv')"
)
chs.query(
    "CREATE VIEW ratings AS SELECT userId, movieId, rating, timestamp FROM file('ml-25m/ratings.csv')"
)
chs.query(
    "CREATE VIEW tags AS SELECT userId, movieId, tag, timestamp FROM file('ml-25m/tags.csv')"
)
print(chs.query("SELECT * FROM movies LIMIT 5", "CSVWithNames"))
print(chs.query("SELECT * FROM ratings LIMIT 5", "CSVWithNames"))
print(chs.query("SELECT * FROM tags LIMIT 5", "CSVWithNames"))

"movieId","title","genres"
1,"Toy Story (1995)","Adventure|Animation|Children|Comedy|Fantasy"
2,"Jumanji (1995)","Adventure|Children|Fantasy"
3,"Grumpier Old Men (1995)","Comedy|Romance"
4,"Waiting to Exhale (1995)","Comedy|Drama|Romance"
5,"Father of the Bride Part II (1995)","Comedy"

"userId","movieId","rating","timestamp"
1,296,5,1147880044
1,306,3.5,1147868817
1,307,5,1147868828
1,665,5,1147878820
1,899,3.5,1147868510

"userId","movieId","tag","timestamp"
3,260,"classic",1439472355
3,260,"sci-fi",1439472256
4,1732,"dark comedy",1573943598
4,1732,"great dialogue",1573943604
4,7569,"so bad it's good",1573943455



# Use word2vec to train the embeddings of movies

In [23]:
# Generate the movie id sequence from user ratings, the movies that have been rated >3.5 by users group by userId
# and concat with " ", order by timestamp
# The movie id sequence is used to generate the movie embedding,
# ie. user 1 rated movie 233, 21, 11 and user 2 rated movie 33, 11, 21
# then the movie id sequence is
# "233 21 11"
# "33 11 21"
movie_id_seq = chs.query("""SELECT arrayStringConcat(groupArray(movieId), ' ') FROM (
                            SELECT userId, movieId FROM ratings WHERE rating > 3.5  ORDER BY userId, timestamp
                            ) GROUP BY userId""")   


# Split the movie id sequence into list
moive_list = str(movie_id_seq).split("\n")

print("Length of movie list: ", len(moive_list))
print("First 5 movie list: ", moive_list[:5])


"858 1193 2959 50 183837 201773 122914 195159 8961 33794 6377 1203 904 912 2019 79132 58559 593 4226 122912 122916 4973 750 122926 68954 3504 955 4963 8984 53322 158783 45720 117887 178827 171253 1387 6533 49272 71745 154 7209 164909 1256 166461 2648 1340 5769 1198 1732 91094 1265 6893 1945 307 104283 3645 2716 1 78499 3114 201588 898 3546 1220 2936 50872 364 4262 200332 1079 195161 2700 946 1267 180031 953 176371 6273 111734 152970 35836 26393 164179 118466 25865 127202 56782 4361 1276 134853 5712 2065 61934 2761 145150 3362 3928 108932 112852 103980 7064 2423 8360 112450 141 82926 916 3039 2203 2132 3088 26171 951 1148 8641 60756 34162 106918 3948 1923 83976 1682 3988 1485 6373 180265 72294 3556 2987 2406 4681 188301 89745 59315 122920 288 38061 102125 2770 79702 110102 93840 104913 8798 142488 106920 6711 137857 111781 111759 648 1961 45186 189333 55820 139644 158872 69122 140174 93510 169992 122906 185029 5989 56152 4022 150 2797 2424 105504 107141 1042 3741 122882 189363 954 3836 

In [27]:
import multiprocessing
from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()

# Split the movie id sequence into a list of lists
movie_id_seq_list = [seq.strip("\"").split() for seq in moive_list]
print("Length of movie id sequence list: ", len(movie_id_seq_list))
print("First 5 movie id sequence list: ", movie_id_seq_list[:5])

# Train the Word2Vec model using CBOW
model = Word2Vec(sg=0, window=5, vector_size=16, min_count=1, workers=cores-1)
model.build_vocab(movie_id_seq_list, progress_per=10000)
print("Vocabulary size: ", len(model.wv))

model.train(movie_id_seq_list, total_examples=model.corpus_count, epochs=10, report_delay=1)

# Print model info
print("Vocabulary content: ", model.wv.index_to_key)


Length of movie id sequence list:  162343
First 5 movie id sequence list:  [['858', '1193', '2959', '50', '183837', '201773', '122914', '195159', '8961', '33794', '6377', '1203', '904', '912', '2019', '79132', '58559', '593', '4226', '122912', '122916', '4973', '750', '122926', '68954', '3504', '955', '4963', '8984', '53322', '158783', '45720', '117887', '178827', '171253', '1387', '6533', '49272', '71745', '154', '7209', '164909', '1256', '166461', '2648', '1340', '5769', '1198', '1732', '91094', '1265', '6893', '1945', '307', '104283', '3645', '2716', '1', '78499', '3114', '201588', '898', '3546', '1220', '2936', '50872', '364', '4262', '200332', '1079', '195161', '2700', '946', '1267', '180031', '953', '176371', '6273', '111734', '152970', '35836', '26393', '164179', '118466', '25865', '127202', '56782', '4361', '1276', '134853', '5712', '2065', '61934', '2761', '145150', '3362', '3928', '108932', '112852', '103980', '7064', '2423', '8360', '112450', '141', '82926', '916', '3039', '

# Test find similar movies

In [32]:
input_movie_id = 1
top_k = 10
print("Input movie: ", chs.query(f"SELECT title FROM movies WHERE movieId = {input_movie_id}", "CSV"))
print("Top 10 similar movies: ")
similar_movies = model.wv.most_similar(str(input_movie_id), topn=top_k)
print(chs.query(f"SELECT movieId, title FROM movies WHERE movieId IN ({','.join([str(m[0]) for m in similar_movies])})", "CSV"))

Input movie:  "Toy Story (1995)"

Top 10 similar movies: 
34,"Babe (1995)"
150,"Apollo 13 (1995)"
356,"Forrest Gump (1994)"
364,"Lion King, The (1994)"
480,"Jurassic Park (1993)"
588,"Aladdin (1992)"
595,"Beauty and the Beast (1991)"
1265,"Groundhog Day (1993)"
1270,"Back to the Future (1985)"
3114,"Toy Story 2 (1999)"

