In [64]:
%pip install --upgrade --quiet tensorflow-cpu chdb pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [65]:
import pandas as pd
import zipfile
import urllib.request
import os
import chdb
from chdb import session

# Download and extract the dataset
if not os.path.exists("ml-25m/ratings.csv"):
    url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    filehandle, _ = urllib.request.urlretrieve(url)
    zip_file_object = zipfile.ZipFile(filehandle, "r")
    zip_file_object.extractall()

!ls -l ml-25m

total 2259160
-rw-r--r--  1 auxten  staff      10460 Dec 10 18:09 README.txt
-rw-r--r--  1 auxten  staff  435164157 Dec 10 18:09 genome-scores.csv
-rw-r--r--  1 auxten  staff      18103 Dec 10 18:09 genome-tags.csv
-rw-r--r--  1 auxten  staff    1368578 Dec 10 18:09 links.csv
-rw-r--r--  1 auxten  staff    3038099 Dec 10 18:09 movies.csv
-rw-r--r--  1 auxten  staff  678260987 Dec 10 18:09 ratings.csv
-rw-r--r--  1 auxten  staff   38810332 Dec 10 18:09 tags.csv


In [66]:
# Preview the first five rows of the raw ratings file, queried in place with chdb
ratings_preview = chdb.query("SELECT * FROM file('ml-25m/ratings.csv') LIMIT 5")
print(ratings_preview)

1,296,5,1147880044
1,306,3.5,1147868817
1,307,5,1147868828
1,665,5,1147878820
1,899,3.5,1147868510



## Create views over the MovieLens dataset files

In [74]:
# Create views over the MovieLens CSV files; each view reads its file through file().
# CREATE OR REPLACE keeps this cell re-runnable against a database that already
# holds the views, matching the IF NOT EXISTS used for the database itself.
chs = session.Session()
chs.query("CREATE DATABASE IF NOT EXISTS movielens ENGINE = Atomic")
chs.query("USE movielens")
chs.query(
    "CREATE OR REPLACE VIEW movies AS SELECT movieId, title, genres FROM file('ml-25m/movies.csv')"
)
chs.query(
    "CREATE OR REPLACE VIEW ratings AS SELECT userId, movieId, rating, timestamp FROM file('ml-25m/ratings.csv')"
)
chs.query(
    "CREATE OR REPLACE VIEW tags AS SELECT userId, movieId, tag, timestamp FROM file('ml-25m/tags.csv')"
)
# Show the first rows of every view, with column names, as a sanity check
print(chs.query("SELECT * FROM movies LIMIT 5", "CSVWithNames"))
print(chs.query("SELECT * FROM ratings LIMIT 5", "CSVWithNames"))
print(chs.query("SELECT * FROM tags LIMIT 5", "CSVWithNames"))

"movieId","title","genres"
1,"Toy Story (1995)","Adventure|Animation|Children|Comedy|Fantasy"
2,"Jumanji (1995)","Adventure|Children|Fantasy"
3,"Grumpier Old Men (1995)","Comedy|Romance"
4,"Waiting to Exhale (1995)","Comedy|Drama|Romance"
5,"Father of the Bride Part II (1995)","Comedy"

"userId","movieId","rating","timestamp"
1,296,5,1147880044
1,306,3.5,1147868817
1,307,5,1147868828
1,665,5,1147878820
1,899,3.5,1147868510

"userId","movieId","tag","timestamp"
3,260,"classic",1439472355
3,260,"sci-fi",1439472256
4,1732,"dark comedy",1573943598
4,1732,"great dialogue",1573943604
4,7569,"so bad it's good",1573943455



## Create a view joining movies and ratings, then split it into train and test sets

In [90]:
# Join ratings with movies and derive a binary label:
# liked = 1 when the rating is above 3.5, otherwise 0.
user_ratings_ddl = """
    CREATE OR REPLACE VIEW user_ratings AS
        SELECT ratings.userId userId, ratings.movieId movieId, movies.title title, genres,
            CASE WHEN rating > 3.5 THEN 1 ELSE 0 END AS liked
        FROM ratings
        JOIN movies USING movieId
    """
chs.query(user_ratings_ddl)

# Show a few labelled rows
labelled_preview = chs.query("SELECT * FROM user_ratings LIMIT 5", "CSVWithNames")
print(labelled_preview)

# Partition by userId so each user lands in exactly one split:
# userId % 10 in 0..7 goes to train, 8..9 goes to test.
split_ddl = """
    CREATE OR REPLACE VIEW train AS
        SELECT userId, movieId, liked
        FROM user_ratings
        WHERE userId % 10 < 8;
    CREATE OR REPLACE VIEW test AS
        SELECT userId, movieId, liked
        FROM user_ratings
        WHERE userId % 10 >= 8;
    """
chs.query(split_ddl)

# Report how many rows each split contains
train_row_count = chs.query("SELECT COUNT(*) FROM train")
print("Training rows:", train_row_count)
test_row_count = chs.query("SELECT COUNT(*) FROM test")
print("Test rows:", test_row_count)

"userId","movieId","title","genres","liked"
1,296,"Pulp Fiction (1994)","Comedy|Crime|Drama|Thriller",1
1,306,"Three Colors: Red (Trois couleurs: Rouge) (1994)","Drama",0
1,307,"Three Colors: Blue (Trois couleurs: Bleu) (1993)","Drama",1
1,665,"Underground (1995)","Comedy|Drama|War",1
1,899,"Singin' in the Rain (1952)","Comedy|Musical|Romance",0

Training rows: 19994500

Test rows: 5005595



## Train a DNN model to predict if a user will like a movie or not

In [None]:
import io

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# --- Configuration -----------------------------------------------------------
SEED = 42
# Keep rows where (userId + movieId) % SAMPLE_MODULO == 0, i.e. roughly
# 1/SAMPLE_MODULO of the training rows. Set to 1 to train on every row.
SAMPLE_MODULO = 20
EPOCHS = 10
BATCH_SIZE = 1024

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling repeat
tf.keras.utils.set_random_seed(SEED)

# --- Load the data -----------------------------------------------------------
# The `train` view only exposes userId, movieId and liked, so title and genres
# are joined back in from `movies`.
train_result = chs.query(
    f"""
    SELECT userId, movieId, title, genres, liked
    FROM train
    JOIN movies USING movieId
    WHERE (userId + movieId) % {SAMPLE_MODULO} = 0
    """,
    "CSVWithNames",
)
# str(result) is the CSV text (the same text print() shows); parse it with pandas
# so quoted titles containing commas are handled correctly.
ratings_df = pd.read_csv(io.StringIO(str(train_result)))
if ratings_df.empty:
    raise ValueError("The training query returned no rows; check the train view")

# --- Feature engineering (done once, before splitting, so the training and
# validation sets share exactly the same columns) ------------------------------
# Release year: the 4-digit number in the trailing parentheses of the title.
# Titles without one yield NaN and are filled with the median year.
release_year = pd.to_numeric(
    ratings_df["title"].str.extract(r"\((\d{4})\)\s*$", expand=False)
)
release_year = release_year.fillna(release_year.median()).rename("year")

# Genres: "A|B|C" becomes one 0/1 indicator column per genre
genre_flags = ratings_df["genres"].str.get_dummies(sep="|")

feature_df = pd.concat(
    [ratings_df[["userId", "movieId"]], release_year, genre_flags], axis=1
)

# Split the data into features and labels; `liked` is the label and is
# deliberately not part of the features.
X = feature_df.to_numpy(dtype=np.float32)
y = ratings_df["liked"].to_numpy(dtype=np.float32)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Standardise with statistics from the training portion only
feature_mean = X_train.mean(axis=0)
feature_std = X_train.std(axis=0)
feature_std[feature_std == 0] = 1.0  # constant columns: avoid division by zero
X_train = (X_train - feature_mean) / feature_std
X_val = (X_val - feature_mean) / feature_std

# --- Model -------------------------------------------------------------------
# `liked` is binary, so the output is a sigmoid probability trained with
# binary cross-entropy.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

# Evaluate the model: RMSE between predicted probabilities and the 0/1 labels
y_pred = model.predict(X_val, batch_size=BATCH_SIZE).ravel()
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE:", rmse)