# Import libraries

In [1]:
# Import libraries
import os
import math
import zipfile
from urllib.request import urlretrieve
import requests
import gzip
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
from typing import List, Tuple
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from scipy.sparse import hstack
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup


# Import data

In [2]:
URL_MOVIELENS = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
URL_IMBD_NAMES_BASICS = "https://datasets.imdbws.com/name.basics.tsv.gz"
URL_IMBD_TITLE_BASICS = "https://datasets.imdbws.com/title.basics.tsv.gz"
URL_IMBD_TITLE_RATINGS = "https://datasets.imdbws.com/title.ratings.tsv.gz"
# Local path where the file will be saved
LOCAL_MOVIELENS_PATH = "ml-1m.zip"
LOCAL_IMBD_NAMES_BASICS_PATH = "name.basics.tsv.gz"
LOCAL_IMBD_TITLE_BASICS_PATH = "title.basics.tsv.gz"
LOCAL_IMBD_TITLE_RATINGS_PATH = "title.ratings.tsv.gz"
# Directory where the dataset will be extracted
EXTRACT_DIR = "dataset"

In [3]:
# Function to download the file
def download_file(url, local_filename):
    print(f"Downloading {url} to {local_filename}")
    # Check if the file already exists
    if os.path.exists(local_filename):
        print(f"File {local_filename} already exists")
        return local_filename
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    print(f"Downloaded {url} to {local_filename}")
    return local_filename

# Function to unzip the file
def unzip_file(zip_path, extract_to):
    print(f"Unzipping {zip_path} to {extract_to}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Unzipped {zip_path} to {extract_to}")

def gunzip_file(gz_path, extract_to):
    print(f"Gunzipping {gz_path} to {extract_to}")
    with gzip.open(gz_path, 'rb') as f_in:
        with open(extract_to, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Gunzipped {gz_path} to {extract_to}")

In [4]:
# Ensure the dataset directory exists
os.makedirs(EXTRACT_DIR, exist_ok=True)
# Download the file
download_file(URL_MOVIELENS, LOCAL_MOVIELENS_PATH)
download_file(URL_IMBD_NAMES_BASICS, LOCAL_IMBD_NAMES_BASICS_PATH)
download_file(URL_IMBD_TITLE_BASICS, LOCAL_IMBD_TITLE_BASICS_PATH)
download_file(URL_IMBD_TITLE_RATINGS, LOCAL_IMBD_TITLE_RATINGS_PATH)

Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip to ml-1m.zip
File ml-1m.zip already exists
Downloading https://datasets.imdbws.com/name.basics.tsv.gz to name.basics.tsv.gz
File name.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.basics.tsv.gz to title.basics.tsv.gz
File title.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz to title.ratings.tsv.gz
File title.ratings.tsv.gz already exists


'title.ratings.tsv.gz'

In [5]:
# Extract the files
print("Unzipping file...")
unzip_file(LOCAL_MOVIELENS_PATH, EXTRACT_DIR)

print("Gunzipping files...")
gunzip_file(LOCAL_IMBD_NAMES_BASICS_PATH, os.path.join(EXTRACT_DIR, "name.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_BASICS_PATH, os.path.join(EXTRACT_DIR, "title.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_RATINGS_PATH, os.path.join(EXTRACT_DIR, "title.ratings.tsv"))

print("Extraction complete.")

Unzipping file...
Unzipping ml-1m.zip to dataset
Unzipped ml-1m.zip to dataset
Gunzipping files...
Gunzipping name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipped name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipping title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipped title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipping title.ratings.tsv.gz to dataset\title.ratings.tsv
Gunzipped title.ratings.tsv.gz to dataset\title.ratings.tsv
Extraction complete.


# Preprocess data

In [58]:
movies_dat_df = pd.read_csv('dataset/ml-1m/movies.dat', sep='::', header=None, engine='python', names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')
ratings = pd.read_csv('dataset/ml-1m/ratings.dat', sep='::', header=None, engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')
users = pd.read_csv('dataset/ml-1m/users.dat', sep='::', header=None, engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1')

name_basics_df = pd.read_csv('dataset/name.basics.tsv', sep='\t', header=0)
title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)
title_ratings_df = pd.read_csv('dataset/title.ratings.tsv', sep='\t', header=0)

# Filter only movies from title_basics_df
title_basics_df = title_basics_df[title_basics_df['titleType'] == 'movie']

# Preprocess the Title columns
movies_dat_df['Title'] = movies_dat_df['Title'].str.lower()
movies_dat_df['Title'] = movies_dat_df['Title'].str.replace(r"\(.*\)", "", regex=True).str.strip()

title_basics_df['primaryTitle'] = title_basics_df['primaryTitle'].str.lower().str.strip()

# ratings.drop(columns=['Timestamp'], inplace=True)



  title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)


Here, we do some simple data processing to fix the data types of the columns.

In [None]:
users["UserID"] = users["UserID"].apply(lambda x: f"user_{x}")
users["Age"] = users["Age"].apply(lambda x: f"group_{x}")
users["Occupation"] = users["Occupation"].apply(lambda x: f"occupation_{x}")
movies = pd.merge(movies_dat_df, title_basics_df, left_on='Title', right_on='primaryTitle', how='left')

# Replace NaN values with empty strings
movies['Genres'] = movies['Genres'].fillna('')
movies['genres'] = movies['genres'].fillna('')

# Replace '|' with ',' in Genres column and convert to lowercase
movies['Genres'] = movies['Genres'].str.replace('|', ',')
movies['Genres'] = movies['Genres'].str.lower()

# Convert genres column to lowercase
movies['genres'] = movies['genres'].str.lower()

#Delete the \\N values in the genres column
movies = movies[movies['genres'] != '\\n']

# Function to combine and clean genre columns
def combine_genres(row):
    genres = set(row['genres'].split(',')) | set(row['Genres'].split(','))
    genres.discard('')  # Remove empty strings
    return ','.join(sorted(genres))

# Apply the function to combine the genres
movies['combined_genres'] = movies.apply(combine_genres, axis=1)

# Drop the original genre columns
movies = movies.drop(columns=['genres', 'Genres', 'primaryTitle'])
movies = pd.merge(movies, title_ratings_df, on='tconst', how='left')
movies = movies.drop(columns=['tconst'])
# Drop the column endYear as it is always NaN
movies = movies.drop(columns=['endYear'])
# Drop also the column titleType as we will keep only the rated movies
movies = movies.drop(columns=['titleType'])
# Drop the original Title column as we will use the cleaned one
movies = movies.drop(columns=['Title'])
# For the runtimeMinutes column, we will replace the NaN values with the median
movies['runtimeMinutes'] = pd.to_numeric(movies['runtimeMinutes'], errors='coerce')
movies['runtimeMinutes'] = movies['runtimeMinutes'].fillna(movies['runtimeMinutes'].median())

ratings["MovieID"] = ratings["MovieID"].apply(lambda x: f"movie_{x}")
ratings["UserID"] = ratings["UserID"].apply(lambda x: f"user_{x}")
ratings["Rating"] = ratings["Rating"].apply(lambda x: float(x))


In [None]:
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,user_user_user_user_1,movie_movie_movie_movie_1193,5.0,978300760
1,user_user_user_user_1,movie_movie_movie_movie_661,3.0,978302109
2,user_user_user_user_1,movie_movie_movie_movie_914,3.0,978301968
3,user_user_user_user_1,movie_movie_movie_movie_3408,4.0,978300275
4,user_user_user_user_1,movie_movie_movie_movie_2355,5.0,978824291
...,...,...,...,...
1000204,user_user_user_user_6040,movie_movie_movie_movie_1091,1.0,956716541
1000205,user_user_user_user_6040,movie_movie_movie_movie_1094,5.0,956704887
1000206,user_user_user_user_6040,movie_movie_movie_movie_562,5.0,956704746
1000207,user_user_user_user_6040,movie_movie_movie_movie_1096,4.0,956715648


Each movie has multiple genres. We split them into separate columns in the `movies`
DataFrame.

In [None]:
all_genres = set()
for genres in movies['combined_genres'].str.split(','):
    all_genres.update(genres)
all_genres = list(all_genres)
for genre in all_genres:
    movies[genre] = movies["combined_genres"].apply(
        lambda values: int(genre in values.split(","))
    )

### Transform the movie ratings data into sequences

First, let's sort the the ratings data using the `unix_timestamp`, and then group the
`movie_id` values and the `rating` values by `user_id`.

The output DataFrame will have a record for each `user_id`, with two ordered lists
(sorted by rating datetime): the movies they have rated, and their ratings of these movies.


In [None]:
ratings_group = ratings.sort_values(by=["Timestamp"]).groupby("UserID")
ratings_data = pd.DataFrame(
    data={
        "UserID": list(ratings_group.groups.keys()),
        "MovieIDs": list(ratings_group.UserID.apply(list)),
        "Ratings": list(ratings_group.Rating.apply(list)),
        "Timestamps": list(ratings_group.Timestamp.apply(list)),
    }
)

In [None]:
ratings_data

Unnamed: 0,UserID,MovieIDs,Ratings,Timestamps
0,user_user_user_1,"[user_user_user_1, user_user_user_1, user_user...","[4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 4.0, 5.0, ...","[978300019, 978300055, 978300055, 978300055, 9..."
1,user_user_user_10,"[user_user_user_10, user_user_user_10, user_us...","[4.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 5.0, 3.0, ...","[978224375, 978224375, 978224375, 978224400, 9..."
2,user_user_user_100,"[user_user_user_100, user_user_user_100, user_...","[4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 1.0, 1.0, 5.0, ...","[977593595, 977593595, 977593607, 977593624, 9..."
3,user_user_user_1000,"[user_user_user_1000, user_user_user_1000, use...","[4.0, 5.0, 4.0, 3.0, 5.0, 5.0, 2.0, 5.0, 5.0, ...","[975040566, 975040566, 975040566, 975040629, 9..."
4,user_user_user_1001,"[user_user_user_1001, user_user_user_1001, use...","[4.0, 4.0, 4.0, 2.0, 2.0, 1.0, 4.0, 5.0, 5.0, ...","[975039591, 975039702, 975039702, 975039898, 9..."
...,...,...,...,...
6035,user_user_user_995,"[user_user_user_995, user_user_user_995, user_...","[2.0, 4.0, 5.0, 3.0, 3.0, 4.0, 4.0, 4.0, 3.0, ...","[975054785, 975054785, 975054785, 975054853, 9..."
6036,user_user_user_996,"[user_user_user_996, user_user_user_996, user_...","[4.0, 3.0, 5.0, 3.0, 5.0, 5.0, 5.0, 5.0, 4.0, ...","[975052132, 975052132, 975052195, 975052284, 9..."
6037,user_user_user_997,"[user_user_user_997, user_user_user_997, user_...","[4.0, 3.0, 3.0, 3.0, 2.0, 5.0, 5.0, 5.0, 4.0, ...","[975044235, 975044425, 975044426, 975044426, 9..."
6038,user_user_user_998,"[user_user_user_998, user_user_user_998, user_...","[3.0, 4.0, 5.0, 5.0, 4.0, 3.0, 4.0, 3.0, 4.0, ...","[975043499, 975043593, 975043593, 975043593, 9..."
