# Import

In [1]:
import os
import requests
import zipfile
import gzip
import shutil
import pandas as pd
import numpy as np

# Data Collection and Preprocessing

## Data Collection

### Downloading the Data

In [2]:
# Remote sources: the MovieLens 1M archive and three IMDb dataset dumps
URL_MOVIELENS = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
URL_IMBD_NAMES_BASICS = "https://datasets.imdbws.com/name.basics.tsv.gz"
URL_IMBD_TITLE_BASICS = "https://datasets.imdbws.com/title.basics.tsv.gz"
URL_IMBD_TITLE_RATINGS = "https://datasets.imdbws.com/title.ratings.tsv.gz"

# Each archive is saved in the working directory under the last path
# segment of its URL (e.g. ".../ml-1m.zip" -> "ml-1m.zip")
LOCAL_MOVIELENS_PATH = URL_MOVIELENS.rsplit("/", 1)[-1]
LOCAL_IMBD_NAMES_BASICS_PATH = URL_IMBD_NAMES_BASICS.rsplit("/", 1)[-1]
LOCAL_IMBD_TITLE_BASICS_PATH = URL_IMBD_TITLE_BASICS.rsplit("/", 1)[-1]
LOCAL_IMBD_TITLE_RATINGS_PATH = URL_IMBD_TITLE_RATINGS.rsplit("/", 1)[-1]

# Directory that receives the extracted files
EXTRACT_DIR = "dataset"

In [3]:
# Function to download the file
def download_file(url, local_filename, timeout=30):
    """Download ``url`` to ``local_filename`` unless that file already exists.

    The response body is streamed into a temporary ``<local_filename>.part``
    file which is renamed into place only once the transfer has finished.
    An interrupted download therefore never leaves a truncated file that the
    existence check would accept as complete on a later run.

    Args:
        url: Address of the file to fetch.
        local_filename: Destination path on disk.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        ``local_filename``, whether the file was downloaded or already present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    print(f"Downloading {url} to {local_filename}")
    # Check if the file already exists
    if os.path.exists(local_filename):
        print(f"File {local_filename} already exists")
        return local_filename

    partial_filename = f"{local_filename}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file only after every chunk has been written
        os.replace(partial_filename, local_filename)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete temporary file.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
    print(f"Downloaded {url} to {local_filename}")
    return local_filename

# Function to unzip the file
def unzip_file(zip_path, extract_to):
    """Extract every member of the zip archive ``zip_path`` into ``extract_to``.

    Progress is reported on stdout before and after the extraction.
    """
    print(f"Unzipping {zip_path} to {extract_to}")
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_to)
    print(f"Unzipped {zip_path} to {extract_to}")

def gunzip_file(gz_path, extract_to):
    """Decompress the gzip file ``gz_path`` and write the result to ``extract_to``.

    ``extract_to`` is the path of the output file (not a directory); it is
    overwritten if it already exists. Progress is reported on stdout.
    """
    print(f"Gunzipping {gz_path} to {extract_to}")
    with gzip.open(gz_path, 'rb') as compressed, open(extract_to, 'wb') as plain:
        # Stream the data across so the whole file never sits in memory
        shutil.copyfileobj(compressed, plain)
    print(f"Gunzipped {gz_path} to {extract_to}")

In [4]:
# Ensure the dataset directory exists
os.makedirs(EXTRACT_DIR, exist_ok=True)

# Download every archive. Looping over (url, local path) pairs replaces four
# copy-pasted calls and keeps the return value of the last call from being
# echoed as the cell's output.
for archive_url, archive_path in [
    (URL_MOVIELENS, LOCAL_MOVIELENS_PATH),
    (URL_IMBD_NAMES_BASICS, LOCAL_IMBD_NAMES_BASICS_PATH),
    (URL_IMBD_TITLE_BASICS, LOCAL_IMBD_TITLE_BASICS_PATH),
    (URL_IMBD_TITLE_RATINGS, LOCAL_IMBD_TITLE_RATINGS_PATH),
]:
    download_file(archive_url, archive_path)

Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip to ml-1m.zip
File ml-1m.zip already exists
Downloading https://datasets.imdbws.com/name.basics.tsv.gz to name.basics.tsv.gz
File name.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.basics.tsv.gz to title.basics.tsv.gz
File title.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz to title.ratings.tsv.gz
File title.ratings.tsv.gz already exists


'title.ratings.tsv.gz'

In [5]:
# Unpack the MovieLens zip first, then decompress each IMDb dump into EXTRACT_DIR
print("Unzipping file...")
unzip_file(LOCAL_MOVIELENS_PATH, EXTRACT_DIR)

print("Gunzipping files...")
for gz_archive, tsv_name in (
    (LOCAL_IMBD_NAMES_BASICS_PATH, "name.basics.tsv"),
    (LOCAL_IMBD_TITLE_BASICS_PATH, "title.basics.tsv"),
    (LOCAL_IMBD_TITLE_RATINGS_PATH, "title.ratings.tsv"),
):
    gunzip_file(gz_archive, os.path.join(EXTRACT_DIR, tsv_name))

print("Extraction complete.")

Unzipping file...
Unzipping ml-1m.zip to dataset
Unzipped ml-1m.zip to dataset
Gunzipping files...
Gunzipping name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipped name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipping title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipped title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipping title.ratings.tsv.gz to dataset\title.ratings.tsv
Gunzipped title.ratings.tsv.gz to dataset\title.ratings.tsv
Extraction complete.


## Preprocessing

In [6]:
# MovieLens 1M files use the multi-character separator '::', which only the
# python parsing engine supports. Paths are built from EXTRACT_DIR so they
# follow the directory the archives were extracted into.
movies_dat_df = pd.read_csv(
    os.path.join(EXTRACT_DIR, 'ml-1m', 'movies.dat'),
    sep='::', header=None, engine='python',
    names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1',
)
ratings_dat_df = pd.read_csv(
    os.path.join(EXTRACT_DIR, 'ml-1m', 'ratings.dat'),
    sep='::', header=None, engine='python',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1',
)
users_dat_df = pd.read_csv(
    os.path.join(EXTRACT_DIR, 'ml-1m', 'users.dat'),
    sep='::', header=None, engine='python',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1',
)

name_basics_df = pd.read_csv(os.path.join(EXTRACT_DIR, 'name.basics.tsv'), sep='\t', header=0)
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-dtype warning this file
# otherwise triggers.
title_basics_df = pd.read_csv(
    os.path.join(EXTRACT_DIR, 'title.basics.tsv'), sep='\t', header=0, low_memory=False
)
title_ratings_df = pd.read_csv(os.path.join(EXTRACT_DIR, 'title.ratings.tsv'), sep='\t', header=0)

# Filter only movies from title_basics_df; .copy() gives an independent frame
# so the column assignment below does not write into a slice of the original
title_basics_df = title_basics_df[title_basics_df['titleType'] == 'movie'].copy()

# Preprocess the Title columns: lowercase, then strip everything from the
# first '(' to the last ')' (the regex is greedy) and surrounding whitespace
movies_dat_df['Title'] = movies_dat_df['Title'].str.lower()
movies_dat_df['Title'] = movies_dat_df['Title'].str.replace(r"\(.*\)", "", regex=True).str.strip()

title_basics_df['primaryTitle'] = title_basics_df['primaryTitle'].str.lower().str.strip()

# The timestamp is not needed further down; reassign rather than mutate in place
ratings_dat_df = ratings_dat_df.drop(columns=['Timestamp'])



  title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)


In [54]:
# Left-join the MovieLens movies to the IMDb titles on the normalised title.
# NOTE(review): the join is one-to-many whenever several IMDb movies share a
# title, so a MovieID can appear on more than one row of features_df.
features_df = pd.merge(movies_dat_df, title_basics_df, left_on='Title', right_on='primaryTitle', how='left')

# Replace NaN values with empty strings
features_df['Genres'] = features_df['Genres'].fillna('')
features_df['genres'] = features_df['genres'].fillna('')

# Replace '|' with ',' in Genres column and convert to lowercase.
# regex=False forces a literal match: '|' is the regex alternation operator
# and the default of `regex` differs between pandas versions.
features_df['Genres'] = features_df['Genres'].str.replace('|', ',', regex=False)
features_df['Genres'] = features_df['Genres'].str.lower()

# Convert genres column to lowercase
features_df['genres'] = features_df['genres'].str.lower()

# Drop rows whose IMDb genres are the null marker '\N' (it reads '\n' here
# because the column was lowercased above). .copy() detaches the result so
# adding a column below does not write into a slice.
features_df = features_df[features_df['genres'] != '\\n'].copy()

# Function to combine and clean genre columns
def combine_genres(row):
    """Return the sorted, comma-joined union of a row's 'genres' and 'Genres' values."""
    genres = set(row['genres'].split(',')) | set(row['Genres'].split(','))
    genres.discard('')  # Remove empty strings
    return ','.join(sorted(genres))

# Apply the function to combine the genres
features_df['combined_genres'] = features_df.apply(combine_genres, axis=1)

# Drop the original genre columns
features_df = features_df.drop(columns=['genres', 'Genres', 'primaryTitle'])
features_df = pd.merge(features_df, title_ratings_df, on='tconst', how='left')
# Drop the columns that are no longer needed:
#   tconst    - only served as the join key for the ratings merge above
#   endYear   - presumably carries no information for movies (IMDb describes it as the end year of TV series)
#   titleType - title_basics_df was already filtered to movies
#   Title     - the normalised MovieLens title, only used as the join key
features_df = features_df.drop(columns=['tconst', 'endYear', 'titleType', 'Title'])
# For the runtimeMinutes column, coerce non-numeric entries to NaN and then
# replace the NaN values with the median
features_df['runtimeMinutes'] = pd.to_numeric(features_df['runtimeMinutes'], errors='coerce')
features_df['runtimeMinutes'] = features_df['runtimeMinutes'].fillna(features_df['runtimeMinutes'].median())
features_df

Unnamed: 0,MovieID,originalTitle,isAdult,startYear,runtimeMinutes,combined_genres,averageRating,numVotes
0,1,Toy Story,0,1995,81.0,"adventure,animation,children's,comedy",8.3,1074033.0
1,2,Jumanji,0,1995,104.0,"adventure,children's,comedy,family,fantasy",7.1,379284.0
2,3,Grumpier Old Men,0,1995,101.0,"comedy,romance",6.6,29842.0
3,4,Waiting to Exhale,0,1995,124.0,"comedy,drama,romance",6.0,12281.0
4,5,Father of the Bride Part II,0,1995,106.0,"comedy,family,romance",6.1,41883.0
...,...,...,...,...,...,...,...,...
8372,3950,Tigerland,0,2000,101.0,"drama,war",6.9,43453.0
8373,3950,Taken by the Tiger,0,2019,91.0,"documentary,drama",6.9,72.0
8374,3950,Tigerland,0,\N,97.0,drama,,
8375,3951,Two Family House,0,2000,108.0,"comedy,drama,romance",7.2,1732.0


In [55]:

# Keep only the ratings whose movie is present in features_df. A membership
# test is used instead of an inner merge because features_df can hold several
# rows per MovieID; merging would copy every matching rating once per such row
# only for the aggregation below to collapse the copies again.
has_features = ratings_dat_df['MovieID'].isin(features_df['MovieID'])
ratings_with_movies = ratings_dat_df.loc[has_features, ['UserID', 'MovieID', 'Rating']]

# Aggregate so that each (UserID, MovieID) pair occurs exactly once, which
# pivot() requires
ratings_with_movies = ratings_with_movies.groupby(['UserID', 'MovieID'], as_index=False).agg({'Rating': 'mean'})

# Pivot the data to create the user-item interaction matrix. Pairs without a
# rating are already NaN after the pivot, so no fill/replace step is needed.
user_item_matrix = ratings_with_movies.pivot(index='UserID', columns='MovieID', values='Rating')
print(user_item_matrix)

MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1         5.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6036      NaN   NaN   NaN   2.0   NaN   3.0   NaN   NaN   NaN   NaN  ...   
6037      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6038      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6039      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6040      3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

MovieID  39