In [15]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [16]:
# Read the cleaned movie metadata table that the feature steps below build on.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine running the notebook.
movies_csv_path = "D:/movie_recommendation_system/rcmndn_model/data/processed/movies_cleaned.csv"
movies = pd.read_csv(movies_csv_path)


In [17]:
# Step 1: Apply MultiLabelBinarizer on genres_y
import ast


def _parse_genre_list(raw):
    """Convert one ``genres_y`` cell into a list of genre names.

    The CSV stores each genre list as its string form (``"[...]"``), so the
    text has to be parsed back into a Python list before it can be binarized.
    ``ast.literal_eval`` is used instead of ``eval`` because it only accepts
    Python literals and therefore cannot execute code embedded in the file.

    Parameters
    ----------
    raw : object
        Cell value taken from ``movies['genres_y']``.

    Returns
    -------
    list
        The parsed genres; an empty list when the cell holds no usable value
        (e.g. NaN from a blank CSV field).

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    if isinstance(raw, (list, tuple, set)):
        # Already a collection (column was not round-tripped through CSV).
        return list(raw)
    # Missing value: treat as "no genres" so the row becomes all zeros.
    return []


mlb = MultiLabelBinarizer()
# One row per movie, one 0/1 column per distinct genre found in the data.
genres_encoded = mlb.fit_transform(movies['genres_y'].apply(_parse_genre_list))

In [18]:
# Step 2: Wrap the encoded genre matrix in a DataFrame.
# Columns are labelled with the classes learned by the binarizer and rows reuse
# the index of `movies`, so the two frames stay aligned row-for-row.
genres_df = pd.DataFrame(
    data=genres_encoded,
    index=movies.index,
    columns=mlb.classes_,
)


In [19]:
# Step 3: Put the identifying columns next to the genre indicators
# (optional — this frame is only used for inspection below).
id_and_title = movies[['movieId', 'title']]
movies_genres = pd.concat(objs=[id_and_title, genres_df], axis=1)


In [20]:
# Report the encoding result: a preview of the first rows and the number of
# distinct genre columns the binarizer produced.
genre_count = len(mlb.classes_)
preview = movies_genres.head(5)

print(" Genres encoded with MultiLabelBinarizer")
print(preview)
print(f"Total genres: {genre_count}")

 Genres encoded with MultiLabelBinarizer
   movieId                           title  Action  Adventure  Animation  \
0        1                Toy Story (1995)       0          0          1   
1       10                GoldenEye (1995)       1          1          0   
2       11  American President, The (1995)       0          0          0   
3       14                    Nixon (1995)       0          0          0   
4       15         Cutthroat Island (1995)       1          1          0   

   Comedy  Crime  Documentary  Drama  Family  ...  History  Horror  Music  \
0       1      0            0      0       1  ...        0       0      0   
1       0      0            0      0       0  ...        0       0      0   
2       1      0            0      1       0  ...        0       0      0   
3       0      0            0      1       0  ...        1       0      0   
4       0      0            0      0       0  ...        0       0      0   

   Mystery  Romance  Science Fiction  T

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 4: TF-IDF on overview
# Vocabulary is capped at 5000 terms and English stop words are dropped.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")

# Fit & transform the overview column.
# Missing overviews are replaced with an empty string first: the vectorizer
# rejects NaN documents, and an empty document simply yields an all-zero row.
overview_text = movies["overview"].fillna("")
tfidf_matrix = tfidf.fit_transform(overview_text)

print(" TF-IDF matrix created")
print(f"Shape: {tfidf_matrix.shape}")  # (n_movies, at most 5000 features)


 TF-IDF matrix created
Shape: (3537, 5000)


In [22]:
from sklearn.preprocessing import MinMaxScaler

# Numeric features: pick the raw numeric columns and rescale each one
# with MinMaxScaler so they share a common range.
numeric_cols = ["runtime", "vote_average", "popularity"]
numeric_features = movies[numeric_cols]

scaler = MinMaxScaler()
scaler.fit(numeric_features)
numeric_scaled = scaler.transform(numeric_features)

print(" Numeric features scaled")
print(f"Shape: {numeric_scaled.shape}")  # (n_movies, 3)
print("Sample (first 5 rows):")
first_rows = numeric_scaled[:5]
print(first_rows)


 Numeric features scaled
Shape: (3537, 3)
Sample (first 5 rows):
[[0.24454148 0.68493151 0.08409696]
 [0.45851528 0.53424658 0.06831773]
 [0.35371179 0.52054795 0.01261963]
 [0.72925764 0.60273973 0.00429755]
 [0.41048035 0.4109589  0.00801985]]


In [23]:
from scipy.sparse import csr_matrix, hstack

# Stack the three feature groups side by side into one sparse matrix:
#   - genres_encoded : 0/1 genre indicators (a dense array, since the
#                      binarizer was created with its default settings)
#   - tfidf_matrix   : TF-IDF weights of the overview text (sparse)
#   - numeric_scaled : scaled numeric columns (dense, wrapped as CSR here)
feature_blocks = [
    genres_encoded,
    tfidf_matrix,
    csr_matrix(numeric_scaled),
]
X_content = hstack(feature_blocks)

print(" Final Content Feature Matrix created")
print(f"Shape: {X_content.shape}")


 Final Content Feature Matrix created
Shape: (3537, 5023)


In [24]:
from scipy.sparse import save_npz
import os

# Directory that receives the feature matrix; it is created on demand.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
processed_dir = "D:/movie_recommendation_system/rcmndn_model/trained_models"
os.makedirs(processed_dir, exist_ok=True)

# Write the sparse feature matrix to disk in SciPy's .npz format.
x_content_file = os.path.join(processed_dir, "X_content.npz")
save_npz(x_content_file, X_content)

print(f" Final Content Feature Matrix saved at: {processed_dir}/X_content.npz")


 Final Content Feature Matrix saved at: D:/movie_recommendation_system/rcmndn_model/trained_models/X_content.npz


In [25]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# --- Release Year ---
# Parse release_date (unparseable values become NaT), keep only the year,
# and replace missing years with the median year.
parsed_years = pd.to_datetime(movies["release_date"], errors="coerce").dt.year
movies["release_year"] = parsed_years.fillna(parsed_years.median())

# --- Vote Count ---
# Missing counts are treated as 0 first, then log1p is applied to reduce skew.
vote_counts_filled = movies["vote_count"].fillna(0)
movies["vote_count_log"] = np.log1p(vote_counts_filled)


In [26]:
# Re-scale the numeric block, now including the two engineered columns.
# NOTE: this rebinds `scaler`, `numeric_features` and `numeric_scaled`, replacing
# the three-column versions created earlier in the notebook.
numeric_cols_v2 = [
    "runtime",
    "vote_average",
    "popularity",
    "release_year",
    "vote_count_log",
]
numeric_features = movies[numeric_cols_v2]

scaler = MinMaxScaler()
scaler.fit(numeric_features)
numeric_scaled = scaler.transform(numeric_features)

print(" Numeric features scaled")
print(f"Shape: {numeric_scaled.shape}")
print("Sample (first 5 rows):")
first_rows_v2 = numeric_scaled[:5]
print(first_rows_v2)


 Numeric features scaled
Shape: (3537, 5)
Sample (first 5 rows):
[[0.24454148 0.68493151 0.08409696 0.79       0.89143942]
 [0.45851528 0.53424658 0.06831773 0.79       0.72159049]
 [0.35371179 0.52054795 0.01261963 0.79       0.5189042 ]
 [0.72925764 0.60273973 0.00429755 0.79       0.40556515]
 [0.41048035 0.4109589  0.00801985 0.79       0.47837237]]


In [27]:
from scipy.sparse import hstack, csr_matrix
from scipy.sparse import save_npz
import os

# Build the v2 feature matrix by stacking, column-wise:
# genre indicators + overview TF-IDF (sparse) + the re-scaled numeric columns
# (dense, wrapped as CSR so it can be stacked with the others).
numeric_block_v2 = csr_matrix(numeric_scaled)
X_content_v2 = hstack([genres_encoded, tfidf_matrix, numeric_block_v2])

print(" Final Content Feature Matrix (v2) created")
print(f"Shape: {X_content_v2.shape}")


 Final Content Feature Matrix (v2) created
Shape: (3537, 5025)


In [28]:
# Persist the v2 matrix in the trained_models directory (created if absent).
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
trained_dir = "D:/movie_recommendation_system/rcmndn_model/trained_models"
os.makedirs(trained_dir, exist_ok=True)

x_content_v2_file = os.path.join(trained_dir, "X_content_v2.npz")
save_npz(x_content_v2_file, X_content_v2)

print(f" Final Content Feature Matrix (v2) saved at: {trained_dir}/X_content_v2.npz")


 Final Content Feature Matrix (v2) saved at: D:/movie_recommendation_system/rcmndn_model/trained_models/X_content_v2.npz
