![dvd_image](dvd_image.jpg)

A DVD rental company needs your help! They want to figure out how many days a customer will rent a DVD for based on some features and has approached you for help. They want you to try out some regression models which will help predict the number of days a customer will rent a DVD for. The company wants a model which yeilds a MSE of 3 or less on a test set. The model you make will help the company become more efficient inventory planning.

The data they provided is in the csv file `rental_info.csv`. It has the following features:
- `"rental_date"`: The date (and time) the customer rents the DVD.
- `"return_date"`: The date (and time) the customer returns the DVD.
- `"amount"`: The amount paid by the customer for renting the DVD.
- `"amount_2"`: The square of `"amount"`.
- `"rental_rate"`: The rate at which the DVD is rented for.
- `"rental_rate_2"`: The square of `"rental_rate"`.
- `"release_year"`: The year the movie being rented was released.
- `"length"`: Lenght of the movie being rented, in minuites.
- `"length_2"`: The square of `"length"`.
- `"replacement_cost"`: The amount it will cost the company to replace the DVD.
- `"special_features"`: Any special features, for example trailers/deleted scenes that the DVD also has.
- `"NC-17"`, `"PG"`, `"PG-13"`, `"R"`: These columns are dummy variables of the rating of the movie. It takes the value 1 if the move is rated as the column name and 0 otherwise. For your convinience, the reference dummy has already been dropped.

In [50]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader,TensorDataset

In [51]:
# Load the dataset
df = pd.read_csv("rental_info.csv")  # Load the dataset containing rental information

# Convert the 'return_date' and 'rental_date' columns to datetime format
df["return_date"] = pd.to_datetime(df["return_date"])  # Convert return_date to datetime
df["rental_date"] = pd.to_datetime(df["rental_date"])  # Convert rental_date to datetime

# Calculate the rental length in days
df["rental_length_days"] = (df["return_date"] - df["rental_date"]).dt.days  # Subtract dates to get rental length

# Create dummy columns based on the values in 'special_features'
df["deleted_scenes"] = df["special_features"].apply(lambda x: 1 if "Deleted Scenes" in str(x) else 0)  # Mark if Deleted Scenes exists
df["behind_the_scenes"] = df["special_features"].apply(lambda x: 1 if "Behind the Scenes" in str(x) else 0)  # Mark if Behind the Scenes exists
df["trailers"] = df["special_features"].apply(lambda x: 1 if "Trailers" in str(x) else 0)  # Mark if Trailers exists
df["commentaries"] = df["special_features"].apply(lambda x: 1 if "Commentaries" in str(x) else 0)  # Mark if Commentaries exists

# Drop unnecessary columns
df.drop(["return_date", "rental_date", "special_features"], axis=1, inplace=True)  # Remove columns no longer needed

# Define target variable (Y) and feature variables (X)
Y = df[["rental_length_days"]]  # Target variable: rental length in days
X = df.drop("rental_length_days", axis=1)  # Feature variables

In [52]:
# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=9)  # 80% training, 20% testing

# Standardize the training features
scaler_train = StandardScaler()  # Initialize a StandardScaler
X_train_scale = torch.tensor(scaler_train.fit_transform(X_train), dtype=torch.float)  # Scale and convert to PyTorch tensor
Y_train = torch.tensor(Y_train.values, dtype=torch.float)  # Convert target to PyTorch tensor

# Standardize the testing features
scaler_test = StandardScaler()  # Initialize a StandardScaler for test set
X_test_scale = torch.tensor(scaler_test.fit_transform(X_test), dtype=torch.float)  # Scale and convert to PyTorch tensor
Y_test = torch.tensor(Y_test.values, dtype=torch.float)  # Convert target to PyTorch tensor

# Create DataLoader objects for training and testing
dataset_train = TensorDataset(X_train_scale, Y_train)  # Combine features and target into a dataset
dataloader_train = DataLoader(dataset_train, batch_size=10, shuffle=True)  # DataLoader for training

dataset_test = TensorDataset(X_test_scale, Y_test)  # Combine features and target into a dataset
dataloader_test = DataLoader(dataset_test, batch_size=10, shuffle=False)  # DataLoader for testing

In [55]:
# Define the Multi-Layer Neural Network
class MultiLayerNN(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.relu = nn.ReLU()  # ReLU activation function
        self.fc1 = nn.Linear(in_dim, hidden_dim)  # First hidden layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # Second hidden layer
        self.fc3 = nn.Linear(hidden_dim, out_dim)  # Output layer
    
    def forward(self, x):
        # Forward pass through the network
        x = self.relu(self.fc1(x))  # First hidden layer
        x = self.relu(self.fc2(x))  # Second hidden layer
        x = self.fc3(x)  # Output layer (no activation for regression)
        return x

# Initialize model, loss function, and optimizer
learning_rate = 0.01  # Learning rate for optimizer
epochs = 1000  # Number of training epochs
model = MultiLayerNN(X_train_scale.shape[1], 256, 1)  # Initialize the model with input, hidden, and output dimensions
criterion = nn.MSELoss()  # Mean Squared Error Loss for regression
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  # Stochastic Gradient Descent optimizer
model.train()  # Set model to training mode

MultiLayerNN(
  (relu): ReLU()
  (fc1): Linear(in_features=16, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=1, bias=True)
)

In [56]:
# Training loop
for epoch in range(epochs):
    training_losses = []  # Track loss for each epoch
    for batch in dataloader_train:
        x, y = batch[0], batch[1]  # Extract features and target from batch
        optimizer.zero_grad()  # Reset gradients
        pred = model(x)  # Forward pass
        loss = criterion(pred, y)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        training_losses.append(loss.item())  # Store loss for the epoch
    if (epoch + 1) % 50 == 0:  # Print loss every 50 epochs
        print(f"Training loss of Epoch {epoch + 1}: {np.mean(training_losses)}")

Training loss of Epoch 50: 1.5728729182077825
Training loss of Epoch 100: 1.5014666848007951
Training loss of Epoch 150: 1.4636295957533367
Training loss of Epoch 200: 1.4491577288818416
Training loss of Epoch 250: 1.4395851522707581
Training loss of Epoch 300: 1.4315606250865802
Training loss of Epoch 350: 1.423676493208506
Training loss of Epoch 400: 1.4175708467291979
Training loss of Epoch 450: 1.4134946766862069
Training loss of Epoch 500: 1.4106774448679131
Training loss of Epoch 550: 1.4045877663449475
Training loss of Epoch 600: 1.4028247452691092
Training loss of Epoch 650: 1.4021198511699038
Training loss of Epoch 700: 1.3991649419955845
Training loss of Epoch 750: 1.3997845344122137
Training loss of Epoch 800: 1.3979827886707112
Training loss of Epoch 850: 1.3972070839305955
Training loss of Epoch 900: 1.3946481414779235
Training loss of Epoch 950: 1.3938441402722852
Training loss of Epoch 1000: 1.393873757906581


In [48]:
# Evaluate model on training data
with torch.no_grad():  # No gradient calculation during evaluation
    y_true = []  # Store true values
    y_pred = []  # Store predicted values
    model.eval()  # Set model to evaluation mode
    for batch in dataloader_train:
        x, y = batch[0], batch[1]
        pred = model(x)  # Forward pass
        y_true.extend(y.numpy())  # Append true values
        y_pred.extend(pred.numpy())  # Append predicted values

y_true = np.array(y_true)  # Convert to numpy array
y_pred = np.array(y_pred)  # Convert to numpy array
mse = mean_squared_error(y_true, y_pred)  # Compute Mean Squared Error
print(f"Training MSE: {mse}")

1.3491306


In [47]:
# Evaluate model on testing data
with torch.no_grad():
    y_true = []  # Store true values
    y_pred = []  # Store predicted values
    model.eval()
    for batch in dataloader_test:
        x, y = batch[0], batch[1]
        pred = model(x)  # Forward pass
        y_true.extend(y.numpy())  # Append true values
        y_pred.extend(pred.numpy())  # Append predicted values

y_true = np.array(y_true)  # Convert to numpy array
y_pred = np.array(y_pred)  # Convert to numpy array
mse = mean_squared_error(y_true, y_pred)  # Compute Mean Squared Error
print(f"Testing MSE: {mse}")

1.8955426
