# **Use regression models to predict the number of days a customer rents DVDs for**

## **Pre-process the data**

### 1. Import libraries

In [41]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from utils.utils import draw_stars_line


### 2. Read Data with pandas

In [42]:
# Read the DVD rental records from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='dvd_rental_infos.csv')
# Preview the first record to confirm the file was parsed as expected
first_row = data.iloc[:1]
print(first_row)

                 rental_date                return_date  amount  release_year  \
0  2005-05-25 02:54:33+00:00  2005-05-28 23:40:33+00:00    2.99        2005.0   

   rental_rate  length  replacement_cost                special_features  \
0         2.99   126.0             16.99  {Trailers,"Behind the Scenes"}   

   NC-17  PG  PG-13  R  amount_2  length_2  rental_rate_2  
0      0   0      0  1    8.9401   15876.0         8.9401  


In [43]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Draw separation line
draw_stars_line()

# Count the columns that contain at least one missing value (0 means none)
print(data.isnull().any().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15861 entries, 0 to 15860
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rental_date       15861 non-null  object 
 1   return_date       15861 non-null  object 
 2   amount            15861 non-null  float64
 3   release_year      15861 non-null  float64
 4   rental_rate       15861 non-null  float64
 5   length            15861 non-null  float64
 6   replacement_cost  15861 non-null  float64
 7   special_features  15861 non-null  object 
 8   NC-17             15861 non-null  int64  
 9   PG                15861 non-null  int64  
 10  PG-13             15861 non-null  int64  
 11  R                 15861 non-null  int64  
 12  amount_2          15861 non-null  float64
 13  length_2          15861 non-null  float64
 14  rental_rate_2     15861 non-null  float64
dtypes: float64(8), int64(4), object(3)
memory usage: 1.8+ MB
None


***********************

1. Some columns have an unsuitable data type: `rental_date` and `return_date` are stored as strings (`object`), and `release_year` is stored as `float64`.
2. There are no missing values in any column.

### 3. Create a column named `rental_length_days` using the columns `return_date` and `rental_date`

#### a. Convert `rental_date` and `return_date` to DateTime

In [44]:
# Convert 'rental_date' and 'return_date' from strings to datetimes
rental_date_col = 'rental_date'
return_date_col = 'return_date'
data[rental_date_col] = pd.to_datetime(data[rental_date_col])
data[return_date_col] = pd.to_datetime(data[return_date_col])
# Confirm the new dtypes. info() prints its report and returns None, so it is
# called directly rather than through print() (which would add a "None" line).
data[[rental_date_col, return_date_col]].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15861 entries, 0 to 15860
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   rental_date  15861 non-null  datetime64[ns, UTC]
 1   return_date  15861 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2)
memory usage: 248.0 KB
None


#### b. Add the `rental_length_days` column

In [45]:
# Derive 'rental_length_days': whole days elapsed between rental and return
rental_length_days_col = 'rental_length_days'
rental_duration = data[return_date_col].sub(data[rental_date_col])
data[rental_length_days_col] = rental_duration.dt.days
# Show both dates next to the derived duration for the first record
preview_cols = [rental_date_col, return_date_col, rental_length_days_col]
print(data[preview_cols].head(1))

                rental_date               return_date  rental_length_days
0 2005-05-25 02:54:33+00:00 2005-05-28 23:40:33+00:00                   3


### 4. Create two columns of dummy variables from `special_features` which takes the value of 1 when:
#### - The value is `Deleted Scenes`, storing as a column called `deleted_scenes`.
#### - The value is `Behind the Scenes`, storing as a column called `behind_the_scenes`

In [46]:
# Names of the source column and of the two dummy columns derived from it
special_features_col, deleted_col, behind_the_scenes_col = (
    'special_features',
    'deleted_scenes',
    'behind_the_scenes',
)
# List every distinct special-features combination present in the data
distinct_features = data[special_features_col].unique()
print(distinct_features)

['{Trailers,"Behind the Scenes"}' '{Trailers}'
 '{Commentaries,"Behind the Scenes"}' '{Trailers,Commentaries}'
 '{"Deleted Scenes","Behind the Scenes"}'
 '{Commentaries,"Deleted Scenes","Behind the Scenes"}'
 '{Trailers,Commentaries,"Deleted Scenes"}' '{"Behind the Scenes"}'
 '{Trailers,"Deleted Scenes","Behind the Scenes"}'
 '{Commentaries,"Deleted Scenes"}' '{Commentaries}'
 '{Trailers,Commentaries,"Behind the Scenes"}'
 '{Trailers,"Deleted Scenes"}' '{"Deleted Scenes"}'
 '{Trailers,Commentaries,"Deleted Scenes","Behind the Scenes"}']


#### a. `deleted_scenes` Column

In [47]:
# Add deleted_scenes column: 1 when the feature list mentions 'Deleted Scenes', else 0.
# regex=False matches the label literally. na=False maps a missing
# special_features value to False; otherwise the missing result would be
# treated as truthy by np.where and the row would wrongly be flagged as 1.
has_deleted_scenes = data[special_features_col].str.contains('Deleted Scenes', regex=False, na=False)
data[deleted_col] = np.where(has_deleted_scenes, 1, 0)

#### b. `behind_the_scenes` Column

In [48]:
# Add behind_the_scenes column: 1 when the feature list mentions 'Behind the Scenes', else 0.
# regex=False matches the label literally. na=False maps a missing
# special_features value to False; otherwise the missing result would be
# treated as truthy by np.where and the row would wrongly be flagged as 1.
has_behind_the_scenes = data[special_features_col].str.contains('Behind the Scenes', regex=False, na=False)
data[behind_the_scenes_col] = np.where(has_behind_the_scenes, 1, 0)

In [49]:
# Preview the two new dummy columns for the first five records
dummy_cols = [deleted_col, behind_the_scenes_col]
print(data.loc[:, dummy_cols].head())


   deleted_scenes  behind_the_scenes
0               0                  1
1               0                  1
2               0                  1
3               0                  1
4               0                  1


### 5. Make Features and Target variables
#### - X: containing all the appropriate features you can use to run the regression models, avoiding columns that leak data about the target.
#### - y: Choose the "rental_length_days" column as the target and save it as a pandas Series called `y`.

#### a. `Features variables`

In [50]:
# List every column currently available before choosing the features
print(data.columns)

# Draw separation line
draw_stars_line()

# Exclude the two dates (the target is computed from them), the unparsed
# special_features text, and the target column itself
col_to_drop = [rental_date_col, return_date_col, special_features_col, rental_length_days_col]
feature_frame = data.drop(col_to_drop, axis=1)
# Features variables X as a NumPy array
X = feature_frame.to_numpy()
print("Shape of X:", X.shape)


Index(['rental_date', 'return_date', 'amount', 'release_year', 'rental_rate',
       'length', 'replacement_cost', 'special_features', 'NC-17', 'PG',
       'PG-13', 'R', 'amount_2', 'length_2', 'rental_rate_2',
       'rental_length_days', 'deleted_scenes', 'behind_the_scenes'],
      dtype='object')


**************************************************

Shape of X: (15861, 14)


#### b. `Target variable`

In [51]:
# Target variable y, kept as a pandas Series as the task statement asks.
# Selecting a single column already yields a Series; the former `.values`
# call converted it to a NumPy array instead.
y = data[rental_length_days_col]
print("Shape of y:", y.shape)

Shape of y: (15861,)


### 6. Make prediction using Regression models


#### a. Split the data into training and test sets, with 20% of the data in the test set and a random state of 9

In [52]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
split_options = {'test_size': 0.2, 'random_state': 9}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


#### b. Import Regression models libraries

In [None]:
# import LinearRegression
from sklearn.linear_model import LinearRegression
# import Lasso
from sklearn.linear_model import Lasso
# import Ridge
from sklearn.linear_model import Ridge
# import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
# import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# import necessary libraries for evaluation
from sklearn.metrics import root_mean_squared_error

#### c. Create and evaluate models

In [58]:
# Candidate regressors keyed by display name (dict insertion order is preserved)
candidate_models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(alpha=0.1),
    'Ridge Regression': Ridge(alpha=0.1),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=9),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, random_state=9),
}
# Expose them as a list of (name, estimator) pairs
models = list(candidate_models.items())

# Evaluate each model
def evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Pairs of a display name and an object exposing ``fit(X, y)`` and
        ``predict(X)``. Each estimator is fitted in place.
    X_train, y_train
        Training features and targets, passed unchanged to ``fit``.
    X_test
        Test features, passed unchanged to ``predict``.
    y_test : array-like
        True test targets; may be 1-D or a single-column 2-D array.

    Returns
    -------
    list of (str, float)
        One ``(name, mean squared error)`` pair per model, in input order.

    Raises
    ------
    ValueError
        If a model's predictions and ``y_test`` contain a different number
        of values.
    """
    # Flatten the targets once so that a column vector of shape (n, 1) cannot
    # broadcast against (n,) predictions into an (n, n) matrix of differences.
    y_true = np.asarray(y_test).ravel()
    results = []
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = np.asarray(model.predict(X_test)).ravel()
        if y_pred.shape != y_true.shape:
            raise ValueError(
                f"{name}: got {y_pred.size} predictions for {y_true.size} test targets"
            )
        mse = np.mean((y_pred - y_true) ** 2)
        results.append((name, mse))
    return results

# Score every candidate model on the held-out test set
results = evaluate_models(models, X_train, y_train, X_test, y_test)

# Tabulate the scores and order them from lowest to highest error
mse_column = 'Mean Squared Error'
results_df = pd.DataFrame(results, columns=['Model', mse_column]).sort_values(by=mse_column, ascending=True)
# Display the results
print(results_df)



                         Model  Mean Squared Error
3      Random Forest Regressor            2.030142
4  Gradient Boosting Regressor            2.425346
0            Linear Regression            2.941724
2             Ridge Regression            2.941727
1             Lasso Regression            3.118637


#### d. Plot each model's mean squared error

In [None]:
# Plot the results
fig, ax = plt.subplots(figsize=(10, 6))
# Colour the bars through `hue`: seaborn 0.13 deprecates passing `palette`
# without a `hue` variable. dodge=False keeps one full-width bar per model.
sns.barplot(
    x='Mean Squared Error',
    y='Model',
    hue='Model',
    data=results_df,
    palette='viridis',
    dodge=False,
    ax=ax,
)
# The hue repeats the y-axis labels, so remove the legend if one was drawn
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title('Model Comparison: Mean Squared Error')
ax.set_xlabel('Mean Squared Error')
ax.set_ylabel('Model')
fig.tight_layout()
plt.show()


#### e. Model yielding a mean squared error (MSE) less than 3 on the test set:
- `best_model` and
- `best_mse`

In [None]:
# Select the row with the lowest test-set MSE by value, so the result does not
# depend on results_df having been sorted by an earlier cell
best_row = results_df.loc[results_df['Mean Squared Error'].idxmin()]

# Best model (its display name)
best_model = best_row['Model']
print(f"The best model is: {best_model}")

# Best Mean Squared Error
best_mse = best_row['Mean Squared Error']
print(f"Best Mean Squared Error: {best_mse}")

The best model is: Random Forest Regressor
Best Mean Squared Error: 2.030141907417274
