In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

df_rental = pd.read_csv('../../assets/rental_info.csv')
df_rental.head()

Unnamed: 0,rental_date,return_date,amount,release_year,rental_rate,length,replacement_cost,special_features,NC-17,PG,PG-13,R,amount_2,length_2,rental_rate_2
0,2005-05-25 02:54:33+00:00,2005-05-28 23:40:33+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
1,2005-06-15 23:19:16+00:00,2005-06-18 19:24:16+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
2,2005-07-10 04:27:45+00:00,2005-07-17 10:11:45+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
3,2005-07-31 12:06:41+00:00,2005-08-02 14:30:41+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
4,2005-08-19 12:30:04+00:00,2005-08-23 13:35:04+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401


In [7]:
### Preprocessing
df_rental.isna().sum() # no missing values

# Count number of rental days
df_rental['return_date'] = pd.to_datetime(df_rental['return_date'])
df_rental['rental_date'] = pd.to_datetime(df_rental['rental_date'])
df_rental['rental_length_days'] = (df_rental['return_date'] - df_rental['rental_date']).dt.days
df_rental.drop(['rental_date', 'return_date'], axis=1, inplace=True)

# Replace str feature by dummy variables
df_rental["deleted_scenes"] =  np.where(df_rental["special_features"].str.contains("Deleted Scenes"), 1, 0)
df_rental["behind_the_scenes"] =  np.where(df_rental["special_features"].str.contains("Behind the Scenes"), 1, 0)
df_rental.drop('special_features', axis=1, inplace=True)

# Create features and labels
X = pd.DataFrame(data=df_rental.drop('rental_length_days', axis=1))
y = pd.Series(data=df_rental['rental_length_days'])

### Intantiating the models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
lr = LinearRegression()
rf = RandomForestRegressor()
dt = DecisionTreeRegressor()

models = [lr, rf, dt]
best_model = ''
best_mse = 999
for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    mse = MSE(y_test, y_pred)
    if mse < best_mse:
        best_mse = mse
        best_model = m

print(best_model, best_mse)

RandomForestRegressor() 2.028222336017316
