In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the dataset
df = pd.read_csv('rental_info.csv')

In [3]:
# Convert date columns to datetime format
df['rental_date'] = pd.to_datetime(df['rental_date'])
df['return_date'] = pd.to_datetime(df['return_date'])

In [4]:
# Calculate the rental length in days
df['rental_lenght_days'] = (df['return_date'] - df['rental_date']).dt.days

In [5]:
# Create dummy variables for special features
df["deleted_scenes"] = np.where(df["special_features"].str.contains("Deleted Scenes"), 1, 0)
df["behind_the_scenes"] = np.where(df["special_features"].str.contains("Behind the Scenes"), 1, 0)

In [6]:
# Drop unnecessary columns
df = df.drop(columns=['rental_date', 'return_date', 'special_features'])

In [7]:
# Separate features and target variable
X = df.drop(columns='rental_lenght_days', axis=1)
y = df['rental_lenght_days']

In [8]:
# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=9)

In [10]:
# Import necessary regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [11]:
# Define the parameter grid for each model
model_parametrs = {
    'LinearRegression': {},
    'Ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10]
    },
    'Lasso': {
        'alpha': [0.001, 0.01, 0.1, 1, 10]
    },
    'DecisionTreeRegressor': {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [None, 10, 20, 30]
    },
    'GradientBoostingRegressor': {
        'n_estimators': [50, 100, 150, 200],
        'learning_rate': [0.01, 0.1, 0.5]
    }
}

In [12]:
# Define the models
models = {
    'Ridge': Ridge(),
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
}

In [14]:
from sklearn.model_selection import GridSearchCV
# List to store the results of the grid search
models_list = []

# Perform grid search for each model
for model_name, model in models.items():
    print(f"Searching for {model_name}")
    param_grid = model_parametrs[model_name]
    
    grid_search = GridSearchCV(model, param_grid, cv=4, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_mse = -grid_search.best_score_  # Take negative because GridSearchCV returns negative MSE
    
    models_list.append({'Model': model_name, 'Best Params': best_params, 'Best MSE': best_mse})

# Create a DataFrame to display the results
models_df = pd.DataFrame(models_list)
print(models_df.head())

Searching for Ridge
Searching for LinearRegression
Searching for Lasso
Searching for DecisionTreeRegressor
Searching for RandomForestRegressor
Searching for GradientBoostingRegressor
                   Model                                Best Params  Best MSE
0                  Ridge                               {'alpha': 1}  2.849350
1       LinearRegression                                         {}  2.849354
2                  Lasso                           {'alpha': 0.001}  2.849414
3  DecisionTreeRegressor  {'max_depth': 30, 'min_samples_split': 2}  2.305103
4  RandomForestRegressor     {'max_depth': 20, 'n_estimators': 200}  2.088894


In [15]:
# Re-initialize models with the best parameters found
models = {
    'Ridge': Ridge(alpha=models_df.loc[models_df['Model'] == 'Ridge', 'Best Params'].values[0]['alpha']),
    'LinearRegression': LinearRegression(),  # No parameters to set for Linear Regression
    'Lasso': Lasso(alpha=models_df.loc[models_df['Model'] == 'Lasso', 'Best Params'].values[0]['alpha']),
    'DecisionTreeRegressor': DecisionTreeRegressor(
        max_depth=models_df.loc[models_df['Model'] == 'DecisionTreeRegressor', 'Best Params'].values[0]['max_depth'],
        min_samples_split=models_df.loc[models_df['Model'] == 'DecisionTreeRegressor', 'Best Params'].values[0]['min_samples_split']
    ),
    'RandomForestRegressor': RandomForestRegressor(
        max_depth=models_df.loc[models_df['Model'] == 'RandomForestRegressor', 'Best Params'].values[0]['max_depth'],
        n_estimators=models_df.loc[models_df['Model'] == 'RandomForestRegressor', 'Best Params'].values[0]['n_estimators']
    ),
    'GradientBoostingRegressor': GradientBoostingRegressor(
        n_estimators=models_df.loc[models_df['Model'] == 'GradientBoostingRegressor', 'Best Params'].values[0]['n_estimators'],
        learning_rate=models_df.loc[models_df['Model'] == 'GradientBoostingRegressor', 'Best Params'].values[0]['learning_rate']
    )
}


In [16]:
# Fit models and evaluate on test data
best_mse = float('inf')
best_model = None

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    print(f'{model_name} MSE: {mse}')
    
    if mse < best_mse:
        best_mse = mse
        best_model = model

# Output the best model and its MSE
print(f'Best Model: {best_model}')
print(f'Best MSE: {best_mse}')

Ridge MSE: 2.941870616734153
LinearRegression MSE: 2.9417238646975976
Lasso MSE: 2.9417116642920518
DecisionTreeRegressor MSE: 2.165347833153885
RandomForestRegressor MSE: 2.031285398706584
GradientBoostingRegressor MSE: 2.06977259149959
Best Model: RandomForestRegressor(max_depth=20, n_estimators=200)
Best MSE: 2.031285398706584
