# Step 1: Setup env, SQL connection and analyze SQL database data

Import necessary libraries when needed and establish connection to SQL database (noshow.db)

In [None]:
# Import libraries that are currently needed
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set path to SQL database (relative path, resolved against the current working directory)
db_path = "../data/noshow.db"

# Create connection to SQL database; the same connection object is reused by the queries below
conn = sqlite3.connect(db_path)

# Set pandas options for better readability
pd.set_option('display.max_columns', None) # Display all columns in DataFrame
pd.set_option('display.max_rows', 100) # Limit number of rows displayed

# Setup matplotlib and seaborn for inline visualization
# (%matplotlib is an IPython magic, so this cell only runs inside a notebook/IPython session)
%matplotlib inline
sns.set(style = "whitegrid")

Explore database structure by listing all available tables before further actions

In [None]:
# Ask sqlite_master for the name of every table stored in the database
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql(sql=query, con=conn)

# Render the table list as the cell output
tables

Since there is only 'noshow' table in the database, the first few rows can be previewed to understand its structure

In [None]:
# Preview first 10 rows of 'noshow' table (LIMIT keeps the query cheap)
noshow_query = "SELECT * FROM noshow LIMIT 10;"
df_noshow = pd.read_sql(noshow_query, conn)

# Display all 10 fetched rows; head() without an argument would only show 5
df_noshow.head(10)

The structure of 'noshow' table consists of following columns:
- booking_id (Unique value)
- no_show
- branch
- booking_month
- arrival_month
- arrival_day
- checkout_month
- checkout_day
- country
- first_time
- room (Drop missing values)
- price (Drop missing values)
- platform
- num_adults
- num_children

The schema of 'noshow' table is retrieved to understand the columns and their data types

In [None]:
# PRAGMA table_info returns one row per column of 'noshow' (name, declared type, ...)
schema_query = "PRAGMA table_info(noshow);"
schema_df = pd.read_sql(sql=schema_query, con=conn)

# Render the schema as the cell output
schema_df

From the schema, some columns can be seen to hold TEXT data, so they will have to be converted to numeric values (for example with one-hot encoding) before they can be used for modelling

# Step 2: Perform Exploratory Data Analysis (EDA) on 'noshow' table

Load all the data into a DataFrame to start data analysis

In [None]:
# Pull every row of the 'noshow' table into a DataFrame for analysis
noshow_data_query = "SELECT * FROM noshow;"
noshow_data_df = pd.read_sql_query(sql=noshow_data_query, con=conn)

Use booking_id to eliminate duplicate data as each row should be unique

In [None]:
# Keep the first occurrence of every booking_id and discard later repeats
# (keeping the first match is the drop_duplicates default)
noshow_data_df = noshow_data_df.drop_duplicates(subset = ["booking_id"])
noshow_data_df

Drop booking_id column after removing duplicates as each row is now unique

In [None]:
# booking_id was only needed for de-duplication; drop it into a new DataFrame
drop_col_noshow_data_df = noshow_data_df.drop(columns = ["booking_id"])

Clean-up rows with missing cell info <br>
Example: If cell in row has missing value, then row should be dropped

Firstly, check which columns have missing data

In [None]:
# Count the missing cells in every column
missing_data = drop_col_noshow_data_df.isna().sum()

# Report only the columns that actually contain gaps
print("Missing values in each column: ")
print(missing_data.loc[missing_data.gt(0)])

~~Next, drop all rows that have missing values~~ <br>
Use median/mode to replace missing values <br>
Only drop row if all values are missing

In [None]:
# Drop only the rows in which every cell is missing; partially filled rows are kept
# and imputed instead. The explicit copy() makes the result an independent DataFrame,
# so the column assignments below cannot trigger pandas' chained-assignment warning.
cleaned_noshow_data_df = drop_col_noshow_data_df.dropna(how="all").copy()

# 'room' is categorical: mark missing entries with an explicit "Unknown" category
cleaned_noshow_data_df["room"] = cleaned_noshow_data_df["room"].fillna("Unknown")

# Show the remaining missing values per column after the clean-up above
# (any column still listed here has to be handled in a later step)
print("Missing values after dropping rows: ")
print(cleaned_noshow_data_df.isnull().sum())

Do manual data conversion for price and num_adults
- price (Convert all price to SGD$ and remove SGD$)
- num_adults (Convert string value to its integer value)

In [None]:
# Assume USD$ to SGD$ conversion rate is 1:1.34
# Remove all USD$/SGD$ tags and create a new column called sgd_price
def convert_price(price, usd_to_sgd_rate=1.34):
    """
    Convert a raw price string into a numeric amount in SGD.

    Parameters:
        price: Raw cell value such as "USD$ 100.50" or "SGD$ 80". Anything that
            is not a string (None, NaN, numbers) is treated as missing.
        usd_to_sgd_rate: Multiplier applied to USD amounts (defaults to 1.34).

    Returns:
        The amount in SGD as a float, or None when `price` is not a string.

    Raises:
        ValueError: If the text left after removing the currency tag is not a number.
    """
    if not isinstance(price, str):
        return None

    # Thousands separators would make float() fail, so strip them up front
    text = price.replace(",", "")

    if "USD$" in text:
        # Remove only the tag itself; float() ignores surrounding whitespace,
        # so both "USD$ 100" and "USD$100" are accepted
        return float(text.replace("USD$", "")) * usd_to_sgd_rate

    # Everything else is treated as already being in SGD
    return float(text.replace("SGD$", ""))

# Convert every raw price to a numeric SGD amount; unparsable/missing cells come back as NaN
sgd_price = cleaned_noshow_data_df["price"].apply(convert_price)

# Impute the gaps with the median of the converted prices and store the result as 'sgd_price'
cleaned_noshow_data_df["sgd_price"] = sgd_price.fillna(sgd_price.median())

# The raw 'price' text is no longer needed
cleaned_noshow_data_df = cleaned_noshow_data_df.drop(columns=["price"])

# Display DataFrame after the price conversion
cleaned_noshow_data_df

In [None]:
# Convert the values in num_adults to real numbers
# Inspect the raw values first to see which spelled-out numbers have to be mapped
print("Unique values in 'num_adults' column - ")
print(cleaned_noshow_data_df["num_adults"].unique())

# There are only 'one' and 'two' string values in the column
number_mapping = {
    'one': 1,
    'two': 2
}

# Replace the spelled-out numbers, leave every other value untouched, then make the
# whole column numeric so it can be plotted and summarised as a number.
# pd.to_numeric raises on any word that is not covered by number_mapping, which
# surfaces unexpected values instead of silently keeping them as text.
cleaned_noshow_data_df["num_adults"] = pd.to_numeric(
    cleaned_noshow_data_df["num_adults"].map(lambda value: number_mapping.get(value, value))
)

# Display DataFrame post mapping
cleaned_noshow_data_df

Do extra mapping to factor in seasons

In [None]:
# Group the month names by season, then invert the grouping into a month -> season lookup
months_by_season = {
    "winter": ("december", "january", "february"),
    "spring": ("march", "april", "may"),
    "summer": ("june", "july", "august"),
    "autumn": ("september", "october", "november"),
}
season_mapping = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}

# Lower-case both month columns so they match the lookup keys
for month_col in ("arrival_month", "checkout_month"):
    cleaned_noshow_data_df[month_col] = cleaned_noshow_data_df[month_col].str.lower()

# The season of a trip is derived from the arrival month only
cleaned_noshow_data_df["trip_season"] = cleaned_noshow_data_df["arrival_month"].map(season_mapping)

cleaned_noshow_data_df

Do extra mapping to convert arrival_month, arrival_day, checkout_month and checkout_day to number_of_days_stayed

In [None]:
# Lookup from lower-case month name to month number
month_mapping = {
    "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
    "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
}

# The keys are lower-case, so the month columns must already be lower-cased here.
# A name that is not in the mapping becomes NaN and makes astype(int) below fail loudly.
arrival_month_num = cleaned_noshow_data_df["arrival_month"].map(month_mapping)
checkout_month_num = cleaned_noshow_data_df["checkout_month"].map(month_mapping)

# The data carries no year, so a fixed arrival year is assumed; a checkout month that
# is earlier than the arrival month means the stay rolled over into the next year
arrival_year = 2024
checkout_year = arrival_year + (checkout_month_num < arrival_month_num).astype(int)

# Assemble the dates column-wise instead of formatting and re-parsing one string per row.
# abs() mirrors the original handling of negative day values.
arrival_date = pd.to_datetime(pd.DataFrame({
    "year": arrival_year,
    "month": arrival_month_num.astype(int),
    "day": cleaned_noshow_data_df["arrival_day"].abs().astype(int),
}))
checkout_date = pd.to_datetime(pd.DataFrame({
    "year": checkout_year,
    "month": checkout_month_num.astype(int),
    "day": cleaned_noshow_data_df["checkout_day"].abs().astype(int),
}))

cleaned_noshow_data_df["stayed_num_days"] = (checkout_date - arrival_date).dt.days

# Only the raw date parts have to be removed; the intermediate values above were
# never added to the DataFrame
drop_date_col_list = ["arrival_month", "arrival_day", "checkout_month", "checkout_day"]
cleaned_noshow_data_df = cleaned_noshow_data_df.drop(columns=drop_date_col_list)

cleaned_noshow_data_df

Analyze distribution of categorical variables

In [None]:
# Categorical columns to inspect, one count plot per column
cat_col_list = ["branch", "country", "first_time", "room", "platform"]

for col in cat_col_list:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=col, data=cleaned_noshow_data_df, ax=ax)
    ax.set_title(f"Distribution of {col}")
    # Tilt the category labels so long names do not overlap
    plt.xticks(rotation=45)
    plt.show()

Analyze distribution and relationship of numerical variables

In [None]:
# Numerical columns to inspect, one histogram (with KDE overlay) per column
num_col_list = ["sgd_price", "num_adults", "num_children"]

for col in num_col_list:
    fig, ax = plt.subplots(figsize=(10, 5))
    # Missing values are removed before plotting
    values = cleaned_noshow_data_df[col].dropna()
    sns.histplot(values, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()

Label-encode the target (no_show), standardize the continuous features and one-hot encode the categorical features so that every column is numeric

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Continuous features to standardize vs. features to expand into dummy columns
standard_list = ["sgd_price", "stayed_num_days"]
one_hot_list = ["branch", "booking_month", "country", "first_time", "room", "platform", "trip_season", "num_adults", "num_children"]

# Encode the target column as integer class labels
lab_enc = LabelEncoder()
cleaned_noshow_data_df["no_show"] = lab_enc.fit_transform(cleaned_noshow_data_df["no_show"])

# Standardize the continuous features
# NOTE(review): the scaler is fitted on every row of the DataFrame; if a hold-out split
# is made afterwards, its statistics include the hold-out rows - confirm this is acceptable
scaler = StandardScaler()
scaled_values = scaler.fit_transform(cleaned_noshow_data_df[standard_list])
cleaned_noshow_data_df[standard_list] = scaled_values

# One-hot encode the categorical features, dropping the first level of each
dummy_df = pd.get_dummies(cleaned_noshow_data_df, columns = one_hot_list, drop_first=True)

# Cast any boolean dummy columns to integers
bool_col = dummy_df.select_dtypes(include=["bool"]).columns
encoded_noshow_data_df = dummy_df.astype({name: int for name in bool_col})
encoded_noshow_data_df

It can now be used to generate summary statistics to check on mean, median, 25%, etc

In [None]:
# Compute descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the encoded DataFrame
summary_stats = encoded_noshow_data_df.describe()

# Display summary statistics as the cell output
summary_stats

# Step 3: Analyze the patterns and distributions in 'noshow' DataFrame

Plot heatmap for dimension reduction visualization

In [None]:
# Pairwise correlation between all encoded columns
corr_matrix = encoded_noshow_data_df.corr()

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize = (10, 8)) # Adjust size as needed
sns.heatmap(corr_matrix, annot = True, cmap = "coolwarm", fmt = ".2f", linewidths=0.5, ax = ax)
ax.set_title("Correlation matrix heatmap")
fig.tight_layout()
plt.show()

# Correlation of every column with the target
no_show_corr = corr_matrix["no_show"]

# Rank by magnitude so strong positive and strong negative correlations come first
sorted_corr = no_show_corr.abs().sort_values(ascending=False)

# Print numerical correlation values
print("Correlation with 'no_show': ")
print(sorted_corr)

Drop features whose absolute correlation with 'no_show' is below 0.05

In [None]:
# Columns whose absolute correlation with 'no_show' falls below the 0.05 threshold
drop_cols = sorted_corr.loc[sorted_corr.lt(0.05)].index

# Remove those weakly correlated columns from the encoded data
final_noshow_data_df = encoded_noshow_data_df.drop(columns = drop_cols)
final_noshow_data_df

Check correlation matrix again

In [None]:
# Pairwise correlation between the columns that survived the feature drop
final_corr_matrix = final_noshow_data_df.corr()

# Draw the reduced matrix as an annotated heatmap
fig, ax = plt.subplots(figsize = (10, 8)) # Adjust size as needed
sns.heatmap(final_corr_matrix, annot = True, cmap = "coolwarm", fmt = ".2f", linewidths=0.5, ax = ax)
ax.set_title("Final correlation matrix heatmap")
fig.tight_layout()
plt.show()

# Correlation of every remaining column with the target
final_no_show_corr = final_corr_matrix["no_show"]

# Rank by magnitude so strong positive and strong negative correlations come first
final_sorted_corr = final_no_show_corr.abs().sort_values(ascending=False)

# Print numerical correlation values
print("Correlation with 'no_show': ")
print(final_sorted_corr)

Testing ML for chosen features in final_noshow_data_df

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the 'no_show' target
X = final_noshow_data_df.drop(columns = ["no_show"])
Y = final_noshow_data_df["no_show"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    random_state = 42,
)

Check original class distribution in training set

In [None]:
# Count how many training rows belong to each class before any resampling
original_class_counts = pd.Series(Y_train).value_counts()
print("Original class distribution in training set:")
print(original_class_counts)

Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance dataset

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched
smote = SMOTE(sampling_strategy="auto", random_state=42)
resampled_training_data = smote.fit_resample(X_train, Y_train)
resampled_X_train, resampled_Y_train = resampled_training_data

Check new class distribution after SMOTE

In [None]:
# Count how many training rows belong to each class after oversampling
resampled_class_counts = pd.Series(resampled_Y_train).value_counts()
print("Class distribution after SMOTE:")
print(resampled_class_counts)

In [None]:
from sklearn.preprocessing import LabelEncoder

lab_enc = LabelEncoder()

# Learn the label -> integer mapping from the training labels only ...
resampled_Y_train = lab_enc.fit_transform(resampled_Y_train)

# ... and apply that same mapping to the test labels. Re-fitting on the test labels
# could assign different integers when the test split does not contain exactly the
# same set of classes; transform() raises instead if it meets an unseen label.
Y_test = lab_enc.transform(Y_test)

Start with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Baseline: default Logistic Regression trained on the SMOTE-balanced training data
log_model = LogisticRegression()
log_model.fit(resampled_X_train, resampled_Y_train)

# Score the untouched test split
log_Y_predict = log_model.predict(X_test)
log_confuse_matrix = confusion_matrix(y_true=Y_test, y_pred=log_Y_predict)
log_class_rpt = classification_report(y_true=Y_test, y_pred=log_Y_predict)

print("Pre-tuned Logistic Regression - ")
print(f"Confusion Matrix: \n{log_confuse_matrix}")
print(f"Classification Report: \n{log_class_rpt}")

Tune Logistic Regression for better results

In [None]:
def tune_n_eval_log_regression(X_train, Y_train, X_test, Y_test, search_method = "grid", param_grid = None, param_dist = None, random_iter = 50, cv = 5, num_jobs = 4, scoring = "accuracy"):
    """
    Tune a Logistic Regression classifier with a hyperparameter search and evaluate it on a test set.

    Prints the best parameters, the confusion matrix and the classification report of the
    tuned model on the test set, and plots that confusion matrix.

    Parameters:
        X_train: Training features (DataFrame or array).
        Y_train: Training labels, expected to be encoded as 0/1.
        X_test: Test features (DataFrame or array).
        Y_test: Test labels, encoded like Y_train.
        search_method: 'grid' for GridSearchCV, 'random' for RandomizedSearchCV.
        param_grid: Dictionary of hyperparameter ranges for GridSearchCV (a default grid is used when None).
        param_dist: Dictionary of hyperparameter distributions for RandomizedSearchCV (defaults are used when None).
        random_iter: Number of iterations for RandomizedSearchCV.
        cv: Number of cross-validation folds.
        num_jobs: Number of parallel jobs passed to the search (-1 uses all processors).
        scoring: Classification metric used to rank candidates. For 0/1 labels the previously
            hard-coded 'neg_mean_squared_error' equals accuracy - 1, so the 'accuracy' default
            ranks candidates the same way while allowing e.g. 'f1' or 'recall'.

    Returns:
        best_model: The tuned LogisticRegression model (refitted on the full training data).
        best_params: The best hyperparameters found.

    Raises:
        ValueError: If search_method is neither 'grid' nor 'random'.
    """
    # Validate first so a typo fails immediately, before any search space is built
    if search_method not in ("grid", "random"):
        raise ValueError("search_method must be either 'grid' or 'random'")

    # Imported locally so the function does not depend on which notebook cells ran before it
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

    if search_method == "grid":
        if param_grid is None:
            param_grid = {
                "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky"], # sag omitted due to non-convergence at current max_iter values
                "max_iter": [50, 75, 100, 125],
                "C": [0.1, 1, 5, 10],
                "class_weight": ["balanced", {0: 1, 1: 1.7}, {0: 1, 1: 5}]
            }
        search = GridSearchCV(
            LogisticRegression(random_state = 42),
            param_grid = param_grid,
            cv = cv,
            scoring = scoring,
            n_jobs = num_jobs
        )
    else:
        if param_dist is None:
            from scipy.stats import randint, uniform
            param_dist = {
                "solver": ["lbfgs", "liblinear", "sag", "newton-cg", "newton-cholesky"],
                "max_iter": randint(50, 200),
                # Continuous range [0.1, 20]: C must be strictly positive, and sampling through
                # the search's own random_state keeps the run reproducible
                "C": uniform(loc = 0.1, scale = 19.9),
                "class_weight": ["balanced", {0: 1, 1: 1.7}, {0: 1, 1: 5}]
            }
        search = RandomizedSearchCV(
            LogisticRegression(random_state = 42),
            param_distributions = param_dist,
            n_iter = random_iter,
            cv = cv,
            scoring = scoring,
            random_state = 42,
            n_jobs = num_jobs
        )

    # Fit the search
    print(f"Running {search_method.capitalize()} Search...")
    search.fit(X_train, Y_train)

    # Best model and parameters
    best_model = search.best_estimator_
    best_params = search.best_params_
    print(f"\nBest Parameters: {best_params}")

    # Test set evaluation
    tuned_log_Y_predict = best_model.predict(X_test)
    tuned_log_confuse_matrix = confusion_matrix(Y_test, tuned_log_Y_predict)
    tuned_log_class_rpt = classification_report(Y_test, tuned_log_Y_predict)
    print("Tuned Logistic Regression -")
    print(f"Tuned Set Confusion Matrix: \n{tuned_log_confuse_matrix}")
    print(f"Tuned Set Classification Report: \n{tuned_log_class_rpt}")

    # Plot the confusion matrix of the TUNED model (not of an earlier model's predictions).
    # Rows/columns follow the sorted labels 0, 1; assuming no_show == 1 marks a no-show,
    # label 0 is "Show" and label 1 is "No_Show".
    ConfusionMatrixDisplay(tuned_log_confuse_matrix, display_labels=["Show", "No_Show"]).plot(cmap="Blues")
    plt.show()

    return best_model, best_params

In [None]:
# Tune Logistic Regression with an exhaustive grid search (num_jobs = -1: use all processors)
grid_log_best_model, grid_log_best_param = tune_n_eval_log_regression(
    X_train = resampled_X_train,
    Y_train = resampled_Y_train,
    X_test = X_test,
    Y_test = Y_test,
    search_method = "grid",
    num_jobs = -1,
)

In [None]:
# Tune Logistic Regression with a randomized search (num_jobs = -1: use all processors)
rand_log_best_model, rand_log_best_param = tune_n_eval_log_regression(
    X_train = resampled_X_train,
    Y_train = resampled_Y_train,
    X_test = X_test,
    Y_test = Y_test,
    search_method = "random",
    num_jobs = -1,
)

Next, try with Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline Random Forest trained on the SMOTE-balanced training data.
# The seed is fixed (like every other model and search in this notebook) so the
# reported baseline does not change from run to run.
rand_for_model = RandomForestClassifier(random_state = 42)
rand_for_model.fit(resampled_X_train, resampled_Y_train)
rand_for_Y_predict = rand_for_model.predict(X_test)

# Score the untouched test split
rand_for_confuse_matrix = confusion_matrix(Y_test, rand_for_Y_predict)
rand_for_class_rpt = classification_report(Y_test, rand_for_Y_predict)

print("Pre-tuning Random Forest Classifier - ")
print(f"Confusion Matrix: \n{rand_for_confuse_matrix}")
print(f"Classification Report: \n{rand_for_class_rpt}")

Tune Random Forest for better results

In [None]:
def tune_n_eval_forest_classifier(X_train, Y_train, X_test, Y_test, search_method = "grid", param_grid = None, param_dist = None, random_iter = 50, cv = 5, num_jobs = 4, scoring = "accuracy"):
    """
    Tune a Random Forest classifier with a hyperparameter search and evaluate it on a test set.

    Prints the best parameters, the confusion matrix and the classification report of the
    tuned model on the test set, and plots that confusion matrix.

    Parameters:
        X_train: Training features (DataFrame or array).
        Y_train: Training labels, expected to be encoded as 0/1.
        X_test: Test features (DataFrame or array).
        Y_test: Test labels, encoded like Y_train.
        search_method: 'grid' for GridSearchCV, 'random' for RandomizedSearchCV.
        param_grid: Dictionary of hyperparameter ranges for GridSearchCV (a default grid is used when None).
        param_dist: Dictionary of hyperparameter distributions for RandomizedSearchCV (defaults are used when None).
        random_iter: Number of iterations for RandomizedSearchCV.
        cv: Number of cross-validation folds.
        num_jobs: Number of parallel jobs passed to the search (-1 uses all processors).
        scoring: Classification metric used to rank candidates. For 0/1 labels the previously
            hard-coded 'neg_mean_squared_error' equals accuracy - 1, so the 'accuracy' default
            ranks candidates the same way while allowing e.g. 'f1' or 'recall'.

    Returns:
        best_model: The tuned RandomForestClassifier model (refitted on the full training data).
        best_params: The best hyperparameters found.

    Raises:
        ValueError: If search_method is neither 'grid' nor 'random'.
    """
    # Validate first so a typo fails immediately, before any search space is built
    if search_method not in ("grid", "random"):
        raise ValueError("search_method must be either 'grid' or 'random'")

    # Imported locally so the function does not depend on which notebook cells ran before it
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

    if search_method == "grid":
        if param_grid is None:
            param_grid = {
                "n_estimators": [50, 100, 150, 200],
                "max_depth": [None, 5, 10],
                "max_features": ["sqrt", "log2", None],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "class_weight": ["balanced", {0: 1, 1: 1.7}, {0: 1, 1: 5}]
            }
        search = GridSearchCV(
            RandomForestClassifier(random_state = 42),
            param_grid = param_grid,
            cv = cv,
            scoring = scoring,
            n_jobs = num_jobs
        )
    else:
        if param_dist is None:
            from scipy.stats import randint
            param_dist = {
                "n_estimators": randint(100, 500),
                "max_depth": [None, 5, 10, 15],
                "max_features": ["sqrt", "log2", None],
                "min_samples_split": randint(2, 20),
                "min_samples_leaf": randint(1, 20)
            }
        search = RandomizedSearchCV(
            RandomForestClassifier(random_state = 42),
            param_distributions = param_dist,
            n_iter = random_iter,
            cv = cv,
            scoring = scoring,
            random_state = 42,
            n_jobs = num_jobs
        )

    # Fit the search
    print(f"Running {search_method.capitalize()} Search...")
    search.fit(X_train, Y_train)

    # Best model and parameters
    best_model = search.best_estimator_
    best_params = search.best_params_
    print(f"\nBest Parameters: {best_params}")

    # Test set evaluation
    tuned_rand_for_Y_predict = best_model.predict(X_test)
    tuned_rand_for_confuse_matrix = confusion_matrix(Y_test, tuned_rand_for_Y_predict)
    tuned_rand_for_class_rpt = classification_report(Y_test, tuned_rand_for_Y_predict)
    print("Tuned Random Forest Classification -")
    print(f"Tuned Set Confusion Matrix: \n{tuned_rand_for_confuse_matrix}")
    print(f"Tuned Set Classification Report: \n{tuned_rand_for_class_rpt}")

    # Plot the confusion matrix of the TUNED forest (not of another model's predictions).
    # Rows/columns follow the sorted labels 0, 1; assuming no_show == 1 marks a no-show,
    # label 0 is "Show" and label 1 is "No_Show".
    ConfusionMatrixDisplay(tuned_rand_for_confuse_matrix, display_labels=["Show", "No_Show"]).plot(cmap="Blues")
    plt.show()

    return best_model, best_params

Call tuning and evaluation function

In [None]:
# Temp removed due to long run time
## Using Grid Search
#grid_rand_for_best_model, grid_rand_for_best_param = tune_n_eval_forest_classifier(
#    resampled_X_train,
#    resampled_Y_train,
#    X_test,
#    Y_test,
#    search_method = "grid",
#    num_jobs = -1
#)

In [None]:
# Tune Random Forest with a randomized search (num_jobs = -1: use all processors)
rand_rand_for_best_model, rand_rand_for_best_param = tune_n_eval_forest_classifier(
    X_train = resampled_X_train,
    Y_train = resampled_Y_train,
    X_test = X_test,
    Y_test = Y_test,
    search_method = "random",
    num_jobs = -1,
)

Use XGBoost for variation

In [None]:
from xgboost import XGBClassifier

# Default XGBoost classifier (fixed seed) trained on the SMOTE-balanced training data
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(resampled_X_train, resampled_Y_train)

# Score the untouched test split
xgb_Y_predict = xgb_model.predict(X_test)
xgb_confuse_matrix = confusion_matrix(y_true=Y_test, y_pred=xgb_Y_predict)
xgb_class_rpt = classification_report(y_true=Y_test, y_pred=xgb_Y_predict)

print("XGBoost - ")
print(f"Confusion Matrix: \n{xgb_confuse_matrix}")
print(f"Classification Report: \n{xgb_class_rpt}")

Plot XGBoost's Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Build the matrix from the XGBoost predictions (not from the Logistic Regression ones)
xgb_conf_matrix = confusion_matrix(Y_test, xgb_Y_predict)

# Rows/columns follow the sorted labels 0, 1; assuming no_show == 1 marks a no-show,
# label 0 is "Show" and label 1 is "No_Show"
ConfusionMatrixDisplay(xgb_conf_matrix, display_labels=["Show", "No_Show"]).plot(cmap="Blues")
plt.show()

Check variance and range of target variable - no_show <br>
(Skipped for classifier)

In [None]:
## Calculate variance of target variable
#variance_no_show = Y_test.var() # Variance of actual target values
#print(f"Variance of 'no_show': {variance_no_show:.3f}")
#
## Calculate range of target variable
#range_no_show = Y_test.max() - Y_test.min()
#print(f"Range of 'no_show': {range_no_show:.3f}")

Perform ML with PCA (95% variance) features <br>

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Split BEFORE fitting PCA so the projection is learned from training rows only;
# fitting on all of X would leak information from the test rows into the features
raw_X_train, raw_X_test, pca_Y_train, pca_Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

pca = PCA(n_components = 0.95) # Keep 95% of variance
pca_X_train = pca.fit_transform(raw_X_train)
pca_X_test = pca.transform(raw_X_test)

# Projection of the full feature matrix, kept for any later cell that still expects X_pca
X_pca = pca.transform(X)

print("Explained variance ratio: ", pca.explained_variance_ratio_)
print("Cumulative explained variance: ", np.cumsum(pca.explained_variance_ratio_))

# Balance the training split only; the test split keeps its original class distribution
resampled_pca_X_train, resampled_pca_Y_train = smote.fit_resample(pca_X_train, pca_Y_train)

# Fixed seed so the reported scores are reproducible
pca_model = RandomForestClassifier(random_state = 42)
pca_model.fit(resampled_pca_X_train, resampled_pca_Y_train)

# Evaluate against the labels produced by THIS cell's split rather than relying on
# a Y_test from another cell happening to use the same rows
pca_Y_predict = pca_model.predict(pca_X_test)
pca_confuse_matrix = confusion_matrix(pca_Y_test, pca_Y_predict)
pca_class_rpt = classification_report(pca_Y_test, pca_Y_predict)

print("PCA Random Forest Classifier - ")
print(f"Confusion Matrix: \n{pca_confuse_matrix}")
print(f"Classification Report: \n{pca_class_rpt}")

Perform cross-validation with PCA feature <br>
(Skipped for classifier)

In [None]:
#from sklearn.model_selection import cross_val_score
#x_valid_model = RandomForestClassifier(random_state = 42)
#
## Perform 5-fold cross validation
#scores = cross_val_score(x_valid_model, X, Y, cv=5, scoring="neg_mean_squared_error")
#
#print(f"Cross-validation Mean Mean Squared Error: {-scores.mean():.4f}")
#print(f"Cross-validation STD Mean Squared Error: {-scores.std():.4f}")