#### Importing Required Libraries

In [None]:
!pip3 install snowflake-connector-python

In [None]:
!pip3 install scikit-learn numpy pandas matplotlib scipy seaborn plotly


In [None]:
import configparser
import snowflake.connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.stats import mode
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')


#### Data loading / Data Info

### Dataset Description for Car Insurance Policy Price Prediction

#### Overview:
The dataset contains transaction histories of customers who purchased insurance policies. Each customer's entire quote history is recorded, with the last row indicating the purchased coverage options.

#### Key Concepts:
- **Customer**: May represent multiple individuals as policies can cover more than one person. Each customer has multiple shopping points (instances where they view products with specific characteristics and costs).
- **Shopping Point**: Defined by a customer's interaction with a product at a specific time. Characteristics and product costs may change over time.
- **Product Options**: Each product has 7 customizable options with 2-4 possible ordinal values.

#### Variables:
- **customer_ID**: Unique identifier for each customer.
- **shopping_pt**: Unique identifier for the shopping point of a given customer.
- **record_type**: Indicates whether the record is a shopping point (0) or purchase point (1).
- **day**: Day of the week (0-6, 0=Monday).
- **time**: Time of day (HH:MM).
- **state**: State where the shopping point occurred.
- **location**: Location ID of the shopping point.
- **group_size**: Number of people covered under the policy (1-4).
- **homeowner**: Homeownership status (0=no, 1=yes).
- **car_age**: Age of the customer’s car.
- **car_value**: Value of the customer’s car when new.
- **risk_factor**: Risk assessment of the customer (1-4).
- **age_oldest**: Age of the oldest person in the customer's group.
- **age_youngest**: Age of the youngest person in the customer's group.
- **married_couple**: Indicates if the customer group contains a married couple (0=no, 1=yes).
- **C_previous**: The value of product option C that the customer previously held, i.e. the type of vehicle previously insured (0=nothing, 1=Economy, 2=Mid-sized, 3=Luxury, 4=High-performance).
- **duration_previous**: Duration (in years) the customer was covered by their previous insurer.

#### Coverage Options:
- **A**: Insurance coverage/risk profile (0=Basic, 1=Standard, 2=Premium).
- **B**: Binary policyholder attribute (0=Non-smoker, 1=Smoker).
- **C**: Type of insured vehicle (1=Economy, 2=Mid-sized, 3=Luxury, 4=High-performance).
- **D**: Usage/purpose of the vehicle (1=Personal, 2=Business, 3=Commercial).
- **E**: Vehicle safety features (0=No, 1=Yes).
- **F**: Driver's record/history (0=Clean, 1=Minor violations, 2=Accidents, 3=Severe violations).
- **G**: Geographical location/risk zone (1=Urban, 2=Suburban, 3=Rural, 4=Hazardous).

#### Target Variable:
- **cost**: Cost of the quoted coverage options.

This dataset is used to predict the price of car insurance policies based on customer characteristics, product options, and interaction history.

In [None]:
# Read the Snowflake connection settings from the local config file.
# ConfigParser.read() silently skips files it cannot open, so fail loudly here
# instead of with an unrelated KeyError on the 'snowflake' section below.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found or unreadable")

# Get the Snowflake credentials
snowflake_config = config['snowflake']
user = snowflake_config['user']
password = snowflake_config['password']
account = snowflake_config['account']
warehouse = snowflake_config['warehouse']
database = snowflake_config['database']
schema = snowflake_config['schema']
role = snowflake_config['role']

# Step 1: Connect to Snowflake
conn = snowflake.connector.connect(
    user=user,
    password=password,
    account=account,
    warehouse=warehouse,
    database=database,
    schema=schema,
    role=role,
)

# Steps 2-4 run inside try/finally so the cursor and the connection are
# always released, even when the query or the DataFrame construction fails.
try:
    # Step 2: Execute SQL Query
    cur = conn.cursor()
    try:
        cur.execute('SELECT * FROM insurancetable')

        # Step 3: Fetch Data; column names come from the cursor description
        data = cur.fetchall()
        df = pd.DataFrame(data, columns=[x[0] for x in cur.description])
    finally:
        cur.close()
finally:
    # Step 4: Close the Connection
    conn.close()


In [None]:
# Display the first few rows of the dataset
print("\nFirst 5 rows of the dataset:")
df.head()

In [None]:
# Display basic information about the dataset
print("Dataset Information:")
df.info()


In [None]:
# Summary statistics for numerical columns
print("\nSummary statistics for numerical columns:")
df.describe()

In [None]:
# Check for missing values
print("\nMissing values:")
df.isnull().sum()

In [None]:
# Replace empty strings with NaN.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df[col]` mutates an intermediate object and is not guaranteed to update
# `df` itself (chained assignment).
df['STATE'] = df['STATE'].replace('', np.nan)
df['CAR_VALUE'] = df['CAR_VALUE'].replace('', np.nan)

In [None]:
df.isnull().sum()

## Data Preprocessing

#### Dropping duplicates

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace =True)

In [None]:
df.duplicated().sum()

In [None]:
# Drop rows whose 'STATE' value is the string '0', which is treated here as an
# irrelevant placeholder rather than a real state.
df = df[df['STATE'] != '0']

In [None]:
df['STATE'].value_counts()

### Missing Value Imputation

In [None]:
# Impute missing values
def impute_grouped_data(df, column, method='mode'):
    """Fill missing values of ``column`` in place, one CUSTOMER_ID group at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'CUSTOMER_ID' column and ``column``. It is modified
        in place.
    column : str
        Name of the column to impute.
    method : str, default 'mode'
        One of 'mode', 'median', 'mean', 'ffill' or 'bfill'. Any other value
        leaves ``df`` untouched.

    Returns
    -------
    None
    """
    def _fill_with_mode(values):
        # Compute the mode once; a group with no observed value has no mode
        # and is returned unchanged.
        modes = values.mode()
        if modes.empty:
            return values
        return values.fillna(modes.iloc[0])

    fillers = {
        'mode': _fill_with_mode,
        'median': lambda values: values.fillna(values.median()),
        'mean': lambda values: values.fillna(values.mean()),
        # Series.ffill()/bfill() replace the deprecated fillna(method=...).
        'ffill': lambda values: values.ffill(),
        'bfill': lambda values: values.bfill(),
    }

    filler = fillers.get(method)
    if filler is None:
        return

    df[column] = df.groupby('CUSTOMER_ID')[column].transform(filler)


In [None]:
# Every column listed here is imputed with the per-customer mode; the mapping
# keeps the column -> method shape so a different method can be set per column.
columns_to_impute = dict.fromkeys(
    [
        'LOCATION', 'GROUP_SIZE', 'HOMEOWNER', 'STATE', 'CAR_VALUE',
        'CAR_AGE', 'RISK_FACTOR', 'AGE_OLDEST', 'AGE_YOUNGEST',
        'MARRIED_COUPLE', 'C_PREVIOUS', 'DURATION_PREVIOUS',
        'A', 'B', 'C', 'D', 'E', 'F', 'G',
    ],
    'mode',
)

# Run the grouped imputation column by column, in the order listed above.
for column, method in columns_to_impute.items():
    impute_grouped_data(df, column, method)

In [None]:
# CUSTOMER_ID + SHOPPING_PT together identify a record, so rows without a
# SHOPPING_PT are discarded; the remainder is ordered by customer and then by
# shopping point.
df = (
    df.dropna(subset=['SHOPPING_PT'])
      .sort_values(by=['CUSTOMER_ID', 'SHOPPING_PT'])
)

# Renumber SHOPPING_PT so that each customer's sequence runs 1, 2, 3, ...
# in the sorted order.
df['SHOPPING_PT'] = 1 + df.groupby('CUSTOMER_ID').cumcount()


In [None]:
# Function to handle missing record_type values according to specified rules
def fill_missing_record_type(group):
    """Fill missing RECORD_TYPE values for one customer's rows.

    The rows are ordered by SHOPPING_PT and re-indexed from 0. A missing
    RECORD_TYPE on the last row becomes 1 (purchase point); a missing value on
    any earlier row becomes 0 (shopping point). Values that are already
    present are left unchanged.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single customer, with 'SHOPPING_PT' and 'RECORD_TYPE' columns.

    Returns
    -------
    pandas.DataFrame
        A sorted, re-indexed copy of ``group`` with RECORD_TYPE filled.
    """
    # Ensure group is sorted by 'SHOPPING_PT'
    group = group.sort_values('SHOPPING_PT').reset_index(drop=True)
    if group.empty:
        return group

    # Per-row default: 0 everywhere except the final row, which defaults to 1.
    defaults = pd.Series(0, index=group.index)
    defaults.iloc[-1] = 1

    # One aligned fillna on the column replaces the chained
    # `group['RECORD_TYPE'].iloc[i] = ...` writes, which may land on a
    # temporary copy instead of `group`.
    group['RECORD_TYPE'] = group['RECORD_TYPE'].fillna(defaults)

    return group

# Fill RECORD_TYPE customer by customer, then flatten the result back to a
# plain 0..n-1 index.
df = (
    df.groupby('CUSTOMER_ID', group_keys=False)
      .apply(fill_missing_record_type)
      .reset_index(drop=True)
)


In [None]:
def fill_missing_days(df):
    """Fill missing DAY values from the same customer's neighbouring rows.

    Within each CUSTOMER_ID group a missing DAY takes the previous known value
    (forward fill); a gap at the start of the group then takes the next known
    value (backward fill). After both passes the only DAY values still missing
    belong to customers with no DAY recorded at all; there is nothing to infer
    for them, so they stay NaN.

    The earlier per-customer loop is removed. It could only reach those
    all-missing customers, where looking up the most frequent day raised
    IndexError, and it rescanned the whole frame once per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'CUSTOMER_ID' and 'DAY' columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    # Forward fill missing values within each customer group
    df['DAY'] = df.groupby('CUSTOMER_ID')['DAY'].ffill()

    # Backward fill whatever is still missing at the start of each group
    df['DAY'] = df.groupby('CUSTOMER_ID')['DAY'].bfill()

    return df

# Fill missing DAY values per customer; fill_missing_days mutates df and also returns it.
df = fill_missing_days(df)

In [None]:
# Convert 'TIME' to datetime for easier manipulation.
# Values that do not match HH:MM:SS are coerced to NaT (errors='coerce').
# NOTE(review): the dataset description lists TIME as HH:MM; if the stored
# values really have no seconds, every row would become NaT here - confirm the
# actual format of the column.
df['TIME'] = pd.to_datetime(df['TIME'], format='%H:%M:%S', errors='coerce')

# Function to handle missing time values according to specified rules
def fill_missing_times(group):
    """Fill missing TIME values for one customer's rows.

    Rows are ordered by SHOPPING_PT and then filled in this order:

    * first row: two minutes before the second row's TIME, or 15:00 when the
      customer has only one row;
    * middle rows: two minutes after the previous row when both share the
      same DAY, otherwise two minutes before the next row when those share
      the same DAY, otherwise left missing;
    * last row: two minutes after the previous row when both share the same
      DAY, otherwise 15:00.

    A result derived from a neighbour whose TIME is itself missing stays
    missing.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single customer with 'SHOPPING_PT', 'DAY' and a datetime
        'TIME' column.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``group`` with TIME filled where the rules apply.
    """
    # Ensure group is sorted by 'SHOPPING_PT'
    group = group.sort_values('SHOPPING_PT')

    n = len(group)
    if n == 0:
        return group

    step = pd.Timedelta(minutes=2)
    # Fixed 15:00 fallback. The previous code built it from pd.Timestamp(DAY),
    # which reads the weekday number as nanoseconds since the epoch and gives
    # NaT when DAY is missing. The 1900-01-01 date is intended to match what
    # time-only '%H:%M:%S' parsing yields; only the time of day matters here.
    default_time = pd.Timestamp(year=1900, month=1, day=1, hour=15)

    # Positional column indexes: writing through .iat targets `group` itself,
    # unlike chained `group['TIME'].iloc[i] = ...`, which may write to a copy.
    time_col = group.columns.get_loc('TIME')
    day_col = group.columns.get_loc('DAY')

    # Handle first row
    if pd.isnull(group.iat[0, time_col]):
        if n > 1:
            group.iat[0, time_col] = group.iat[1, time_col] - step
        else:
            group.iat[0, time_col] = default_time

    # Handle middle rows
    for i in range(1, n - 1):
        if not pd.isnull(group.iat[i, time_col]):
            continue
        day = group.iat[i, day_col]
        if day == group.iat[i - 1, day_col]:
            group.iat[i, time_col] = group.iat[i - 1, time_col] + step
        elif day == group.iat[i + 1, day_col]:
            group.iat[i, time_col] = group.iat[i + 1, time_col] - step

    # Handle last row if more than one row exists
    if n > 1 and pd.isnull(group.iat[n - 1, time_col]):
        if group.iat[n - 1, day_col] == group.iat[n - 2, day_col]:
            group.iat[n - 1, time_col] = group.iat[n - 2, time_col] + step
        else:
            group.iat[n - 1, time_col] = default_time

    return group

# Fill TIME customer by customer and flatten the index back to 0..n-1.
df = (
    df.groupby('CUSTOMER_ID', group_keys=False)
      .apply(fill_missing_times)
      .reset_index(drop=True)
)

# Render 'TIME' as an HH:MM:SS string again.
df['TIME'] = df['TIME'].dt.strftime('%H:%M:%S')

In [None]:
# Check for missing values
print("\nMissing values:")
df.isnull().sum()

In [None]:
# Fill the values that the per-customer mode imputation could not resolve.
# A value that is still missing presumably means the customer had no previous
# policy, so both columns default to 0.
# The result is assigned back to the column: `df[col].fillna(0, inplace=True)`
# mutates an intermediate object and is not guaranteed to update `df`.
df['C_PREVIOUS'] = df['C_PREVIOUS'].fillna(0)
df['DURATION_PREVIOUS'] = df['DURATION_PREVIOUS'].fillna(0)

In [None]:
# Drop rows that are still missing any of the columns listed below.
# RISK_FACTOR is deliberately not in the list, so rows with a missing
# RISK_FACTOR are kept at this step.
df = df.dropna(subset = ['CAR_VALUE','GROUP_SIZE','HOMEOWNER','MARRIED_COUPLE','B','TIME'])

In [None]:
# Check for missing values
print("\nMissing values:")
df.isnull().sum()

In [2]:
# Filter the CAR_AGE column to remove outliers: keep only rows with CAR_AGE <= 25.
# Rows with a missing CAR_AGE are dropped as well, because NaN <= 25 is False.
df = df[df['CAR_AGE'] <= 25]

In [None]:
df['CAR_AGE'].value_counts()

In [None]:
df['RISK_FACTOR'].value_counts()

In [None]:
'''Imputation of null values in RISKFACTOR using K-Means Clustering'''
# # Identify features to be used for clustering
# features = ['HOMEOWNER', 'GROUP_SIZE', 'CAR_AGE', 'CAR_VALUE', 'AGE_OLDEST', 'AGE_YOUNGEST', 'MARRIED_COUPLE', 'COST']

# # Drop rows where any feature for clustering is null, excluding RISK_FACTOR
# X = df[features]
# X = X.dropna(subset=features)

# # Apply KMeans clustering
# # scaler = StandardScaler()
# # X = scaler.fit_transform(X.drop(columns=['CAR_VALUE']))

# # Encode 'CAR_VALUE' after splitting and scaling
# le_car_value = LabelEncoder()
# X['CAR_VALUE'] = le_car_value.fit_transform(X['CAR_VALUE'].fillna(-1))

# # Perform clustering
# kmeans = KMeans(n_clusters=4, random_state=42)  # Number of clusters can be adjusted
# clusters = kmeans.fit_predict(X)

# Add clusters to the dataframe
# df.loc[X.index, 'Cluster'] = clusters

# # Function to impute missing RISK_FACTOR
# def impute_risk_factor(row):
#     if pd.isna(row['RISK_FACTOR']):
#         cluster = row['Cluster']
#         cluster_data = df[df['Cluster'] == cluster]['RISK_FACTOR'].dropna()
#         if not cluster_data.empty:
#             # Fix: Directly assign the mode value
#             mode_value = mode(cluster_data).mode
#         else:
#             mode_value = 1  # Default value if no non-missing values are found in the cluster
#         return mode_value
#     else:
#         return row['RISK_FACTOR']

# # Apply imputation
# df['RISK_FACTOR'] = df.apply(impute_risk_factor, axis=1)

# # Ensure consistency within CUSTOMER_ID
# customer_ids = df['CUSTOMER_ID'].unique()
# for customer_id in customer_ids:
#     customer_data = df[df['CUSTOMER_ID'] == customer_id]
#     if customer_data['RISK_FACTOR'].isna().any():
#         non_na_values = customer_data['RISK_FACTOR'].dropna()
#         if not non_na_values.empty:
#             # Fix: Directly assign the mode value
#             mode_value = mode(non_na_values).mode
#         else:
#             mode_value = 1  # Default value if no non-missing values are found for the customer
#         df.loc[df['CUSTOMER_ID'] == customer_id, 'RISK_FACTOR'] = mode_value

# # Drop the Cluster column as it's no longer needed
# df.drop(columns=['Cluster'], inplace=True)

# print(df)

In [None]:
'''Imputation of null values in RISKFACTOR using prediction/ Random Forest Model'''

# Separate rows with missing and non-missing RISK_FACTOR.
# .copy() makes both parts independent frames, so the column assignments
# below never write through to (or warn about) a slice of df.
risk_is_missing = df['RISK_FACTOR'].isna()
missing_risk_factor = df[risk_is_missing].copy()
non_missing_risk_factor = df[~risk_is_missing].copy()

# Select features for prediction
features = ['HOMEOWNER', 'GROUP_SIZE', 'CAR_AGE', 'CAR_VALUE', 'AGE_OLDEST', 'AGE_YOUNGEST', 'MARRIED_COUPLE', 'COST']

# Initialize a dictionary to store LabelEncoders
label_encoders = {}

# Label encode categorical variables
for feature in ['CAR_VALUE', 'STATE']:
    le = LabelEncoder()
    # Fit on every value present in df: fitting on the non-missing rows only
    # makes transform() raise for categories that occur solely in the rows
    # whose RISK_FACTOR is missing.
    le.fit(df[feature].astype(str))
    non_missing_risk_factor[feature] = le.transform(non_missing_risk_factor[feature].astype(str))
    missing_risk_factor[feature] = le.transform(missing_risk_factor[feature].astype(str))

    # Save the fitted LabelEncoder to dictionary
    label_encoders[feature] = le

# Train a model to predict RISK_FACTOR
X = non_missing_risk_factor[features]
y = non_missing_risk_factor['RISK_FACTOR']

# Ensure y has no missing values and is of correct length
assert len(X) == len(y), "Mismatch in number of samples between X and y"
assert y.isna().sum() == 0, "y contains missing values"

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Report hold-out accuracy so the quality of the imputation model is visible;
# the test split was otherwise never used.
print(f"RISK_FACTOR imputation model hold-out accuracy: {model.score(X_test, y_test):.3f}")

# Predict missing RISK_FACTOR. Skipped when nothing is missing, because
# predict() rejects an input with zero rows.
missing_X = missing_risk_factor[features]
if len(missing_X) > 0:
    predicted_risk_factor = model.predict(missing_X)

    # Assign predicted values back to the dataset
    missing_risk_factor['RISK_FACTOR'] = predicted_risk_factor

# Combine datasets (the original row index is preserved in both parts)
df_dummy = pd.concat([non_missing_risk_factor, missing_risk_factor])

# Ensure same RISK_FACTOR for each CUSTOMER_ID: use the customer's most
# frequent value. The assignment aligns on the row index, so only RISK_FACTOR
# in df changes; the encoded CAR_VALUE/STATE columns stay local to df_dummy.
df['RISK_FACTOR'] = df_dummy.groupby('CUSTOMER_ID')['RISK_FACTOR'].transform(lambda x: x.mode()[0])


In [None]:
df.isnull().sum()

In [None]:
df['STATE'].value_counts()

In [None]:
df['CAR_VALUE'].value_counts()

In [None]:
# Save or use the imputed dataset
df.to_csv('preprocessed_data.csv', index=False)

In [None]:
df.shape

In [None]:
df.head()

#### Loading the preprocessed data to Snowflake, then using it for visualization


In [None]:
from snowflake.connector.pandas_tools import write_pandas

preprocessed_df = df
preprocessed_df.info()

In [None]:
# Writing the data to snowflake table

# Read the config file.
# ConfigParser.read() silently skips files it cannot open, so fail loudly here
# instead of with an unrelated KeyError on the 'snowflake' section below.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found or unreadable")

# Get the Snowflake credentials
snowflake_config = config['snowflake']
user = snowflake_config['user']
password = snowflake_config['password']
account = snowflake_config['account']
warehouse = snowflake_config['warehouse']
database = snowflake_config['database']
schema = snowflake_config['schema']
role = snowflake_config['role']

# Step 1: Connect to Snowflake
conn = snowflake.connector.connect(
    user=user,
    password=password,
    account=account,
    warehouse=warehouse,
    database=database,
    schema=schema,
    role=role,
)

# Write the DataFrame to the Snowflake table. The connection is closed in
# `finally` so that it is released even when the upload raises.
try:
    success, nchunks, nrows, _ = write_pandas(conn, preprocessed_df, 'PREPROCESSED_DATA')
finally:
    # Close the connection
    conn.close()

if success:
    print(f"Successfully wrote {nrows} rows in {nchunks} chunks to the Snowflake table 'preprocessed_data'.")
else:
    print("Failed to write data to the Snowflake table.")


## EDA

In [None]:
df = pd.read_csv('preprocessed_data.csv')

In [None]:
# Pairplot using seaborn to visualize relationships between variables
sns.pairplot(df[['AGE_OLDEST', 'AGE_YOUNGEST', 'CAR_AGE', 'COST']])
plt.show()

In [None]:
# Plotly interactive scatter plot
fig = px.scatter(df, x='AGE_OLDEST', y='COST', color='RISK_FACTOR', 
                 title='Cost vs Age of Oldest Customer by Risk Factor')
fig.show()

In [None]:
# Distribution of cost by risk factor using seaborn
plt.figure(figsize=(10, 6))
sns.boxplot(x='RISK_FACTOR', y='COST', data=df)
plt.title('Distribution of Cost by Risk Factor')
plt.show()

In [None]:
# Interactive plot for car value
fig = px.histogram(df, x='CAR_VALUE', y='COST', color='CAR_VALUE', 
                   title='Cost Distribution by Car Value', barmode='group')
fig.show()

In [None]:
# Bar plot for coverage options using seaborn
plt.figure(figsize=(12, 6))
sns.countplot(x='A', data=df, palette='viridis')
plt.title('Count of Coverage Option A')
plt.show()

In [None]:
# Interactive bar plot for STATE vs COST.
# Aggregate to one mean COST per state first: passing the raw rows to px.bar
# stacks every record, so each bar would show a sum rather than the average
# that the title promises.
avg_cost_by_state = df.groupby('STATE', as_index=False)['COST'].mean()
fig = px.bar(avg_cost_by_state, x='STATE', y='COST', color='STATE', title='Average Cost by State')
fig.show()

In [None]:
# Interactive line plot for time series analysis.
# px.line connects points in row order, so the rows are sorted by TIME first;
# the HH:MM:SS strings sort chronologically.
fig = px.line(df.sort_values('TIME'), x='TIME', y='COST', title='Cost Over Time')
fig.show()

In [None]:
# Data Distribution of target column
plt.hist(df['COST']); plt.title('Cost of Insurance Policy')
plt.xlabel('Cost')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Bivariate analysis: Correlation matrix and heatmap
plt.figure(figsize=(20, 12))
sns.heatmap(df.drop(['TIME','STATE','CAR_VALUE'], axis =1).corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

## ML Model Development 

#### Manually Selected features 

In [None]:
selected_features = ['STATE','GROUP_SIZE','HOMEOWNER','CAR_AGE','CAR_VALUE',
                     'MARRIED_COUPLE','AGE_YOUNGEST','C_PREVIOUS','DURATION_PREVIOUS', 'A' ,'B','C','D','E','F' ,'G']

In [None]:
len(selected_features)

#### Encoding Categorical Columns

In [None]:
# Fitted encoders are kept per column so the same mapping can be reused later.
label_encoders = {}

# Integer-encode the two categorical columns (values are compared as strings).
for feature in ['CAR_VALUE', 'STATE']:
    as_text = df[feature].astype(str)
    le = LabelEncoder()
    df[feature] = le.fit_transform(as_text)
    label_encoders[feature] = le

In [None]:
df['STATE'].value_counts()

In [None]:
df['CAR_VALUE'].value_counts()

#### Train-Test Split

In [None]:
    # Define target variable
target = 'COST'
# Subset the data with selected features
X_selected = df[selected_features]
y = df[target]
# Train-test split: 70% train / 30% test with a fixed seed for reproducibility.
# (Fixes the misspelled call `trai_test_split`, which raised NameError.)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

In [None]:
#X_selected.duplicated().sum()

In [None]:
#X_selected.drop_duplicates(inplace=True)

In [None]:
X_selected.head()

In [None]:
X_selected.shape

#### Scaling

In [None]:
# Columns to scale; all other selected features are left as they are.
columns_to_scale = ['CAR_AGE', 'AGE_YOUNGEST']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training set only and reuse that fit to transform the
# test set, so the test data does not influence the scaling parameters.
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])


#### Model Training  

In [None]:
# Train the chosen regressor (a Random Forest) on the full training split.
best_model = RandomForestRegressor(n_estimators=100, random_state=42)
best_model.fit(X_train, y_train)

# Predict once per split.
train_predictions = best_model.predict(X_train)
test_predictions = best_model.predict(X_test)

# Training-set metrics.
train_mae = mean_absolute_error(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, train_predictions)

# Test-set metrics.
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, test_predictions)

# Report both splits, training first.
print(f"Train Set Evaluation for Random Forest:")
print(f"  MAE: {train_mae}")
print(f"  MSE: {train_mse}")
print(f"  RMSE: {train_rmse}")
print(f"  R2: {train_r2}")

print(f"\nTest Set Evaluation for Random Forest:")
print(f"  MAE: {test_mae}")
print(f"  MSE: {test_mse}")
print(f"  RMSE: {test_rmse}")
print(f"  R2: {test_r2}")

In [None]:
# Create DataFrame with features, actual and predicted values for test set
test_results = pd.DataFrame(X_test, columns=selected_features)
test_results['Actual'] = y_test.values
test_results['Predicted'] = test_predictions

print("\nSample of Test Results:")
print(test_results.head())

#### Model Evaluation

In [None]:
# Compare metrics between train and test sets
metrics_comparison = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
    'Train': [train_mae, train_mse, train_rmse, train_r2],
    'Test': [test_mae, test_mse, test_rmse, test_r2]
})

# Save results to a CSV file
metrics_comparison.to_csv('model_metrics.csv', index=True)
print("\nMetrics Comparison:")
metrics_comparison

In [None]:

# Visualize the metrics comparison
plt.figure(figsize=(10, 6))
sns.barplot(x='Metric', y='value', hue='variable', data=pd.melt(metrics_comparison, ['Metric']))
plt.title('Metrics Comparison between Train and Test Sets')
plt.ylabel('Value')
plt.show()

#### Saving the model for Deployment

In [None]:
import joblib
# # Save the preprocessing steps and the model
# # Save the LabelEncoders using joblib
# for feature, encoder in label_encoders.items():
#     joblib.dump(encoder, f'{feature}_label_encoder.pkl')
# NOTE(review): inference on raw STATE / CAR_VALUE values presumably needs the
# fitted label encoders as well - confirm whether the block above should be
# re-enabled.
joblib.dump(scaler, 'minmax_scaler.pkl')
# Persist the cost regressor. `model` is the RandomForestClassifier that was
# trained to impute RISK_FACTOR; the trained price model is `best_model`.
joblib.dump(best_model, 'random_forest_model.pkl')