In [None]:
# !pip install python-dotenv

In [None]:
!pip install snowflake-connector-python

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import snowflake.connector
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.stats import mode
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
import sklearn
import joblib

In [None]:
!python --version
# .__version__

### Imputation + Preprocessing

In [None]:
# Step 1: Connect to Snowflake
# Credentials are never stored in the notebook: they are read from the
# SNOWFLAKE_USER / SNOWFLAKE_PASSWORD environment variables, with an interactive
# prompt as a fallback. Any password that was previously committed in this
# notebook should be considered leaked and rotated.
import os
from getpass import getpass

conn = snowflake.connector.connect(
    user=os.environ.get('SNOWFLAKE_USER') or input('Snowflake user: '),
    password=os.environ.get('SNOWFLAKE_PASSWORD') or getpass('Snowflake password: '),
    account='sgb52108.us-east-1',
    warehouse='INSURANCEWH',
    database='INSURANCEDB',
    schema='INSURANCESCHEMA',
    role='ACCOUNTADMIN',
)

# try/finally guarantees the cursor and the connection are released even if
# the query or the fetch raises.
try:
    # Step 2: Execute SQL Query
    cur = conn.cursor()
    try:
        cur.execute('SELECT * FROM insurancetable')

        # Step 3: Fetch Data (column names come from the cursor description)
        data = cur.fetchall()
        df = pd.DataFrame(data, columns=[x[0] for x in cur.description])
    finally:
        cur.close()
finally:
    # Step 4: Close the Connection
    conn.close()

In [None]:
# Display the first few rows of the dataset
print("\nFirst 5 rows of the dataset:")
df.head()

In [None]:
# Display basic information about the dataset
print("Dataset Information:")
df.info()


In [None]:
# Summary statistics for numerical columns
print("\nSummary statistics for numerical columns:")
df.describe()

In [None]:
df.dtypes

In [None]:
# Check for missing values
print("\nMissing values:")
df.isnull().sum()

In [None]:
# df['DURATION_PREVIOUS'].value_counts()

In [None]:
# Replace empty strings with NaN.
# The result is assigned back to the column: calling replace(inplace=True) on
# df[col] is chained assignment, which warns in pandas 2.x and does not update
# df at all once Copy-on-Write is enabled.
df['STATE'] = df['STATE'].replace('', np.nan)
df['CAR_VALUE'] = df['CAR_VALUE'].replace('', np.nan)

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace =True)

In [None]:
df.duplicated().sum()

In [None]:
df.head()

In [None]:
# Impute missing values
def impute_grouped_data(df, column, method='mode'):
    """Fill missing values of ``column`` using only rows with the same CUSTOMER_ID.

    The frame is modified in place (``df[column]`` is overwritten); nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'CUSTOMER_ID' column and ``column``.
    column : str
        Name of the column to impute.
    method : {'mode', 'median', 'mean', 'ffill', 'bfill'}
        Per-customer fill strategy. With 'mode', a customer whose values are
        all missing is left unchanged.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies (previously an
        unknown method was silently ignored).
    """
    def _fill_with_mode(values):
        # mode() ignores NaN; it is empty when the whole group is missing.
        modes = values.mode()
        return values if modes.empty else values.fillna(modes.iloc[0])

    fillers = {
        'mode': _fill_with_mode,
        'median': lambda values: values.fillna(values.median()),
        'mean': lambda values: values.fillna(values.mean()),
        # .ffill()/.bfill() replace the deprecated fillna(method=...)
        'ffill': lambda values: values.ffill(),
        'bfill': lambda values: values.bfill(),
    }
    if method not in fillers:
        raise ValueError(
            f"Unknown imputation method {method!r}; expected one of {sorted(fillers)}"
        )

    df[column] = df.groupby('CUSTOMER_ID')[column].transform(fillers[method])


In [None]:
# Columns to impute; every one of them is filled with its per-customer mode.
mode_imputed_columns = [
    'LOCATION', 'GROUP_SIZE', 'HOMEOWNER', 'STATE', 'CAR_VALUE', 'CAR_AGE',
    'RISK_FACTOR', 'AGE_OLDEST', 'AGE_YOUNGEST', 'MARRIED_COUPLE',
    'C_PREVIOUS', 'DURATION_PREVIOUS',
    'A', 'B', 'C', 'D', 'E', 'F', 'G',
]
# Mapping of column name -> imputation method, in the order listed above.
columns_to_impute = {name: 'mode' for name in mode_imputed_columns}

# Apply imputation (each call overwrites df[column] in place)
for column, method in columns_to_impute.items():
    impute_grouped_data(df, column, method)

In [None]:
# Keep only rows that have a SHOPPING_PT, ordered by customer and then by
# shopping point.
df = (
    df.dropna(subset=['SHOPPING_PT'])
      .sort_values(by=['CUSTOMER_ID', 'SHOPPING_PT'])
)

# Renumber SHOPPING_PT as 1, 2, 3, ... within each CUSTOMER_ID, following the
# sorted order, so gaps left by dropped rows disappear.
position_within_customer = df.groupby('CUSTOMER_ID').cumcount()
df['SHOPPING_PT'] = position_within_customer + 1


In [None]:
# Function to handle missing record_type values according to specified rules
def fill_missing_record_type(group):
    """Fill missing RECORD_TYPE values for a single customer's rows.

    The rows are sorted by SHOPPING_PT and re-indexed from 0. A missing
    RECORD_TYPE on the last row becomes 1; a missing RECORD_TYPE on any
    earlier row becomes 0. Values that are already present are kept.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one customer; needs 'SHOPPING_PT' and 'RECORD_TYPE' columns.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with RECORD_TYPE filled.
    """
    # sort_values + reset_index produce a fresh frame, so the writes below
    # never touch the caller's data.
    group = group.sort_values('SHOPPING_PT').reset_index(drop=True)
    if group.empty:
        return group

    missing = group['RECORD_TYPE'].isna()

    # Single .loc writes replace the chained `group[col].iloc[i] = v`
    # assignments, which are unreliable (and a no-op under Copy-on-Write).
    group.loc[missing, 'RECORD_TYPE'] = 0
    if missing.iloc[-1]:
        group.loc[group.index[-1], 'RECORD_TYPE'] = 1

    return group

# Apply the function to each group of 'CUSTOMER_ID'
df = df.groupby('CUSTOMER_ID', group_keys=False).apply(fill_missing_record_type)

# Reset index
df.reset_index(drop=True, inplace=True)


In [None]:
def fill_missing_days(df):
  """Fill missing DAY values from the neighbouring rows of the same customer.

  Within each CUSTOMER_ID, a missing DAY takes the previous known DAY
  (forward fill); leading gaps then take the next known DAY (backward fill).
  The frame is modified in place and also returned.

  After these two passes a DAY can only still be missing when the customer has
  no DAY at all. Such rows are left as NaN: there is no per-customer value to
  fall back on. The earlier per-customer loop that tried to fill them with the
  customer's mode could never succeed (the mode of an all-missing column is
  empty, so it raised IndexError) and rescanned the whole frame once per
  customer, so it has been removed.
  """
  # Forward fill missing values within each customer group
  df['DAY'] = df.groupby('CUSTOMER_ID')['DAY'].ffill()

  # Backward fill whatever is still missing at the start of each group
  df['DAY'] = df.groupby('CUSTOMER_ID')['DAY'].bfill()

  return df

df = fill_missing_days(df)

In [None]:
# Convert 'TIME' to datetime for easier manipulation
df['TIME'] = pd.to_datetime(df['TIME'], format='%H:%M:%S', errors='coerce')

# Function to handle missing time values according to specified rules
def fill_missing_times(group):
    """Fill missing TIME values for a single customer's rows.

    Rows are processed in SHOPPING_PT order; 'TIME' must already be a datetime
    column. Rules:

    * first row: 2 minutes before the second row's time; 15:00:00 when it is
      the only row;
    * middle rows: 2 minutes after the previous row if both share the same
      DAY, otherwise 2 minutes before the next row if those share the same
      DAY, otherwise left missing;
    * last row (when there is more than one row): 2 minutes after the previous
      row if both share the same DAY, otherwise 15:00:00.

    A value derived from a neighbour that is itself missing stays missing.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one customer; needs 'SHOPPING_PT', 'DAY' and 'TIME' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by SHOPPING_PT (original index labels kept) with
        TIME filled.
    """
    step = pd.Timedelta(minutes=2)
    # Fallback when no neighbouring time is usable. Only the time of day is
    # used downstream; 1900-01-01 is the date pandas assigns to values parsed
    # with the '%H:%M:%S' format. The fallback deliberately does not depend on
    # DAY any more (it used to be built from pd.Timestamp(DAY)).
    default_time = pd.Timestamp('1900-01-01 15:00:00')

    # Ensure group is sorted by 'SHOPPING_PT' (returns a new frame)
    group = group.sort_values('SHOPPING_PT')
    n = len(group)
    if n == 0:
        return group

    # Work on plain Python lists and write the column back once; this replaces
    # chained `group['TIME'].iloc[i] = ...` assignments, which are unreliable
    # and do nothing under Copy-on-Write.
    times = list(group['TIME'])
    days = list(group['DAY'])

    # Handle first row
    if pd.isnull(times[0]):
        times[0] = times[1] - step if n > 1 else default_time

    # Handle middle rows (earlier fills are visible to later rows)
    for i in range(1, n - 1):
        if pd.isnull(times[i]):
            if days[i] == days[i - 1]:
                times[i] = times[i - 1] + step
            elif days[i] == days[i + 1]:
                times[i] = times[i + 1] - step

    # Handle last row if more than one row exists
    if n > 1 and pd.isnull(times[-1]):
        times[-1] = times[-2] + step if days[-1] == days[-2] else default_time

    group['TIME'] = pd.to_datetime(pd.Series(times, index=group.index))
    return group

# Apply the function to each group of 'CUSTOMER_ID'
df = df.groupby('CUSTOMER_ID', group_keys=False).apply(fill_missing_times)

# Convert 'TIME' back to string format
df['TIME'] = df['TIME'].dt.strftime('%H:%M:%S')

# Reset index
df.reset_index(drop=True, inplace=True)

In [None]:
# Check for missing values
print("\nMissing values:")
df.isnull().sum()

In [None]:
# Remaining gaps in the previous-policy columns are filled with 0.
# Assigning the result back avoids fillna(inplace=True) on a column selection,
# which is chained assignment and does not update df under Copy-on-Write.
df['C_PREVIOUS'] = df['C_PREVIOUS'].fillna(0)
df['DURATION_PREVIOUS'] = df['DURATION_PREVIOUS'].fillna(0)

In [None]:
# Check for missing values
print("\nMissing values:")
df.isnull().sum()

In [None]:
df.head()

In [None]:
# Check for missing values
print("\nMissing values:")
df.isnull().sum()

In [None]:
df['RISK_FACTOR'].value_counts()

In [None]:
df = df.dropna(subset = ['CAR_VALUE','GROUP_SIZE','HOMEOWNER','MARRIED_COUPLE','B','TIME'])

In [None]:

# # Identify features to be used for clustering
# features = ['HOMEOWNER', 'GROUP_SIZE', 'CAR_AGE', 'CAR_VALUE', 'AGE_OLDEST', 'AGE_YOUNGEST', 'MARRIED_COUPLE', 'COST']

# # Drop rows where any feature for clustering is null, excluding RISK_FACTOR
# X = df[features]
# X = X.dropna(subset=features)

# # Apply KMeans clustering
# # scaler = StandardScaler()
# # X = scaler.fit_transform(X.drop(columns=['CAR_VALUE']))

# # Encode 'CAR_VALUE' after splitting and scaling
# le_car_value = LabelEncoder()
# X['CAR_VALUE'] = le_car_value.fit_transform(X['CAR_VALUE'].fillna(-1))

# # Perform clustering
# kmeans = KMeans(n_clusters=4, random_state=42)  # Number of clusters can be adjusted
# clusters = kmeans.fit_predict(X)


In [None]:
# Add clusters to the dataframe
# df.loc[X.index, 'Cluster'] = clusters

In [None]:
# Separate rows with missing and non-missing RISK_FACTOR.
# .copy() makes both frames independent of df, so the column assignments below
# are real writes rather than writes into a slice (SettingWithCopyWarning).
risk_is_missing = df['RISK_FACTOR'].isna()
missing_risk_factor = df[risk_is_missing].copy()
non_missing_risk_factor = df[~risk_is_missing].copy()

# Select features for prediction
features = ['HOMEOWNER', 'GROUP_SIZE', 'CAR_AGE', 'CAR_VALUE', 'AGE_OLDEST', 'AGE_YOUNGEST', 'MARRIED_COUPLE', 'COST']

# Initialize a dictionary to store LabelEncoders
label_encoders = {}

# Label encode categorical variables. STATE is not a model feature here, but it
# is encoded as well so the saved dataset contains numeric codes for it.
for feature in ['CAR_VALUE', 'STATE']:
    le = LabelEncoder()
    # Fit on every value present in df: fitting only on the rows with a known
    # RISK_FACTOR makes transform() raise for labels that occur solely in the
    # rows being imputed. When no such labels exist the codes are unchanged.
    le.fit(df[feature].astype(str))
    non_missing_risk_factor[feature] = le.transform(non_missing_risk_factor[feature].astype(str))
    missing_risk_factor[feature] = le.transform(missing_risk_factor[feature].astype(str))

    # Save the fitted LabelEncoder to dictionary
    label_encoders[feature] = le

# Train a model to predict RISK_FACTOR
X = non_missing_risk_factor[features]
y = non_missing_risk_factor['RISK_FACTOR']

# Ensure y has no missing values and is of correct length
assert len(X) == len(y), "Mismatch in number of samples between X and y"
assert y.isna().sum() == 0, "y contains missing values"

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Report how well the imputation model does on the held-out rows, so the split
# above is actually used.
print(f"RISK_FACTOR imputation model hold-out accuracy: {model.score(X_test, y_test):.3f}")

# Predict missing RISK_FACTOR (skipped when nothing is missing, because
# predict() rejects an empty input)
if not missing_risk_factor.empty:
    missing_X = missing_risk_factor[features]
    predicted_risk_factor = model.predict(missing_X)

    # Assign predicted values back to the dataset
    missing_risk_factor['RISK_FACTOR'] = predicted_risk_factor

# Combine datasets
df = pd.concat([non_missing_risk_factor, missing_risk_factor])

# Ensure same RISK_FACTOR for each CUSTOMER_ID
df['RISK_FACTOR'] = df.groupby('CUSTOMER_ID')['RISK_FACTOR'].transform(lambda x: x.mode()[0])


In [None]:
df.isnull().sum()

In [None]:
df['STATE'].value_counts()

In [None]:
df['CAR_VALUE'].value_counts()

In [None]:
# Save or use the imputed dataset
df.to_csv('preprocessed_data.csv', index=False)

In [None]:
df.shape

In [None]:
# Call head() so the first rows are displayed (a bare `df.head` only shows the
# bound-method repr).
df.head()

In [None]:
# # Function to impute missing RISK_FACTOR
# def impute_risk_factor(row):
#     if pd.isna(row['RISK_FACTOR']):
#         cluster = row['Cluster']
#         cluster_data = df[df['Cluster'] == cluster]['RISK_FACTOR'].dropna()
#         if not cluster_data.empty:
#             # Fix: Directly assign the mode value
#             mode_value = mode(cluster_data).mode
#         else:
#             mode_value = 1  # Default value if no non-missing values are found in the cluster
#         return mode_value
#     else:
#         return row['RISK_FACTOR']

# # Apply imputation
# df['RISK_FACTOR'] = df.apply(impute_risk_factor, axis=1)

# # Ensure consistency within CUSTOMER_ID
# customer_ids = df['CUSTOMER_ID'].unique()
# for customer_id in customer_ids:
#     customer_data = df[df['CUSTOMER_ID'] == customer_id]
#     if customer_data['RISK_FACTOR'].isna().any():
#         non_na_values = customer_data['RISK_FACTOR'].dropna()
#         if not non_na_values.empty:
#             # Fix: Directly assign the mode value
#             mode_value = mode(non_na_values).mode
#         else:
#             mode_value = 1  # Default value if no non-missing values are found for the customer
#         df.loc[df['CUSTOMER_ID'] == customer_id, 'RISK_FACTOR'] = mode_value

# # Drop the Cluster column as it's no longer needed
# df.drop(columns=['Cluster'], inplace=True)

# print(df)

In [None]:
# Bivariate analysis: Correlation matrix and heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df.drop(['TIME'], axis =1).corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix')
plt.show()

###Preprocessed csv

In [None]:
# Read the file from the same relative location it was written to earlier in
# this notebook, instead of a hardcoded absolute '/content/' path.
df = pd.read_csv('preprocessed_data.csv')

In [None]:
# df= df.drop('Cluster',axis =1)

In [None]:
df.shape

In [None]:
df.head(10)

Encoding

In [None]:
# # Assuming df is your DataFrame
# categorical_features = ['STATE']
# print(categorical_features)
# # Label encoding
# label_encoder = LabelEncoder()
# for col in categorical_features:
#     df[col] = label_encoder.fit_transform(df[col])



###Train-test split

In [None]:
# Define target variable
target = 'COST'

# Train-test split
X = df.drop(columns=[target,'TIME','CUSTOMER_ID'])
#X = X.dropna()
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X.isnull().sum()

Scaling

In [None]:
# Columns to scale
columns_to_scale = ['CAR_AGE', 'AGE_OLDEST']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training set and transform both training and testing sets
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])


In [None]:
X_train.head()

In [None]:
# Correlation Matrix with Heatmap
corr_matrix = df.drop('TIME',axis=1).corr()
plt.figure(figsize=(20, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()


In [None]:
plt.hist(df['COST']); plt.title('Cost of Insurance Policy')
plt.xlabel('Cost')
plt.ylabel('Frequency')
plt.show()

###PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import MinMaxScaler

# Columns to scale
columns_to_scale = ['CAR_AGE', 'AGE_YOUNGEST', 'AGE_OLDEST']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# NOTE: the scaler is fit on the whole feature matrix X (there is no train/test
# split at this point), and the scaled values overwrite the columns of X in
# place, so every later cell that reads X sees the scaled columns.
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

# Apply PCA
# NOTE: PCA is likewise fit on all rows of X; the train/test split of the
# components is done afterwards, in the next cell.
n_components = 5  # Number of principal components to keep
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Print the explained variance ratio of each principal component
print(f"Explained variance ratio of each principal component: {pca.explained_variance_ratio_}")

# Create a DataFrame with the principal components (columns PC1..PCn, fresh 0..n-1 index)
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(n_components)])


In [None]:
# Split the PCA-transformed data into training and testing sets
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca_df, y, test_size=0.2, random_state=42)

# Models to evaluate. They are defined here because this is the first cell that
# uses `models`; previously the dictionary only existed in a later cell, so a
# fresh top-to-bottom run failed with NameError. The stochastic models are
# seeded so the reported scores are reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Initialize results dictionary
results = {}

# Train and evaluate each model using the PCA components
for name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train_pca, y_train_pca)

    # Make predictions on the test data
    y_pred = model.predict(X_test_pca)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test_pca, y_pred)
    mse = mean_squared_error(y_test_pca, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_pca, y_pred)

    # Store the results
    results[name] = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2
    }

# Convert results to a DataFrame
results_df = pd.DataFrame(results).T  # Transpose for better formatting

# Print the results
print(results_df)

# Save results to a CSV file
results_df.to_csv('model_evaluation_results_pca.csv', index=True)


In [None]:
# Plot comparison of R2 scores
plt.figure(figsize=(12, 6))
model_names = list(results.keys())
r2_scores = [results[name]['R2'] for name in model_names]
sns.barplot(x=model_names, y=r2_scores)
plt.title('Comparison of R-squared Scores')
plt.ylabel('R-squared Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


###K-best, 13 features

In [None]:
from sklearn.feature_selection import SelectKBest
# Univariate Selection
select_kbest = SelectKBest(score_func=f_regression, k=13)
fit = select_kbest.fit(X, y)
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
print("Univariate Selection:")
print(feature_scores.nlargest(13, 'Score'))



In [None]:
# Univariate Selection
select_kbest = SelectKBest(score_func=f_regression, k=13)
fit = select_kbest.fit(X, y)
selected_features = X.columns[fit.get_support()]
print(f"Selected features from Univariate Selection: {selected_features}")

# Subset the data with selected features
X_selected = X[selected_features]

In [None]:
# Subset the data with selected features
X_selected = X[selected_features]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [None]:


# # Feature Importance from Random Forest
# model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# model_rf.fit(X, y)
# importances = model_rf.feature_importances_
# feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
# print("\nFeature Importance from Random Forest:")
# print(feature_importances.sort_values(by='Importance', ascending=False))

# # Lasso Regularization (L1 Regularization)
# model_lasso = Lasso(alpha=0.01)
# model_lasso.fit(X, y)
# lasso_coef = pd.DataFrame({'Feature': X.columns, 'Coefficient': model_lasso.coef_})
# print("\nLasso Regularization:")
# print(lasso_coef[lasso_coef['Coefficient'] != 0])


In [None]:
# with pd.option_context('display.max_columns', None):
#   print(X_train.head(25))

In [None]:
# Models to evaluate.
# The tree-based models are randomised; a fixed random_state makes the
# comparison reproducible across kernel restarts.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}



In [None]:


# Per-model metrics, keyed by model name
results = {}

# Fit each candidate on the training split and score it on the test split
for name, model in models.items():
    # scikit-learn estimators return themselves from fit()
    fitted = model.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)

    # Error metrics on the held-out data; RMSE is derived from MSE
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    rmse = np.sqrt(mse)

    results[name] = dict(MAE=mae, MSE=mse, RMSE=rmse, R2=r2)



In [None]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results).T  # Transpose for better formatting

# Save results to a CSV file
results_df.to_csv('model_evaluation_results_kbest_final.csv', index=True)

# Display the results as a table
results_df

In [None]:
# Plot comparison of R2 scores
plt.figure(figsize=(12, 6))
model_names = list(results.keys())
r2_scores = [results[name]['R2'] for name in model_names]
sns.barplot(x=model_names, y=r2_scores)
plt.title('Comparison of R-squared Scores')
plt.ylabel('R-squared Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Fit the best model (Random Forest in this case) on the entire training data.
# Seeded like the Random Forest used in the manual feature-selection section,
# so the train/test metrics below are reproducible.
best_model = RandomForestRegressor(random_state=42)
best_model.fit(X_train, y_train)

# Predictions and evaluation on the training set
train_predictions = best_model.predict(X_train)
train_mae = mean_absolute_error(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, train_predictions)

print(f"Train Set Evaluation for Random Forest:")
print(f"  MAE: {train_mae}")
print(f"  MSE: {train_mse}")
print(f"  RMSE: {train_rmse}")
print(f"  R2: {train_r2}")

# Predictions and evaluation on the test set
test_predictions = best_model.predict(X_test)
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, test_predictions)

print(f"\nTest Set Evaluation for Random Forest:")
print(f"  MAE: {test_mae}")
print(f"  MSE: {test_mse}")
print(f"  RMSE: {test_rmse}")
print(f"  R2: {test_r2}")

In [None]:
# Create DataFrame with features, actual and predicted values for test set.
# Start from a copy of X_test so every selected feature is kept; restricting
# the columns to `columns_to_scale` (a leftover list from the PCA section)
# dropped the other features and produced all-NaN columns for any listed
# column that was not selected.
test_results = X_test.copy()
test_results['Actual'] = y_test.values
test_results['Predicted'] = test_predictions

print("\nSample of Test Results:")
print(test_results.head())

In [None]:
# Compare metrics between train and test sets
metrics_comparison = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
    'Train': [train_mae, train_mse, train_rmse, train_r2],
    'Test': [test_mae, test_mse, test_rmse, test_r2]
})

# Save results to a CSV file
metrics_comparison.to_csv('best_model_metrics.csv', index=True)
print("\nMetrics Comparison:")
metrics_comparison

In [None]:

# Visualize the metrics comparison
plt.figure(figsize=(10, 6))
sns.barplot(x='Metric', y='value', hue='variable', data=pd.melt(metrics_comparison, ['Metric']))
plt.title('Metrics Comparison between Train and Test Sets')
plt.ylabel('Value')
plt.show()

### Manual FS, 17 features

In [None]:
selected_features = ['RECORD_TYPE','STATE','GROUP_SIZE','HOMEOWNER','CAR_AGE','CAR_VALUE','RISK_FACTOR',
                     'MARRIED_COUPLE','AGE_OLDEST','C_PREVIOUS', 'A' ,'B','C','D','E','F' ,'G']

In [None]:
len(selected_features)

In [None]:
    # Define target variable
target = 'COST'
# Subset the data with selected features
X_selected = df[selected_features]
y = df[target]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)



In [None]:
X_selected.duplicated().sum()

In [None]:
# Rebind instead of drop_duplicates(inplace=True): X_selected is a selection
# taken from df, so an in-place drop triggers SettingWithCopyWarning.
# NOTE: X_train / X_test were split before this point, so this de-duplication
# only affects the preview of X_selected, not the data the model is trained on.
X_selected = X_selected.drop_duplicates()

In [None]:
X_selected.head()

In [None]:
# Columns to scale
columns_to_scale = ['CAR_AGE', 'AGE_OLDEST']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training set and transform both training and testing sets
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])


In [None]:
# Fit the best model (Random Forest in this case) on the entire training data
best_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)
best_model.fit(X_train, y_train)

# Predictions and evaluation on the training set
train_predictions = best_model.predict(X_train)
train_mae = mean_absolute_error(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, train_predictions)

print(f"Train Set Evaluation for Random Forest:")
print(f"  MAE: {train_mae}")
print(f"  MSE: {train_mse}")
print(f"  RMSE: {train_rmse}")
print(f"  R2: {train_r2}")

# Predictions and evaluation on the test set
test_predictions = best_model.predict(X_test)
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, test_predictions)

print(f"\nTest Set Evaluation for Random Forest:")
print(f"  MAE: {test_mae}")
print(f"  MSE: {test_mse}")
print(f"  RMSE: {test_rmse}")
print(f"  R2: {test_r2}")

In [None]:
# Create DataFrame with features, actual and predicted values for test set
test_results = pd.DataFrame(X_test, columns=selected_features)
test_results['Actual'] = y_test.values
test_results['Predicted'] = test_predictions

print("\nSample of Test Results:")
print(test_results.head())

In [None]:
# Compare metrics between train and test sets
metrics_comparison = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
    'Train': [train_mae, train_mse, train_rmse, train_r2],
    'Test': [test_mae, test_mse, test_rmse, test_r2]
})

# # Save results to a CSV file
# metrics_comparison.to_csv('best_model_metrics_trial02.csv', index=True)
print("\nMetrics Comparison:")
metrics_comparison

In [None]:

# Visualize the metrics comparison
plt.figure(figsize=(10, 6))
sns.barplot(x='Metric', y='value', hue='variable', data=pd.melt(metrics_comparison, ['Metric']))
plt.title('Metrics Comparison between Train and Test Sets')
plt.ylabel('Value')
plt.show()

###RFR - hyperparameter tuning

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the model (seeded so the search result is reproducible)
rf = RandomForestRegressor(random_state=42)

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    # 1.0 (= use all features) is what 'auto' meant for RandomForestRegressor;
    # the 'auto' string was deprecated in scikit-learn 1.1 and removed in 1.3,
    # where every candidate using it fails to fit.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [None]:
# Create the randomized search cross-validation object
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit on the training data
rf_random.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters found:")
print(rf_random.best_params_)
print("\nBest Score found:")
print(rf_random.best_score_)


In [None]:
# 5. Evaluate on training set
train_predictions = rf_random.best_estimator_.predict(X_train)
train_mae = mean_absolute_error(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, train_predictions)

print(f"Train Set Evaluation for Random Forest:")
print(f"  MAE: {train_mae}")
print(f"  MSE: {train_mse}")
print(f"  RMSE: {train_rmse}")
print(f"  R2: {train_r2}")

In [None]:
# Evaluate on validation set
val_predictions = rf_random.best_estimator_.predict(X_val)
val_mae = mean_absolute_error(y_val, val_predictions)
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)
val_r2 = r2_score(y_val, val_predictions)

print(f"Validation Set Evaluation for Random Forest:")
print(f"  MAE: {val_mae}")
print(f"  MSE: {val_mse}")
print(f"  RMSE: {val_rmse}")
print(f"  R2: {val_r2}")


In [None]:
# Predictions and evaluation on the test set
test_predictions = rf_random.best_estimator_.predict(X_test)
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, test_predictions)

print(f"\nTest Set Evaluation for Random Forest (after tuning):")
print(f"  MAE: {test_mae}")
print(f"  MSE: {test_mse}")
print(f"  RMSE: {test_rmse}")
print(f"  R2: {test_r2}")


In [None]:
import pandas as pd

# Assuming you have metrics variables defined (e.g., val_mae, val_mse, etc.)

# Create a DataFrame with metrics results
metrics_results = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
    'Training': [train_mae, train_mse, train_rmse, train_r2],
    'Validation': [val_mae, val_mse, val_rmse, val_r2],
    'Test': [test_mae, test_mse, test_rmse, test_r2]
})

# Save metrics results to CSV
metrics_results.to_csv('metrics_results_hyperparam.csv', index=False)

print("Metrics results saved successfully.")


###K-best , 18 features

In [None]:
# Univariate Selection
select_kbest = SelectKBest(score_func=f_regression, k=18)
fit = select_kbest.fit(X, y)
selected_features = X.columns[fit.get_support()]
print(f"Selected features from Univariate Selection: {selected_features}")

# Subset the data with selected features
X_selected = X[selected_features]

In [None]:
# Subset the data with selected features
X_selected = X[selected_features]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [None]:


# # Feature Importance from Random Forest
# model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# model_rf.fit(X, y)
# importances = model_rf.feature_importances_
# feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
# print("\nFeature Importance from Random Forest:")
# print(feature_importances.sort_values(by='Importance', ascending=False))

# # Lasso Regularization (L1 Regularization)
# model_lasso = Lasso(alpha=0.01)
# model_lasso.fit(X, y)
# lasso_coef = pd.DataFrame({'Feature': X.columns, 'Coefficient': model_lasso.coef_})
# print("\nLasso Regularization:")
# print(lasso_coef[lasso_coef['Coefficient'] != 0])


In [None]:
# with pd.option_context('display.max_columns', None):
#   print(X_train.head(25))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Models to evaluate.
# The tree-based models are randomised; a fixed random_state makes the
# comparison reproducible across kernel restarts.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}



In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize results dictionary
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Store the results
    results[name] = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2
    }



In [None]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results).T  # Transpose for better formatting

# Save results to a CSV file
results_df.to_csv('model_evaluation_results_kbest_18.csv', index=True)

# Display the results as a table
results_df

In [None]:
# Plot comparison of R2 scores
plt.figure(figsize=(12, 6))
model_names = list(results.keys())
r2_scores = [results[name]['R2'] for name in model_names]
sns.barplot(x=model_names, y=r2_scores)
plt.title('Comparison of R-squared Scores')
plt.ylabel('R-squared Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# # Fit the best model (Random Forest in this case) on the entire training data
# best_model = RandomForestRegressor()
# best_model.fit(X_train, y_train)

# # Predictions and evaluation on the test set
# test_predictions = best_model.predict(X_test)
# test_mae = mean_absolute_error(y_test, test_predictions)
# test_mse = mean_squared_error(y_test, test_predictions)
# test_rmse = np.sqrt(test_mse)
# test_r2 = r2_score(y_test, test_predictions)

# print(f"Test Set Evaluation for Random Forest:")
# print(f"  MAE: {test_mae}")
# print(f"  MSE: {test_mse}")
# print(f"  RMSE: {test_rmse}")
# print(f"  R2: {test_r2}")

# # Create DataFrame with features, actual and predicted values for test set
# test_results = pd.DataFrame(X_test, columns=columns_to_scale)
# test_results['Actual'] = y_test.values
# test_results['Predicted'] = test_predictions

# print("\nSample of Test Results:")
# print(test_results.head())


###RFE, 15 features

In [None]:
# Recursive Feature Elimination (RFE)
model_lr = LinearRegression()
rfe = RFE(model_lr, n_features_to_select=15)
fit = rfe.fit(X, y)
selected_features = pd.DataFrame({'Feature': X.columns, 'Selected': fit.support_, 'Ranking': fit.ranking_})
print("\nRecursive Feature Elimination:")
print(selected_features[selected_features['Selected'] == True])

In [None]:
# Extract the names of the selected features
selected_feature_names = selected_features[selected_features['Selected'] == True]['Feature'].tolist()


In [None]:
# Subset the data with selected features
X_selected = X[selected_feature_names]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [None]:


# # Feature Importance from Random Forest
# model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# model_rf.fit(X, y)
# importances = model_rf.feature_importances_
# feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
# print("\nFeature Importance from Random Forest:")
# print(feature_importances.sort_values(by='Importance', ascending=False))

# # Lasso Regularization (L1 Regularization)
# model_lasso = Lasso(alpha=0.01)
# model_lasso.fit(X, y)
# lasso_coef = pd.DataFrame({'Feature': X.columns, 'Coefficient': model_lasso.coef_})
# print("\nLasso Regularization:")
# print(lasso_coef[lasso_coef['Coefficient'] != 0])


In [None]:
# with pd.option_context('display.max_columns', None):
#   print(X_train.head(25))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Models to evaluate.
# The tree-based estimators are randomised; a fixed seed (42, as used for the
# train/test split) keeps the comparison table reproducible across re-runs.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}



In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Metrics per model, keyed by the model's display name
results = {}

for name, model in models.items():
    # Fit on the training split, then predict the held-out split
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Error metrics on the test split (RMSE is derived from MSE)
    mae, mse, r2 = (
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )
    rmse = np.sqrt(mse)

    results[name] = dict(MAE=mae, MSE=mse, RMSE=rmse, R2=r2)



In [None]:
# One row per model, one column per metric
results_df_rfe = pd.DataFrame(results).transpose()

# Persist the comparison table next to the notebook
results_df_rfe.to_csv('model_evaluation_results_rfe.csv', index=True)

# Last expression: render the table
results_df_rfe

In [None]:
# Bar chart of the test-set R2 of every evaluated model
model_names = list(results)
r2_scores = [results[model_name]['R2'] for model_name in model_names]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=model_names, y=r2_scores, ax=ax)
ax.set_title('Comparison of R-squared Scores')
ax.set_ylabel('R-squared Score')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Fit the best model (Random Forest in this case) on the entire training data.
# The seed makes the reported metrics and the model saved later reproducible.
best_model = RandomForestRegressor(random_state=42)
best_model.fit(X_train, y_train)

# Predictions and evaluation on the test set
test_predictions = best_model.predict(X_test)
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, test_predictions)

print("Test Set Evaluation for Random Forest:")
print(f"  MAE: {test_mae}")
print(f"  MSE: {test_mse}")
print(f"  RMSE: {test_rmse}")
print(f"  R2: {test_r2}")

# Features plus actual and predicted values for the test set.
# X_test already carries exactly the columns the model was trained on, so it is
# copied as-is; re-indexing it by `columns_to_scale` would drop selected
# features missing from that list and add all-NaN columns for the others.
test_results = X_test.copy()
test_results['Actual'] = y_test.values
test_results['Predicted'] = test_predictions

print("\nSample of Test Results:")
print(test_results.head())


###RFE, 18 features

In [None]:
# Recursive Feature Elimination (RFE) with a linear-regression estimator.
# The selector is fitted on the training rows only: fitting it on all of X / y
# would let the held-out test rows influence which features are kept.
# This split uses the same test_size / random_state as the later
# train_test_split on the selected columns, so both calls pick the same rows.
X_rfe_train, _, y_rfe_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

model_lr = LinearRegression()
rfe = RFE(model_lr, n_features_to_select=18)
fit = rfe.fit(X_rfe_train, y_rfe_train)

# One row per candidate feature: whether it was kept and its elimination rank
selected_features = pd.DataFrame({'Feature': X.columns, 'Selected': fit.support_, 'Ranking': fit.ranking_})
print("\nRecursive Feature Elimination:")
print(selected_features[selected_features['Selected']])

In [None]:
# Names of the columns that RFE marked as kept
selected_feature_names = selected_features.loc[selected_features['Selected'], 'Feature'].tolist()


In [None]:
# Restrict the feature matrix to the RFE-selected columns
X_selected = X.loc[:, selected_feature_names]

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:


# # Feature Importance from Random Forest
# model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# model_rf.fit(X, y)
# importances = model_rf.feature_importances_
# feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
# print("\nFeature Importance from Random Forest:")
# print(feature_importances.sort_values(by='Importance', ascending=False))

# # Lasso Regularization (L1 Regularization)
# model_lasso = Lasso(alpha=0.01)
# model_lasso.fit(X, y)
# lasso_coef = pd.DataFrame({'Feature': X.columns, 'Coefficient': model_lasso.coef_})
# print("\nLasso Regularization:")
# print(lasso_coef[lasso_coef['Coefficient'] != 0])


In [None]:
# with pd.option_context('display.max_columns', None):
#   print(X_train.head(25))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Models to evaluate.
# The tree-based estimators are randomised; a fixed seed (42, as used for the
# train/test split) keeps the comparison table reproducible across re-runs.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}



In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Metrics per model, keyed by the model's display name
results = {}

for name, model in models.items():
    # Fit on the training split, then predict the held-out split
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Error metrics on the test split (RMSE is derived from MSE)
    mae, mse, r2 = (
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )
    rmse = np.sqrt(mse)

    results[name] = dict(MAE=mae, MSE=mse, RMSE=rmse, R2=r2)



In [None]:
# One row per model, one column per metric
results_df_rfe = pd.DataFrame(results).transpose()

# Persist the 18-feature comparison table next to the notebook
results_df_rfe.to_csv('model_evaluation_results_rfe_18.csv', index=True)

# Last expression: render the table
results_df_rfe

In [None]:
# Bar chart of the test-set R2 of every evaluated model
model_names = list(results)
r2_scores = [results[model_name]['R2'] for model_name in model_names]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=model_names, y=r2_scores, ax=ax)
ax.set_title('Comparison of R-squared Scores')
ax.set_ylabel('R-squared Score')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
# # Fit the best model (Random Forest in this case) on the entire training data
# best_model = RandomForestRegressor()
# best_model.fit(X_train, y_train)

# # Predictions and evaluation on the test set
# test_predictions = best_model.predict(X_test)
# test_mae = mean_absolute_error(y_test, test_predictions)
# test_mse = mean_squared_error(y_test, test_predictions)
# test_rmse = np.sqrt(test_mse)
# test_r2 = r2_score(y_test, test_predictions)

# print(f"Test Set Evaluation for Random Forest:")
# print(f"  MAE: {test_mae}")
# print(f"  MSE: {test_mse}")
# print(f"  RMSE: {test_rmse}")
# print(f"  R2: {test_r2}")

# # Create DataFrame with features, actual and predicted values for test set
# test_results = pd.DataFrame(X_test, columns=columns_to_scale)
# test_results['Actual'] = y_test.values
# test_results['Predicted'] = test_predictions

# print("\nSample of Test Results:")
# print(test_results.head())


###Extra

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Baseline 1: ordinary least squares, fitted on the current training split
lr_model = LinearRegression().fit(X_train, y_train)

# Baseline 2: seeded 100-tree random forest, fitted on the same split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [None]:
# import matplotlib.pyplot as plt
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# import numpy as np

# # Function to evaluate model and return metrics
# def evaluate_model(model, X_train, X_test, y_train, y_test):
#     # Predictions
#     train_predictions = model.predict(X_train)
#     test_predictions = model.predict(X_test)

#     # MAE, MSE, RMSE, R2 for training set
#     train_mae = mean_absolute_error(y_train, train_predictions)
#     train_mse = mean_squared_error(y_train, train_predictions)
#     train_rmse = np.sqrt(train_mse)
#     train_r2 = r2_score(y_train, train_predictions)

#     # MAE, MSE, RMSE, R2 for test set
#     test_mae = mean_absolute_error(y_test, test_predictions)
#     test_mse = mean_squared_error(y_test, test_predictions)
#     test_rmse = np.sqrt(test_mse)
#     test_r2 = r2_score(y_test, test_predictions)

#     return train_mae, train_mse, train_rmse, train_r2, test_mae, test_mse, test_rmse, test_r2


In [None]:
# # Evaluate Linear Regression model
# lr_train_mae, lr_train_mse, lr_train_rmse, lr_train_r2, lr_test_mae, lr_test_mse, lr_test_rmse, lr_test_r2 = \
#     evaluate_model(lr_model, X_train, X_test, y_train, y_test)

# # Evaluate Random Forest model
# rf_train_mae, rf_train_mse, rf_train_rmse, rf_train_r2, rf_test_mae, rf_test_mse, rf_test_rmse, rf_test_r2 = \
#     evaluate_model(rf_model, X_train, X_test, y_train, y_test)

# # Print metrics for both models
# print("Linear Regression Metrics:")
# print(f"  Train MAE: {lr_train_mae}, MSE: {lr_train_mse}, RMSE: {lr_train_rmse}, R2: {lr_train_r2}")
# print(f"  Test MAE: {lr_test_mae}, MSE: {lr_test_mse}, RMSE: {lr_test_rmse}, R2: {lr_test_r2}")

# print("\nRandom Forest Metrics:")
# print(f"  Train MAE: {rf_train_mae}, MSE: {rf_train_mse}, RMSE: {rf_train_rmse}, R2: {rf_train_r2}")
# print(f"  Test MAE: {rf_test_mae}, MSE: {rf_test_mse}, RMSE: {rf_test_rmse}, R2: {rf_test_r2}")


In [None]:
# Show the feature columns present in the current test split
X_test.columns.tolist()

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Function to evaluate model and return metrics and predictions
def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names):
    """Score an already-fitted regressor on the train and test splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature matrices (DataFrame or array-like).
    y_train, y_test : target values, one per row of ``X_train`` / ``X_test``.
    feature_names : column labels used for the returned result frames.

    Returns
    -------
    tuple
        ``(train_mae, train_mse, train_rmse, train_r2,
        test_mae, test_mse, test_rmse, test_r2,
        train_results, test_results)`` where the two result frames hold the
        features plus ``'Actual'`` and ``'Predicted'`` columns.
    """

    def _metrics(y_true, y_pred):
        # MAE, MSE, RMSE (derived from MSE) and R2 for one split
        mse = mean_squared_error(y_true, y_pred)
        return mean_absolute_error(y_true, y_pred), mse, np.sqrt(mse), r2_score(y_true, y_pred)

    def _results_frame(X_part, y_true, y_pred):
        # Features, actual and predicted values for one split
        frame = pd.DataFrame(X_part, columns=feature_names)
        # Assign by position, not by index label: when X is an array the frame
        # gets a fresh RangeIndex, and a Series target from a shuffled split
        # would otherwise be misaligned or turned into NaN.
        frame['Actual'] = np.asarray(y_true).ravel()
        frame['Predicted'] = y_pred
        return frame

    # Predictions
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    train_mae, train_mse, train_rmse, train_r2 = _metrics(y_train, train_predictions)
    test_mae, test_mse, test_rmse, test_r2 = _metrics(y_test, test_predictions)

    train_results = _results_frame(X_train, y_train, train_predictions)
    test_results = _results_frame(X_test, y_test, test_predictions)

    return (train_mae, train_mse, train_rmse, train_r2, test_mae, test_mse, test_rmse, test_r2,
            train_results, test_results)

# Column labels of the current training split
feature_names = list(X_train.columns)
#  e.g. ['HOMEOWNER', 'GROUP_SIZE', 'CAR_AGE', 'CAR_VALUE', 'AGE_OLDEST', 'AGE_YOUNGEST', 'MARRIED_COUPLE']

# Linear Regression: eight scalar metrics followed by the two result frames
lr_evaluation = evaluate_model(lr_model, X_train, X_test, y_train, y_test, feature_names)
(lr_train_mae, lr_train_mse, lr_train_rmse, lr_train_r2,
 lr_test_mae, lr_test_mse, lr_test_rmse, lr_test_r2) = lr_evaluation[:8]
lr_train_results, lr_test_results = lr_evaluation[8:]

# Random Forest: same layout
rf_evaluation = evaluate_model(rf_model, X_train, X_test, y_train, y_test, feature_names)
(rf_train_mae, rf_train_mse, rf_train_rmse, rf_train_r2,
 rf_test_mae, rf_test_mse, rf_test_rmse, rf_test_r2) = rf_evaluation[:8]
rf_train_results, rf_test_results = rf_evaluation[8:]


In [None]:
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# import matplotlib.pyplot as plt
# import numpy as np

# def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names):
#     # Fit the model
#     model.fit(X_train, y_train)

#     # Predictions
#     y_train_pred = model.predict(X_train)
#     y_test_pred = model.predict(X_test)

#     # Training metrics
#     train_mae = mean_absolute_error(y_train, y_train_pred)
#     train_mse = mean_squared_error(y_train, y_train_pred)
#     train_rmse = np.sqrt(train_mse)
#     train_r2 = r2_score(y_train, y_train_pred)

#     # Testing metrics
#     test_mae = mean_absolute_error(y_test, y_test_pred)
#     test_mse = mean_squared_error(y_test, y_test_pred)
#     test_rmse = np.sqrt(test_mse)
#     test_r2 = r2_score(y_test, y_test_pred)

#     # Collect results
#     train_results = pd.DataFrame({
#         'Actual': y_train,
#         'Predicted': y_train_pred,
#         'Feature Importance': feature_names
#     })

#     test_results = pd.DataFrame({
#         'Actual': y_test,
#         'Predicted': y_test_pred,
#         'Feature Importance': feature_names
#     })

#     return (train_mae, train_mse, train_rmse, train_r2, test_mae, test_mse, test_rmse, test_r2,
#             train_results, test_results)


In [None]:
# # Define models to evaluate
# models = {
#     'Linear Regression': LinearRegression(),
#     'Ridge Regression': Ridge(),
#     'Lasso Regression': Lasso(),
#     'Decision Tree': DecisionTreeRegressor(),
#     'Random Forest': RandomForestRegressor(),
#     'Gradient Boosting': GradientBoostingRegressor()
# }

# # Initialize results storage
# model_metrics = {}
# r2_scores = []

# # Feature names
# feature_names = X_train.columns.tolist()

# # Evaluate each model
# for name, model in models.items():
#     (train_mae, train_mse, train_rmse, train_r2, test_mae, test_mse, test_rmse, test_r2,
#      train_results, test_results) = evaluate_model(model, X_train, X_test, y_train, y_test, feature_names)

#     # Store metrics
#     model_metrics[name] = {
#         'Train MAE': train_mae,
#         'Train MSE': train_mse,
#         'Train RMSE': train_rmse,
#         'Train R2': train_r2,
#         'Test MAE': test_mae,
#         'Test MSE': test_mse,
#         'Test RMSE': test_rmse,
#         'Test R2': test_r2
#     }

#     # Collect R2 scores for plotting
#     r2_scores.append((name, 'Train', train_r2))
#     r2_scores.append((name, 'Test', test_r2))

# # Print the results
# for name, metrics in model_metrics.items():
#     print(f"{name} Metrics:")
#     print(f"  Train MAE: {metrics['Train MAE']}, MSE: {metrics['Train MSE']}, RMSE: {metrics['Train RMSE']}, R2: {metrics['Train R2']}")
#     print(f"  Test MAE: {metrics['Test MAE']}, MSE: {metrics['Test MSE']}, RMSE: {metrics['Test RMSE']}, R2: {metrics['Test R2']}")
#     print()



In [None]:
# # Plot comparison of R2 scores
# labels = [f"{name} {kind}" for name, kind, _ in r2_scores]
# r2_values = [score for _, _, score in r2_scores]

# plt.figure(figsize=(14, 8))
# plt.bar(labels, r2_values, color=['blue', 'lightblue', 'green', 'lightgreen', 'red', 'pink', 'orange', 'yellow', 'purple', 'violet', 'brown', 'lightbrown'])
# plt.title('Comparison of R-squared Scores')
# plt.ylim(min(r2_values) - 0.1, max(r2_values) + 0.1)
# plt.ylabel('R-squared Score')
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# plt.show()


In [None]:
# Report train and test metrics for both baseline models
for header, train_vals, test_vals in (
    ("Linear Regression Metrics:",
     (lr_train_mae, lr_train_mse, lr_train_rmse, lr_train_r2),
     (lr_test_mae, lr_test_mse, lr_test_rmse, lr_test_r2)),
    ("\nRandom Forest Metrics:",
     (rf_train_mae, rf_train_mse, rf_train_rmse, rf_train_r2),
     (rf_test_mae, rf_test_mse, rf_test_rmse, rf_test_r2)),
):
    print(header)
    print(f"  Train MAE: {train_vals[0]}, MSE: {train_vals[1]}, RMSE: {train_vals[2]}, R2: {train_vals[3]}")
    print(f"  Test MAE: {test_vals[0]}, MSE: {test_vals[1]}, RMSE: {test_vals[2]}, R2: {test_vals[3]}")

In [None]:
# First rows of the Linear Regression result frames (train, then test)
for title, frame in (
    ("\nSample of Linear Regression Train Results:", lr_train_results),
    ("\nSample of Linear Regression Test Results:", lr_test_results),
):
    print(title)
    print(frame.head())

In [None]:
# Print the first 25 Random Forest training rows with no column truncation
with pd.option_context('display.max_columns', None):
  print(rf_train_results.head(25))

In [None]:
# Render the first 20 rows of the Random Forest training results
print("\nSample of Random Forest Train Results:")
rf_train_results.head(20)


In [None]:

# Render the first 20 rows of the Random Forest test results
print("\nSample of Random Forest Test Results:")
rf_test_results.head(20)

In [None]:
# Train vs. test R2 for the two baseline models, one bar each
labels = ['Linear Regression Train', 'Linear Regression Test', 'Random Forest Train', 'Random Forest Test']
r2_scores = [lr_train_r2, lr_test_r2, rf_train_r2, rf_test_r2]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, r2_scores, color=['blue', 'lightblue', 'green', 'lightgreen'])
ax.set_title('Comparison of R-squared Scores')
# Pad the y-range by 0.1 on both sides of the observed scores
ax.set_ylim(min(r2_scores) - 0.1, max(r2_scores) + 0.1)
ax.set_ylabel('R-squared Score')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Uses the lr_* / rf_* metric variables computed by the evaluation cell above.

# One row per baseline model, one column per train/test metric
metrics_data = {
    "Model": ["Linear Regression", "Random Forest"],
    "Train MAE": [lr_train_mae, rf_train_mae],
    "Train MSE": [lr_train_mse, rf_train_mse],
    "Train RMSE": [lr_train_rmse, rf_train_rmse],
    "Train R2": [lr_train_r2, rf_train_r2],
    "Test MAE": [lr_test_mae, rf_test_mae],
    "Test MSE": [lr_test_mse, rf_test_mse],
    "Test RMSE": [lr_test_rmse, rf_test_rmse],
    "Test R2": [lr_test_r2, rf_test_r2]
}

# Convert the dictionary to a DataFrame
metrics_df = pd.DataFrame(metrics_data)

# Save to CSV; the confirmation message is built from the same path so the
# two cannot drift apart (it previously named a different file).
metrics_path = 'baseline_model_metrics.csv'
metrics_df.to_csv(metrics_path, index=False)

print(f"Metrics saved to {metrics_path}")


###DEPLOYMENT

In [None]:
import joblib

# Persist the fitted Random Forest so the deployment cells can reload it.
# NOTE(review): in notebook order `best_model` is fitted only in the
# 15-feature RFE section (the 18-feature refit is commented out) — confirm
# that this is the model intended for deployment.
model_filename = 'rf_bestest_model.pkl'
joblib.dump(best_model, model_filename)
print(f"\nModel saved to {model_filename} successfully.")
# joblib.dump(label_encoders, 'label_encoders.pkl')
# joblib.dump(scaler, 'scaler.pkl')

# print("Model and preprocessing objects have been saved.")


###Deep Learning

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Deep learning regressor: three ReLU hidden layers and a single linear output
def build_model():
    """Return a compiled Keras network sized to the columns of the global X_train.

    Layers are 128 -> 64 -> 32 ReLU units followed by one linear output unit;
    the model is compiled with the Adam optimizer, MSE loss and an MAE metric.
    """
    n_features = X_train.shape[1]

    network = Sequential()
    network.add(Dense(128, activation='relu', input_shape=(n_features,)))
    for units in (64, 32):
        network.add(Dense(units, activation='relu'))
    network.add(Dense(1))  # Output layer for regression

    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

# Instantiate a fresh network
model = build_model()

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train for up to 100 epochs, validating on a 20% split of the training data
fit_options = dict(epochs=100, validation_split=0.2, batch_size=32, callbacks=[early_stopping])
history = model.fit(X_train, y_train, **fit_options)

# Training-set metrics
train_predictions = model.predict(X_train).flatten()
train_mae = mean_absolute_error(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, train_predictions)

print("Train Set Evaluation for Deep Learning Model:")
for metric_name, metric_value in (("MAE", train_mae), ("MSE", train_mse), ("RMSE", train_rmse), ("R2", train_r2)):
    print(f"  {metric_name}: {metric_value}")

# Test-set metrics
test_predictions = model.predict(X_test).flatten()
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, test_predictions)

print("\nTest Set Evaluation for Deep Learning Model:")
for metric_name, metric_value in (("MAE", test_mae), ("MSE", test_mse), ("RMSE", test_rmse), ("R2", test_r2)):
    print(f"  {metric_name}: {metric_value}")

# Write the trained network to disk in HDF5 format
model.save('deep_learning_best_model.h5')
print("\nModel saved successfully.")


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Deep learning regressor: three ReLU hidden layers and a single linear output
def build_model():
    """Return a compiled Keras network sized to the columns of the global X_train.

    Layers are 128 -> 64 -> 32 ReLU units followed by one linear output unit;
    the model is compiled with the Adam optimizer, MSE loss and an MAE metric.
    """
    n_features = X_train.shape[1]

    network = Sequential()
    network.add(Dense(128, activation='relu', input_shape=(n_features,)))
    for units in (64, 32):
        network.add(Dense(units, activation='relu'))
    network.add(Dense(1))  # Output layer for regression

    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

# Build the model
model = build_model()

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Per-epoch R2 scores, filled in by the callback below
train_r2_scores = []
val_r2_scores = []

# Training hyper-parameters
epochs = 100
batch_size = 32
validation_split = 0.2

# Explicit hold-out: the last `validation_split` fraction of the training rows
# (the same rows Keras takes for `validation_split`), so the R2 scores below
# are computed on exactly the data used for fitting and for validation.
X_train_arr = np.asarray(X_train, dtype='float32')
y_train_arr = np.asarray(y_train, dtype='float64')
val_indices = int((1 - validation_split) * len(X_train_arr))
X_fit, y_fit = X_train_arr[:val_indices], y_train_arr[:val_indices]
X_val, y_val = X_train_arr[val_indices:], y_train_arr[val_indices:]


class R2History(tf.keras.callbacks.Callback):
    """Append the train and validation R2 to the score lists after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        fit_predictions = self.model.predict(X_fit, verbose=0).flatten()
        train_r2_scores.append(r2_score(y_fit, fit_predictions))

        val_predictions = self.model.predict(X_val, verbose=0).flatten()
        val_r2_scores.append(r2_score(y_val, val_predictions))


# A single fit() call: calling fit(epochs=1) in a Python loop resets the
# EarlyStopping state on every call (so it can never trigger) and leaves only
# the last epoch in `history`. Here early stopping works as configured and
# `history` has one entry per epoch, matching the R2 lists in length.
history = model.fit(
    X_fit,
    y_fit,
    epochs=epochs,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    callbacks=[R2History(), early_stopping],
    verbose=0,
)

# Training-set metrics for the trained network
train_predictions = model.predict(X_train).flatten()
train_mae = mean_absolute_error(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, train_predictions)

print("Train Set Evaluation for Deep Learning Model:")
for metric_name, metric_value in (("MAE", train_mae), ("MSE", train_mse), ("RMSE", train_rmse), ("R2", train_r2)):
    print(f"  {metric_name}: {metric_value}")

# Test-set metrics
test_predictions = model.predict(X_test).flatten()
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, test_predictions)

print("\nTest Set Evaluation for Deep Learning Model:")
for metric_name, metric_value in (("MAE", test_mae), ("MSE", test_mse), ("RMSE", test_rmse), ("R2", test_r2)):
    print(f"  {metric_name}: {metric_value}")

# Write the trained network to disk in HDF5 format
model.save('deep_learning_best_model.h5')
print("\nModel saved successfully.")

In [None]:
# Plot the training and validation loss and R2 scores
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

# Each panel gets an x-axis built from its own series: the loss history and
# the R2 lists are recorded separately and plotting one against the other's
# epoch range raises a shape-mismatch error when their lengths differ.
epochs_range = range(1, len(loss_values) + 1)
r2_epochs_range = range(1, len(train_r2_scores) + 1)

fig, (loss_ax, r2_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Plot for Loss
loss_ax.plot(epochs_range, loss_values, 'bo-', label='Training Loss')
loss_ax.plot(epochs_range, val_loss_values, 'ro-', label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)

# Plot for R2 Score
r2_ax.plot(r2_epochs_range, train_r2_scores, 'bo-', label='Training R2')
r2_ax.plot(r2_epochs_range, val_r2_scores, 'ro-', label='Validation R2')
r2_ax.set_title('Training and Validation R2 Score')
r2_ax.set_xlabel('Epochs')
r2_ax.set_ylabel('R2 Score')
r2_ax.legend()
r2_ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# `history` is the object returned by the most recent model.fit() call
history_dict = history.history
loss_values, val_loss_values = history_dict['loss'], history_dict['val_loss']

# 1-based epoch numbers for the x-axis
epochs = range(1, len(loss_values) + 1)

# Training vs. validation loss on a single axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs, loss_values, 'bo-', label='Training Loss')
ax.plot(epochs, val_loss_values, 'ro-', label='Validation Loss')
ax.set(title='Training and Validation Loss', xlabel='Epochs', ylabel='Loss')
ax.legend()
ax.grid(True)
plt.show()


###Deployment trial

In [None]:
# Write each fitted LabelEncoder to its own '<feature>_label_encoder.pkl' file
for feature in label_encoders:
    encoder = label_encoders[feature]
    joblib.dump(encoder, f'{feature}_label_encoder.pkl')

In [None]:
import joblib
# Save the preprocessing steps and the model

# One file per fitted LabelEncoder (kept for existing consumers)
for feature, encoder in label_encoders.items():
    joblib.dump(encoder, f'{feature}_label_encoder.pkl')

# The loading cells read the whole encoder mapping from 'label_encoders.pkl'
# and the scaler from 'scaler.pkl', so write those names as well.
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'minmax_scaler.pkl')
joblib.dump(scaler, 'scaler.pkl')

# `model` is a scratch name that is rebound throughout the notebook (loop
# variable, Keras network); the fitted Random Forest lives in `best_model`.
joblib.dump(best_model, 'random_forest_model.pkl')


In [None]:
import joblib
import pandas as pd

# Reload the persisted encoders, scaler and model from disk
label_encoder = joblib.load('label_encoders.pkl')
minmax_scaler = joblib.load('scaler.pkl')
model = joblib.load('rf_bestest_model.pkl')

# Columns passed to the model, in order.
# NOTE(review): confirm this list matches the columns the saved model was trained on.
selected_features = [
    'RECORD_TYPE', 'STATE', 'GROUP_SIZE', 'HOMEOWNER', 'CAR_AGE', 'CAR_VALUE',
    'RISK_FACTOR', 'MARRIED_COUPLE', 'AGE_OLDEST', 'C_PREVIOUS', 'DURATION_PREVIOUS',
    'A', 'B', 'C', 'E', 'G',
]
# Columns that are label-encoded / scaled before prediction
categorical_columns = ['STATE', 'CAR_VALUE']
numerical_columns = ['CAR_AGE', 'AGE_OLDEST']

def preprocess_input(data):
    """Turn one raw input record into the single-row feature frame the model expects.

    ``data`` is a mapping of column name to scalar value. Categorical columns
    that have an entry in ``label_encoder`` are label-encoded (values are cast
    to ``str`` first), the numerical columns are passed through
    ``minmax_scaler``, and the result is restricted to ``selected_features``.
    """
    frame = pd.DataFrame(data, index=[0])

    # Encode only the categorical columns for which an encoder was loaded
    encodable = [col for col in categorical_columns if col in label_encoder]
    for col in encodable:
        frame[col] = label_encoder[col].transform(frame[col].astype(str))

    # Scale the numerical columns with the fitted scaler
    frame[numerical_columns] = minmax_scaler.transform(frame[numerical_columns])

    # Keep the model's columns, in the model's order
    return frame[selected_features]

def predict(data):
    """Preprocess one raw record and return the model's prediction rounded to 2 decimals."""
    features = preprocess_input(data)
    first_prediction = model.predict(features)[0]
    return round(first_prediction, 2)

# Example record covering every column listed in selected_features
input_data = dict(
    RECORD_TYPE=1,
    STATE='NY',
    GROUP_SIZE=2,
    HOMEOWNER=1,
    CAR_AGE=5,
    CAR_VALUE='b',
    RISK_FACTOR=3,
    MARRIED_COUPLE=0,
    AGE_OLDEST=45,
    C_PREVIOUS=1,
    DURATION_PREVIOUS=2.5,
    A=0,
    B=1,
    C=1,
    E=0,
    G=1,
)
print(predict(input_data))


In [None]:
# Column/value pairs used to look up matching rows in df
input_data = dict(
    RECORD_TYPE=1,
    GROUP_SIZE=2,
    HOMEOWNER=1,
    CAR_AGE=5,
    RISK_FACTOR=3,
    MARRIED_COUPLE=0,
    AGE_OLDEST=45,
    C_PREVIOUS=1,
)

In [None]:
# Boolean mask selecting the rows of df that equal input_data on every listed column.
# The mask is created on df's own index: a default RangeIndex only lines up
# with df when df's index is exactly 0..n-1, and otherwise the &= below
# aligns on mismatched labels and the final selection fails.
mask = pd.Series(True, index=df.index)
for key, value in input_data.items():
    mask &= (df[key] == value)

# Extract rows that match the input data
matching_rows = df[mask]

matching_rows

In [None]:
# Distinct values currently present in df's CAR_VALUE column
df['CAR_VALUE'].unique()

In [None]:
# AGE_OLDEST values of the rows that matched input_data
matching_rows['AGE_OLDEST']

In [None]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd

# Flask application object for the prediction service
app = Flask(__name__)

# Reload the persisted encoders, scaler and model from disk
label_encoder = joblib.load('label_encoders.pkl')
minmax_scaler = joblib.load('scaler.pkl')
model = joblib.load('rf_bestest_model.pkl')

# Columns passed to the model, in order.
# NOTE(review): confirm this list matches the columns the saved model was trained on.
selected_features = [
    'RECORD_TYPE', 'STATE', 'GROUP_SIZE', 'HOMEOWNER', 'CAR_AGE', 'CAR_VALUE',
    'RISK_FACTOR', 'MARRIED_COUPLE', 'AGE_OLDEST', 'C_PREVIOUS', 'DURATION_PREVIOUS',
    'A', 'B', 'C', 'E', 'G',
]
# Columns that are label-encoded / scaled before prediction
categorical_columns = ['STATE', 'CAR_VALUE']
numerical_columns = ['CAR_AGE', 'AGE_OLDEST']

def preprocess_input(data):
    """Turn one raw JSON record into the single-row feature frame the model expects.

    ``data`` is a mapping of column name to scalar value. Categorical columns
    are label-encoded, the numerical columns are passed through
    ``minmax_scaler``, and the result is restricted to ``selected_features``.
    """
    # Convert input data to DataFrame
    df = pd.DataFrame(data, index=[0])

    # Apply the same preprocessing as training data.
    # `label_encoder` is the per-column mapping loaded from
    # 'label_encoders.pkl', so the encoder has to be looked up by column name;
    # calling .transform on the mapping itself raises AttributeError. Values
    # are cast to str first, as in the notebook's other preprocess_input.
    for col in categorical_columns:
        if col in label_encoder:
            df[col] = label_encoder[col].transform(df[col].astype(str))

    df[numerical_columns] = minmax_scaler.transform(df[numerical_columns])

    # Select the same features as training data
    df_selected = df[selected_features]

    return df_selected

@app.route('/predict', methods=['POST'])
def predict():
    """POST /predict: score one JSON record and return ``{'prediction': [...]}``.

    Responds with HTTP 400 and an ``'error'`` message when the body is not a
    JSON object or when preprocessing rejects it (missing column, label the
    encoder has never seen).
    """
    # silent=True yields None instead of raising on a missing or malformed body
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    try:
        preprocessed_data = preprocess_input(data)
    except (KeyError, ValueError) as exc:
        # KeyError: a required column is absent; ValueError: unseen category
        return jsonify({'error': f'Invalid input: {exc}'}), 400

    prediction = model.predict(preprocessed_data)
    return jsonify({'prediction': prediction.tolist()})

if __name__ == '__main__':
    # Debug mode stays off: it enables the interactive Werkzeug debugger
    # (arbitrary code execution for anyone who can reach the server) and its
    # auto-reloader restarts the process, which does not work inside a
    # notebook kernel.
    app.run(debug=False)
