In [1]:
import pandas as pd
import numpy as np

# Load the dataset from the Excel workbook and print its (rows, columns) shape.
workbook_path = 'balance_frame.xlsx'
balance_df = pd.read_excel(workbook_path)
print(balance_df.shape)


(8369, 36)


In [2]:
# Columns of balance_df used as model inputs (the target column is selected separately).
features = [
    'Gear',
    'Year of Manufacture',
    'modelYear',
    'km',
    'transmission',
    'Mileage',
    'City',
    'bt',
    'ownerNo',
    'Insurance Validity',
    'Fuel Type',
]

In [3]:
# Outlier removal with an IQR-style fence.
# NOTE: the fence is built from the 5th and 95th percentiles (not the classic
# 25th/75th quartiles), so only rows with very extreme values are discarded.

# Quantiles and the </> comparisons are only meaningful for numeric columns;
# restricting to them keeps the cell working if the frame holds text columns.
numeric_df = balance_df.select_dtypes(include='number')

Q1 = numeric_df.quantile(0.05)
Q3 = numeric_df.quantile(0.95)
IQR = Q3 - Q1

# A row is an outlier if ANY numeric column falls outside its fence.
is_outlier = ((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)

# .copy() makes the filtered frame independent of the original, so the column
# assignments performed in later cells do not raise SettingWithCopyWarning.
balance_df = balance_df[~is_outlier].copy()
balance_df.shape

(7697, 36)

In [4]:
def clean_gearbox(gear_box):
    """Normalise a gear count: values 0, 1, 2 and 3 are replaced by 4.

    Any other value (including non-numeric ones) is returned unchanged.
    """
    return 4 if gear_box in [0, 1, 2, 3] else gear_box


In [5]:
# Run the gearbox clean-up on every row, then show how the gear counts are distributed.
cleaned_gear = balance_df['Gear'].map(clean_gearbox)
balance_df['Gear'] = cleaned_gear
balance_df['Gear'].value_counts()

Gear
4    6945
5     379
6     263
7     110
Name: count, dtype: int64

In [6]:
# Scale the feature columns with a MinMaxScaler fitted on the TRAINING rows only.
# Fitting on the whole frame would let test-set minima/maxima leak into the
# preprocessing and make the evaluation optimistic.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# train_test_split picks rows purely from the number of samples and the seed,
# so using the same test_size/random_state as the real split of X and y
# yields exactly the same training rows.
train_positions, test_positions = train_test_split(
    np.arange(len(balance_df)), test_size=0.2, random_state=42
)

scaler = MinMaxScaler()
scaler.fit(balance_df[features].iloc[train_positions])

# Transform every row with the train-fitted scaler; held-out rows may therefore
# fall slightly outside [0, 1], which is expected.
balance_df[features] = scaler.transform(balance_df[features])

In [7]:
from sklearn.model_selection import train_test_split
# Feature matrix and target vector, then a seeded 80/20 hold-out split.
X, y = balance_df[features], balance_df['price_in_lakhs']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Show the shapes of the four split pieces so feature/target row counts can be compared.
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((6157, 11), (6157,), (1540, 11), (1540,))

In [None]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# Baseline: a seeded random forest with default hyperparameters, scored on the hold-out set.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Last expression of the cell: R2 on the test set is rendered as the cell output.
r2_score(y_test, y_pred)

In [None]:
# Number of feature columns in the training frame.
len(X_train.columns)

In [None]:
# Pair each feature name with the fitted model's importance and rank from most to least important.
importances = model.feature_importances_
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
feature_importances

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
# NOTE: this is 10,080 combinations x 3 folds = 30,240 fits, and the
# 'absolute_error' criterion is considerably slower than 'squared_error';
# expect a long run time.
parameters = {
    'n_estimators': [50, 100, 150, 200, 250, 300, 400],  # number of trees
    'max_depth': [5, 10, 15, 20, None],  # None = grow until leaves are pure
    'min_samples_split': [2, 5, 10, 20],  # stricter splitting rules
    'min_samples_leaf': [1, 2, 4, 8],  # minimum leaf size
    'bootstrap': [True, False],  # with / without bootstrap resampling
    'criterion': ['squared_error', 'absolute_error', 'poisson'],
    # 'auto' is no longer accepted by scikit-learn; for regressors it meant
    # "use all features", which is spelled 1.0 today.
    'max_features': [1.0, 'sqrt', 'log2'],
    # 'oob_score' is deliberately NOT searched: it only toggles an extra
    # diagnostic (the fitted trees are identical either way) and
    # oob_score=True combined with bootstrap=False raises a ValueError.
}

# `model` is only used as a template: GridSearchCV clones it for every fit, so
# its random_state=42 carries over to each candidate.
grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)


In [None]:
# Display the hyperparameter combination selected by the grid search.
best_params

In [None]:
# Train the final model with the best hyperparameters found by the grid search.
# The search evaluated clones of a forest seeded with random_state=42; the same
# seed is reused here so the final model matches the configuration that was
# actually scored and the result is reproducible. A random_state present in
# best_params (if one is ever added to the grid) takes precedence.
model = RandomForestRegressor(**{'random_state': 42, **best_params})
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Last expression of the cell: R2 on the test set is rendered as the cell output.
r2_score(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Hold-out metrics for the current predictions; kept in rmse / mae / r2 for later logging.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

for metric_label, metric_value in (('RMSE', rmse), ('MAE', mae), ('R2 Score', r2)):
    print(f'{metric_label} : {metric_value}')

## MLflow


In [28]:
# import pickle
# pickle.dump(model,open('random_forest_model.pkl', 'wb'))

In [30]:
import mlflow

In [None]:
# Display the tuned hyperparameters again before they are logged to MLflow.
best_params

In [None]:
# Point the client at the tracking server BEFORE selecting the experiment:
# set_experiment looks up (or creates) the experiment in whichever store is
# active at call time, so calling it first would resolve the experiment against
# the default store instead of the server.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("First experiment")

# One run holding the tuned hyperparameters, the hold-out metrics and the fitted model.
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.log_metric('rmse',rmse)
    mlflow.log_metric('mae',mae)
    mlflow.log_metric('r2',r2)

    mlflow.sklearn.log_model(model,"Random Forest Regressor")

# Multiple Models

In [9]:

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
# Candidate regressors to compare. Each entry is a 5-tuple:
#   (display name, hyperparameters applied via set_params, estimator instance,
#    (X_train, y_train), (X_test, y_test))
# The data tuples capture the split objects that exist when this cell runs.
# Stochastic estimators are seeded so repeated runs give the same metrics; the
# seed lives in the constructor so the logged hyperparameter dicts are unchanged.
models = [
    (
        "Gradient Boosting Regressor",
        {"n_estimators": 150, "learning_rate": 0.1},  # hyperparameters
        GradientBoostingRegressor(random_state=42),  # estimator
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest",
        {"n_estimators": 150, "max_depth": 5},  # hyperparameters
        RandomForestRegressor(random_state=42),  # estimator
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Support Vector Regression",
        {"kernel": "rbf", "C": 1.0, "epsilon": 0.1},  # hyperparameters
        SVR(),  # estimator (deterministic, no seed needed)
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGB Regressor",
        {"n_estimators": 100, "learning_rate": 0.1},  # hyperparameters
        XGBRegressor(random_state=42),  # estimator
        (X_train, y_train),
        (X_test, y_test)
    )
]


In [38]:
# Cut X and y down to the shorter of the two lengths (a no-op when they already match).
min_samples = min(len(X), len(y))
X, y = X[:min_samples], y[:min_samples]


In [None]:
# Guard: features and target must describe the same rows before splitting.
assert len(X) == len(y), "Features (X) and target (y) must have the same number of samples!"

# Seeded 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes for a quick sanity check.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")


In [None]:
# Fit every candidate model on its own train set and collect hold-out metrics.
reports = []

for model_name, params, model, train_set, test_set in models:
    # Each data entry is an (X, y) pair.
    X_train, y_train = train_set
    X_test, y_test = test_set

    # Configure the estimator, fit it, and predict on the hold-out features.
    model.set_params(**params)
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Regression metrics on the hold-out set.
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # One (name, rmse, mae, r2) record per model, in the order of `models`.
    reports.append((model_name, rmse, mae, r2))

In [10]:
# Count missing values per feature column in the training frame.
X_train.isna().sum()

Gear                   0
Year of Manufacture    5
modelYear              0
km                     0
transmission           0
Mileage                0
City                   0
bt                     0
ownerNo                0
Insurance Validity     0
Fuel Type              0
dtype: int64

In [11]:
# Drop training rows with any missing feature AND the matching target rows.
# Dropping from X_train alone would leave y_train longer than X_train and
# pair features with the wrong targets.
complete_rows = X_train.notna().all(axis=1).to_numpy()  # positional mask, independent of index labels
X_train = X_train[complete_rows]
y_train = y_train[complete_rows]

In [12]:
# Re-count missing values per feature column to confirm none remain.
X_train.isna().sum()

Gear                   0
Year of Manufacture    0
modelYear              0
km                     0
transmission           0
Mileage                0
City                   0
bt                     0
ownerNo                0
Insurance Validity     0
Fuel Type              0
dtype: int64

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# Mean imputation for missing feature values; fitted on training features only.
imputer = SimpleImputer(strategy='mean')

# One (model_name, rmse, mae, r2) record per model, in the order of `models`.
reports = []

for model_name, params, model, train_set, test_set in models:
    # Each data entry is an (X, y) pair.
    X_train, y_train = train_set
    X_test, y_test = test_set

    # Features and targets must describe the same rows. Truncating both to the
    # shorter length would hide a mismatch and silently pair the wrong rows, so
    # fail loudly instead.
    if len(X_train) != len(y_train) or len(X_test) != len(y_test):
        raise ValueError(
            f"{model_name}: feature/target length mismatch "
            f"(train {len(X_train)} vs {len(y_train)}, test {len(X_test)} vs {len(y_test)})"
        )

    # Learn column means from the training features, then reuse them for the
    # test features so no test-set statistics enter the preprocessing.
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Apply hyperparameters and train the model
    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store the results
    reports.append((model_name, rmse, mae, r2))

# Print the results
for report in reports:
    print(f"Model: {report[0]}, RMSE: {report[1]:.4f}, MAE: {report[2]:.4f}, R2: {report[3]:.4f}")


Model: Gradient Boosting Regressor, RMSE: 4.2559, MAE: 2.0546, R2: 0.7847
Model: Random Forest, RMSE: 4.9590, MAE: 2.5546, R2: 0.7077
Model: Support Vector Regression, RMSE: 4.7907, MAE: 2.1758, R2: 0.7272
Model: XGB Regressor, RMSE: 3.8067, MAE: 1.7298, R2: 0.8278


In [19]:
import mlflow.sklearn
import mlflow.xgboost
from mlflow.models.signature import infer_signature
import numpy as np

# Log every trained model, its hyperparameters and its hold-out metrics as one MLflow run each.
# NOTE(review): the recorded output of this cell ends with
# 'super' object has no attribute '__sklearn_tags__' -- this looks like an
# xgboost / scikit-learn version mismatch rather than a problem in this loop; confirm.
for i, (model_name, params, model, _train_set, _test_set) in enumerate(models):
    # reports[i] is the (model_name, rmse, mae, r2) record at the same position as the model.
    _reported_name, rmse, mae, r2 = reports[i]

    # A single training row as example input, plus the input/output schema
    # inferred from the training features and a handful of predictions.
    input_example = np.array(X_train[:1])
    signature = infer_signature(X_train, model.predict(X_train[:5]))

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)

        for metric_name, metric_value in (('rmse', rmse), ('mae', mae), ('r2', r2)):
            mlflow.log_metric(metric_name, metric_value)

        # Models whose display name contains 'XGB' use the xgboost flavor; all others use sklearn.
        log_model = mlflow.xgboost.log_model if 'XGB' in model_name else mlflow.sklearn.log_model
        log_model(model, "model", signature=signature, input_example=input_example)


  "inputs": [
    [
      0.0,
      0.4761904761904816,
      0.607142857142847,
      0.3402730769230769,
      1.0,
      0.31884057971014496,
      0.6000000000000001,
      0.7777777777777777,
      0.6000000000000001,
      0.5714285714285714,
      1.0
    ]
  ]
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'super' object has no attribute '__sklearn_tags__'
