In [2]:
%pip install wandb

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.ensemble import RandomForestRegressor  # For the Random Forest regression model
import shap
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
import os
import wandb
import re

pd.set_option('display.expand_frame_repr', False)




In [9]:
def save_dataframe(df,file_name):
  from google.colab import drive
  drive.mount('/content/drive')
  file_path = "/content/drive/My Drive/"+file_name+".csv"
  df.to_csv(file_path, index=False)
  print(f"File saved successfully at: {file_path}")

def load_dataframe_from_drive(file_name='Train'):
    """
    Function to load a DataFrame from a Google Drive file path.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    # Load the file from Google Drive
    file_path = "/content/drive/My Drive/"+file_name+".csv"  # Replace with the saved file path
    df = pd.read_csv(file_path)
    return df

def save_model(model,model_name):
  from google.colab import drive
  drive.mount('/content/drive')

  import joblib

  # Assuming 'rf_model' is your trained RandomForestRegressor
  model_path = '/content/drive/My Drive/'+model_name+'.pkl'
  joblib.dump(model, model_path)

  print(f"Model saved at {model_path}")

def load_model(model_name):
  import joblib

  model = joblib.load('/content/drive/My Drive/'+model_name+'.pkl')
  print("Model loaded successfully!")
  return model

def RMSLE(y_test, y_pred):
    '''
    RSMLE approximates the percent change
    '''
    return np.sqrt(np.mean((np.log(y_pred) - np.log(y_test))**2))

def RMSE(y_, y_pred_):
    '''
    RSME
    '''
    return ((y_ - y_pred_) ** 2).mean() ** 0.5

def train_model(df,column_to_predict,test_size_value=0.3,random_state_value=42):
    '''
    Function to train a Random Forest regression model on a given DataFrame.
    '''
    print("train_model START")

    !wandb login --relogin

    # Initialize a new W&B run
    wandb.init(
        project="Predict-heavy-machinery-price"
    )

    X = df.drop(columns=[column_to_predict])
    y = df[column_to_predict]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_value, random_state=random_state_value)
    model = RandomForestRegressor()
    model.fit(X_train, y_train)


    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_test_pred)

    wandb.log({"mean_squared_error": mse})
    wandb.log({"Train RMSE": RMSE(y_train, y_train_pred)})
    wandb.log({"Test RMSE": RMSE(y_test, y_test_pred)})

    # Optional: Log feature importances
    importances = model.feature_importances_

    # Create a zip of feature names and feature importances
    feature_zip = zip(model.feature_names_in_, model.feature_importances_)

    # Sort the zip by feature importance in descending order
    sorted_feature_zip = sorted(feature_zip, key=lambda x: x[1], reverse=True)

    # Print the sorted feature names, importances, and index
    for idx, (feat, importance) in enumerate(sorted_feature_zip, 1):
        wandb.log({f"feature_{idx}_{feat}_importance" : importance})

    # Finish the run
    wandb.finish()

    print("RMSE Baseline accuracy:", y_test.std())
    print("Train RMSE:", RMSE(y_train, y_train_pred))
    print("Test RMSE:", RMSE(y_test, y_test_pred))

    print("train_model END")

    return model,(X_train, X_test, y_train, y_test)



def generate_X_Y_params(df,column_to_predict,test_size_value=0.3,random_state_value=42):
    X = df.drop(columns=[column_to_predict])
    y = df[column_to_predict]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_value, random_state=random_state_value)
    return X_train, X_test, y_train, y_test

def list_unique_values(df, column_name):
    """
    Function to list all unique values in a given column of a DataFrame.

    Parameters:
    - df: pandas DataFrame
    - column_name: Name of the column to find unique values

    Returns:
    - A list of unique values in the specified column.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    unique_values = df[column_name].unique()
    return unique_values


# Function to calculate YearMade based on ModelID
def calc_YearMade(df, model_id):
    valid_years = df.loc[(df['ModelID'] == model_id) & (df['YearMade'] > 1000), 'YearMade']
    return int(valid_years.mean()) if not valid_years.empty else 1000  # Default if no valid years exist

def update_YearMade(df):
  print("update_YearMade START")
  # Update YearMade where it is 1000
  df.loc[df['YearMade'] == 1000, 'YearMade'] = df.loc[df['YearMade'] == 1000, 'ModelID'].apply(lambda model_id: calc_YearMade(df, model_id))
  # Compute the mean excluding rows where YearMade == 1000
  mean_yearmade = df.loc[df['YearMade'] != 1000, 'YearMade'].mean()

  # Update all rows where YearMade == 1000 with the calculated mean
  df.loc[df['YearMade'] == 1000, 'YearMade'] = int(round(mean_yearmade))

  print("update_YearMade END")

  return df

def update_NaN_To_None_or_Unspecified(df):
    """
    Updates NaN values to 'None or Unspecified' in columns where the value
    'None or Unspecified' is already present.
    """
    print("update_NaN_To_None_or_Unspecified START")

    update_to = 'None or Unspecified'

    for col in df.columns:
        unique_values = list_unique_values(df, col)  # Get unique values for the column

        # Check if 'None or Unspecified' exists and if there are NaN values
        if update_to in unique_values and pd.isnull(unique_values).any():
            df[col] = df[col].fillna(update_to)  # Fill NaN with the specified value

    print("update_NaN_To_None_or_Unspecified END")

    return df

def update_auctioneerID(df):
  print("update_auctioneerID START")

  # Assuming 'df' is your DataFrame
  # Fill null values in the 'auctioneerID' column with 100.0
  if 'auctioneerID' in df.columns:
      df['auctioneerID'] = df['auctioneerID'].fillna(100.0)
      #print("'auctioneerID' null values filled with 100.0 successfully!")
  #else:
  #    print("Column 'auctioneerID' does not exist in the DataFrame.")
  print("update_auctioneerID END")

  return df


def convert_date_columns(df, date_column):
    """
    Convert date column into multiple numeric features like year, month, day, etc.
    Handles potential issues like missing date_column or invalid data.
    """
    print("convert_date_columns START")

    # Ensure the date_column exists in the DataFrame
    if date_column not in df.columns:
        raise KeyError(f"Column '{date_column}' does not exist in the DataFrame.")

    # Convert the column to datetime
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')  # Handle invalid date strings

    # Check if any date values are invalid after conversion
    if df[date_column].isnull().all():
        raise ValueError(f"All values in '{date_column}' could not be converted to datetime.")

    # Extract date features
    df['year'] = df[date_column].dt.year
    df['month'] = df[date_column].dt.month
    df['day'] = df[date_column].dt.day
    df['day_of_week'] = df[date_column].dt.dayofweek
    df['is_weekend'] = df['day_of_week'] >= 5  # Saturday=5, Sunday=6

    # Drop the original date column
    df = df.drop(columns=[date_column])

    print("convert_date_columns END")

    return df

def update_MachineHoursCurrentMeter(df):
  print("update_MachineHoursCurrentMeter START")

  # Update the 'MachineHoursCurrentMeter' column: replace NaN or 0 with 0
  column_name = 'MachineHoursCurrentMeter'

  # Check if the column exists
  if column_name in df.columns:
      df[column_name] = df[column_name].fillna(0)  # Replace NaN with 0
      df[column_name] = df[column_name].replace(0, 0)  # Ensure 0 stays 0

  print("update_MachineHoursCurrentMeter END")

  return df

def encode_and_impute(df):
    '''
    Function to convert categorical columns into category codes
    and impute missing values in numerical columns.
    '''
    print("Data Preprocessing START")

    # Handling categorical variables
    for col in df.select_dtypes(['object']):
        df[col] = df[col].fillna("Unknown").astype('category')

    for col in df.select_dtypes(['category']):
        df[col] = df[col].cat.codes  # Convert categories to numerical codes

    # Handling numerical missing values
    for col in df.select_dtypes(['number']):
        df[col] = df[col].fillna(df[col].median())  # Fill NaNs with median

    print("Data Preprocessing END")
    return df

def update_col_NaN(df,column_name,update_to):
   df[column_name]=df[column_name].fillna(update_to)
   return df

def clean_column(df, column_name):
    """
    Cleans a column in the DataFrame.
    - Replaces 'None or Unspecified' with 1.0.
    - Removes double quotes (") from values.
    - Removes the word "inch" from values.
    - Removes all spaces from values.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column.
        column_name (str): The column to clean.

    Returns:
        pd.DataFrame: Updated DataFrame with cleaned column.
    """
    if column_name not in df.columns:
        raise KeyError(f"Column '{column_name}' not found in DataFrame.")

    df[column_name] = df[column_name].fillna(0.0)

    df[column_name] = df[column_name].replace('None or Unspecified', '0.0')

    # Function to clean values
    def clean_value(value):
        str_value = str(value)
        str_value = str_value.replace('"', '')  # Remove double quotes
        str_value = str_value.replace('inch', '')  # Remove 'inch'
        str_value = str_value.replace(' ', '')  # Remove spaces
        return str_value

    df[column_name] = df[column_name].apply(clean_value)
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

    # Replace 0.0 values with the column median
    median_value = df[df[column_name] != 0][column_name].median()
    df[column_name] = df[column_name].replace(0.0, median_value)

    return df

def convert_feet_inches(df, column_name):
    """
    Converts a column containing feet and inches strings to numeric inches.
    - Replaces 'None or Unspecified' with "0' 0".
    - Converts values like "9' 6"" (feet and inches) into total inches.
    - Converts the column data type to numeric.
    - Replaces 0 values with the column median.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column.
        column_name (str): The column to process.

    Returns:
        pd.DataFrame: Updated DataFrame with cleaned numeric column.
    """
    if column_name not in df.columns:
        raise KeyError(f"Column '{column_name}' not found in DataFrame.")

    # Replace 'None or Unspecified' with "0' 0" before processing
    df[column_name] = df[column_name].replace('None or Unspecified', "0' 0")

    # Function to convert feet and inches format to numeric inches
    def convert_to_inches(value):
        str_value = str(value).strip()

        # Match feet and inches pattern
        match = re.match(r"(\d+)'\s*(\d+)?\"?", str_value)
        if match:
            feet = int(match.group(1))
            inches = int(match.group(2)) if match.group(2) else 0
            return feet * 12 + inches

        return 0  # If format is invalid, default to 0

    df[column_name] = df[column_name].apply(convert_to_inches)

    # Convert column to numeric
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

    # Replace 0 values with the column median
    median_value = df[df[column_name] != 0][column_name].median()
    df[column_name] = df[column_name].replace(0, median_value)

    return df

def process_fidesc_secondary_series(df):
    """
    Iterates over rows and updates fiSecondaryDesc and fiModelSeries based on fiModelDesc and fiBaseModel.

    Parameters:
    df (DataFrame): The input DataFrame.

    Returns:
    None: The function modifies the DataFrame in place.
    """

    for index in df.index:
        fi_model_desc = df.at[index, 'fiModelDesc']
        fi_base_model = df.at[index, 'fiBaseModel']
        fi_secondary_desc = df.at[index, 'fiSecondaryDesc']
        fi_model_series = df.at[index, 'fiModelSeries']

        # Create fidesc by removing fiBaseModel from fiModelDesc
        fidesc = fi_model_desc.replace(fi_base_model, '', 1).strip()

        # Update fiSecondaryDesc if fidesc starts with a capital letter and is NaN
        if fidesc and fidesc[0].isupper() and pd.isna(fi_secondary_desc):
            df.at[index, 'fiSecondaryDesc'] = fidesc[0]  # Assign first letter
            fidesc = fidesc[1:].strip()  # Remove first letter from fidesc

        # Update fiModelSeries if it's NaN and fidesc is not empty
        if pd.isna(fi_model_series) and fidesc:
            df.at[index, 'fiModelSeries'] = fidesc

    #df['fiModelSeries'] = df['fiModelSeries'].fillna('N/A')
    #df['fiSecondaryDesc'] = df['fiSecondaryDesc'].fillna('N/A')

    return df



def prepare_dataframe(df_pre:pd.DataFrame):
  '''04/02/2025 Start'''
  df_pre=update_YearMade(df_pre)
  print()
  df_pre=update_NaN_To_None_or_Unspecified(df_pre)
  print()

  df_pre=update_MachineHoursCurrentMeter(df_pre)
  print()

  df_pre=update_auctioneerID(df_pre)
  print()

  #df_pre=update_Enclosure(df_pre)
  #print()

  df_pre=convert_date_columns(df_pre,'saledate')
  print()

  #df_pre=encode_all_categories(df_pre)
  '''04/02/2025 End'''

  '''Start changes 06-02-2025'''
  df_pre = process_fidesc_secondary_series(df_pre)

  #df_pre=update_col_NaN(df_pre,'UsageBand','None or Unspecified')
  #df_pre=convert_feet_inches(df_pre,'Stick_Length')
  #df_pre=clean_column(df_pre,'Tire_Size')
  df_pre=clean_column(df_pre,'Undercarriage_Pad_Width')
  '''End changes 06-02-2025'''

  '''Start changes 06-02-2025 0015'''

  df_pre=update_col_NaN(df_pre,'ProductSize','None or Unspecified')
  df_pre=update_col_NaN(df_pre,'Drive_System','No')
  df_pre=update_col_NaN(df_pre,'Stick','None or Unspecified')
  df_pre=update_col_NaN(df_pre,'Engine_Horsepower','No')
  df_pre=update_col_NaN(df_pre,'Track_Type','None or Unspecified')
  df_pre=update_col_NaN(df_pre,'Grouser_Type','None or Unspecified')
  df_pre=update_col_NaN(df_pre,'Differential_Type','None or Unspecified')
  df_pre=update_col_NaN(df_pre,'Steering_Controls','No')
  '''End changes 06-02-2025 0015'''

  '''Start changes 05-02-2025 1530'''

  df_pre=encode_and_impute(df_pre)
  '''End changes 05-02-2025 1530'''

  return df_pre

def run_test(model):
  file_path = "/content/drive/My Drive/Valid.csv"  # Replace with the saved file path
  df_valid = pd.read_csv(file_path)
  df_valid['SalePrice']=None
  df_valid=prepare_dataframe(df_valid)
  X = df_valid.drop(columns=['SalePrice'])
  y = df_valid['SalePrice']
  y = model.predict(X)

  df_valid.loc[X.index, 'SalePrice'] = y

  file_path = "/content/drive/My Drive/output.csv"  # Save to Google Drive root
  df_valid[['SalesID', 'SalePrice']].to_csv(file_path, index=False)

  print(f"File saved successfully at: {file_path}")

def run_train():
  pd.set_option('display.expand_frame_repr', False)

  df=load_dataframe_from_drive()
  #df = df.sample(n=10000, random_state=42)
  df=prepare_dataframe(df)
  model,xs=train_model(df=df,column_to_predict='SalePrice')
  #save_dataframe(df,'04022025_df')
  #save_model(model,'04022025_model')
  #save_dataframe(df,'05022025_df')
  #save_model(model,'05022025_model')
  #save_dataframe(df,'06022025_df')
  #save_model(model,'06022025_model')
  save_dataframe(df,'06022025_1440_df')
  save_model(model,'06022025_1440_model')

  '''
  df=load_dataframe_from_drive('04022025_df')
  model=load_model('04022025_model')
  xs=generate_X_Y_params(df,'SalePrice')
  '''
  return model,xs


**RUN CELL**

In [10]:
print('Starting.....')
model,xs=run_train()
print('Testing...')
run_test(model)


Starting.....
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file_path)


update_YearMade START
update_YearMade END

update_NaN_To_None_or_Unspecified START
update_NaN_To_None_or_Unspecified END

update_MachineHoursCurrentMeter START
update_MachineHoursCurrentMeter END

update_auctioneerID START
update_auctioneerID END

convert_date_columns START
convert_date_columns END

Data Preprocessing START
Data Preprocessing END
train_model START
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


0,1
Test RMSE,▁
Train RMSE,▁
feature_10_fiModelDesc_importance,▁
feature_11_fiSecondaryDesc_importance,▁
feature_12_fiModelSeries_importance,▁
feature_13_month_importance,▁
feature_14_Enclosure_importance,▁
feature_15_day_importance,▁
feature_16_state_importance,▁
feature_17_day_of_week_importance,▁

0,1
Test RMSE,7108.66644
Train RMSE,2658.78618
feature_10_fiModelDesc_importance,0.01742
feature_11_fiSecondaryDesc_importance,0.01314
feature_12_fiModelSeries_importance,0.01284
feature_13_month_importance,0.01151
feature_14_Enclosure_importance,0.01114
feature_15_day_importance,0.01016
feature_16_state_importance,0.0095
feature_17_day_of_week_importance,0.00483


RMSE Baseline accuracy: 22932.4005340408
Train RMSE: 2658.786175522111
Test RMSE: 7108.666438908499
train_model END
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File saved successfully at: /content/drive/My Drive/06022025_1440_df.csv
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved at /content/drive/My Drive/06022025_1440_model.pkl
Testing...
update_YearMade START
update_YearMade END

update_NaN_To_None_or_Unspecified START
update_NaN_To_None_or_Unspecified END

update_MachineHoursCurrentMeter START
update_MachineHoursCurrentMeter END

update_auctioneerID START
update_auctioneerID END

convert_date_columns START
convert_date_columns END

Data Preprocessing START
Data Preprocessing END
File saved successfully at: /content/drive/My Drive/output.csv


  df_valid.loc[X.index, 'SalePrice'] = y
