In [14]:
# Re-import necessary libraries due to execution reset
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
import re
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Define a function to categorize U.S. states based on agricultural land percentage
def categorize_us_states_by_agriculture(df):
    """
    Categorizes U.S. states into High, Medium, or Low Agriculture based on external data.
    It then creates one-hot encoded dummy variables for each category.

    Parameters:
    df (pd.DataFrame): DataFrame containing a 'state' column.

    Returns:
    pd.DataFrame: DataFrame with new agriculture-based dummy variables.
    """

    # Define the mapping of states to Agriculture categories based on external USDA data
    agriculture_mapping = {
        # High Agriculture States (Above 53% agricultural land)
        "Iowa": "High Ag", "Nebraska": "High Ag", "South Dakota": "High Ag",
        "North Dakota": "High Ag", "Kansas": "High Ag", "Montana": "High Ag",
        "Minnesota": "High Ag", "Missouri": "High Ag", "Idaho": "High Ag",
        "Oklahoma": "High Ag", "Illinois": "High Ag", "Indiana": "High Ag",
        "Wisconsin": "High Ag", "Arkansas": "High Ag", "Kentucky": "High Ag",

        # Medium Agriculture States (Between 40% - 53% agricultural land)
        "Texas": "Medium Ag", "Mississippi": "Medium Ag", "Alabama": "Medium Ag",
        "Tennessee": "Medium Ag", "Georgia": "Medium Ag", "North Carolina": "Medium Ag",
        "South Carolina": "Medium Ag", "Louisiana": "Medium Ag", "Colorado": "Medium Ag",
        "Ohio": "Medium Ag", "Michigan": "Medium Ag", "Virginia": "Medium Ag",

        # Low Agriculture States (Below 40% agricultural land)
        "California": "Low Ag", "New York": "Low Ag", "New Jersey": "Low Ag",
        "Nevada": "Low Ag", "Florida": "Low Ag", "Washington": "Low Ag",
        "Oregon": "Low Ag", "Arizona": "Low Ag", "Pennsylvania": "Low Ag",
        "Maine": "Low Ag", "Massachusetts": "Low Ag", "New Hampshire": "Low Ag",
        "Vermont": "Low Ag", "Rhode Island": "Low Ag", "Connecticut": "Low Ag",
        "Delaware": "Low Ag", "Maryland": "Low Ag", "West Virginia": "Low Ag",
        "Hawaii": "Low Ag", "Alaska": "Low Ag", "Washington DC": "Low Ag",
        "Puerto Rico": "Low Ag"
    }

    # Map states in the dataset to Agriculture categories
    df["state_agriculture_category"] = df["state"].map(agriculture_mapping).fillna("Other")

    # One-hot encode the agriculture categories
    df = pd.get_dummies(df, columns=["state_agriculture_category"], prefix="agriculture")

    return df






# Define a function to create machine age categories and one-hot encode them
def create_machine_age_category(df):
    """
    Adds machine age categories and one-hot encodes them in the given DataFrame.
    
    A machine's age is calculated as (sale_year - YearMade), and categorized into:
    - "New (0-5)"
    - "Lightly Used (6-10)"
    - "Moderately Used (11-20)"
    - "Old (21-50)"
    
    Parameters:
    df (pd.DataFrame): DataFrame containing 'YearMade' and 'saledate'.

    Returns:
    pd.DataFrame: DataFrame with new categorical machine age columns.
    """
    #df = df.copy()  # Avoid modifying the original DataFrame

    # Convert 'saledate' to datetime if not already
   # if not pd.api.types.is_datetime64_any_dtype(df["saledate"]):
    #    df["saledate"] = pd.to_datetime(df["saledate"], errors="coerce")

    # Remove invalid YearMade values
    #df = df.dropna(subset=["YearMade"])
    #df = df[df["YearMade"] != 1000]

    # Extract sale year and compute machine age
    #df["sale_year"] = df["saledate"].dt.year
    df["machine_age"] = df["sale_year"] - df["YearMade"]

    # Remove unrealistic ages (negative or older than 50 years)
    #df = df[(df["machine_age"] >= 0) & (df["machine_age"] <= 150)]

    # Define bins and labels for machine age categories
    #bins = [0, 5, 10, 20, 50,100]
    #labels = ["New (0-5)", "Lightly Used (6-10)", "Moderately Used (11-20)", "Old (21-50)", "vintage (51-150)"]

    # Create categorical column
    #df["machine_age_category"] = pd.cut(df["machine_age"], bins=bins, labels=labels, include_lowest=True)

    # One-hot encode machine age categories
    #df = pd.get_dummies(df, columns=["machine_age_category"], prefix="age")
    
    # Create 'is_new_machine' dummy variable (1 for new, 0 for used)
    df["is_new_machine"] = (df["machine_age"] <= 1).astype(int)

    #df.drop(columns=["machine_age"],inplace=True)

    return df




def winter_month(df):
        # Define winter months
    winter_months = [12, 1, 2]
    
    # Assign 1 to high_price_season for winter months, 0 otherwise
    df["high_price_season"] = 0  # Default to 0
    df.loc[df["sale_month"].isin(winter_months), "high_price_season"] = 1
    return df

# Define final categorization for Steering_Controls
def categorize_steering_controls_final(value):
    if value in ["Four Wheel Standard", "Conventional"]:
        return "Mid Price Steering"
    elif value in ["Command Control"]:
        return "High Price Steering"
    else:
        return "Unknown"



# Define price-based categories for Track_Type
def categorize_track_type(value):
    if value in ["Rubber"]:
        return "Low Price Track"
    elif value in ["Steel"]:
        return "High Price Track"
    else:
        return "Unknown"


# Define price-based categories for Transmission
def categorize_transmission(value):
    if value in ["Powershuttle", "Standard", "Direct Drive"]:
        return "Low Price Transmission"
    elif value in ["Autoshift", "Hydrostatic"]:
        return "Mid Price Transmission"
    elif value in ["Powershift", "None or Unspecified", "AutoShift"]:
        return "High Price Transmission"
    else:
        return "Unknown"


# Define price-based categories for Steering_Controls
def categorize_steering_controls(value):
    if value in ["Four Wheel Standard"]:
        return "Low Price Steering"
    elif value in ["Conventional"]:
        return "Mid Price Steering"
    elif value in ["Command Control"]:
        return "High Price Steering"
    else:
        return "Unknown"



# Define price-based categories for ProductGroup
def categorize_product_group(price):
    if price < 20000:
        return "Low Price Group"
    elif 20000 <= price < 40000:
        return "Mid Price Group"
    else:
        return "High Price Group"


def create_model_category_mapping(df,model_avg_price):
    """
    Creates a mapping of ModelID to Model_Category based on fiModelDesc and SalePrice in training data.
    
    Parameters:
    training_df (pd.DataFrame): Training dataset containing fiModelDesc and SalePrice.
    
    Returns:
    dict: A dictionary mapping ModelID to Model_Category.
    """
    # Compute average SalePrice per fiModelDesc in training data
    
    
    # Define price categories
    def categorize_price(price):
        if price < 20000:
            return "Low Price Models"
        elif 20000 <= price < 60000:
            return "Mid Price Models"
        else:
            return "High Price Models"
    
    # Map fiModelDesc to categories
    model_category_mapping = model_avg_price.apply(categorize_price).to_dict()
    
    # Create ModelID to category mapping using fiModelDesc
    df["Model_Category"] = df["fiModelDesc"].map(model_category_mapping)
    modelid_to_category = df.set_index("ModelID")["Model_Category"].to_dict()
    
    return modelid_to_category, model_category_mapping

def categorize_model_id(df, modelid_to_category, model_category_mapping):
    """
    Categorizes ModelID based on precomputed price categories.
    If ModelID is not found, falls back to fiModelDesc categorization.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing ModelID and fiModelDesc columns.
    modelid_to_category (dict): Mapping of ModelID to Model_Category.
    model_category_mapping (dict): Mapping of fiModelDesc to Model_Category for fallback.
    
    Returns:
    pd.Series: A Series with categorized model price labels.
    """
    df["Predicted_Model_Category"] = df["ModelID"].map(modelid_to_category)
    
    # Handle missing ModelID by checking fiModelDesc mapping
    missing_mask = df["Predicted_Model_Category"].isna()
    df.loc[missing_mask, "Predicted_Model_Category"] = df.loc[missing_mask, "fiModelDesc"].map(model_category_mapping)
    
    # Handle any remaining missing values by assigning 'Unknown'
    df["Predicted_Model_Category"].fillna("Unknown", inplace=True)
    
    return df["Predicted_Model_Category"]



def extract_horsepower(df):
    """
    Extracts and imputes missing Horsepower values from fiProductClassDesc.
    Optimized for performance using vectorized operations.
    """
    def extract_numeric_range(value):
        """Extracts the average horsepower value from a range like '100 to 120 Horsepower'."""
        if pd.isna(value) or not isinstance(value, str):
            return np.nan
        numbers = re.findall(r'[\d\.]+', value)
        if len(numbers) == 2:
            return (float(numbers[0]) + float(numbers[1])) / 2
        elif len(numbers) == 1:
            return float(numbers[0])
        return np.nan
    
    # Identify rows with Horsepower in fiProductClassDesc
    mask_horsepower = df['fiProductClassDesc'].str.contains("Horsepower", na=False)
    
    # Extract horsepower values
    df.loc[mask_horsepower, 'Horsepower_Unit_Type'] = 'Horsepower'
    df.loc[mask_horsepower, 'Extracted_Horsepower'] = df.loc[mask_horsepower, 'fiProductClassDesc'].apply(extract_numeric_range)
    
    # Handle 'Variable' and 'No' values in Engine_Horsepower
    df.loc[df['Engine_Horsepower'].isin(['Variable', 'No']), 'Engine_Horsepower'] = np.nan
    
    # Impute missing Engine_Horsepower values where Extracted_Horsepower is available
    df['Engine_Horsepower_Imputed'] = df['Engine_Horsepower'].combine_first(df['Extracted_Horsepower'])
    
    # Fill remaining NaN values with median horsepower
    #df['Engine_Horsepower_Imputed'].fillna(df['Engine_Horsepower_Imputed'].median(), inplace=True)
    df.drop(columns='Engine_Horsepower',inplace=True)
    
    return df

def preprocess_product_size(df):
    """
    Function to preprocess and impute missing ProductSize values based on fiProductClassDesc.
    
    Steps:
    1. Extract Product Type and Metric Tons / Horsepower from fiProductClassDesc.
    2. Identify Unit Type (Metric Tons, Horsepower, or Lb Operating Capacity).
    3. Convert Metric Tons / Horsepower to numerical values.
    4. Use Metric Tons to impute missing ProductSize values.
    """
    
    # Step 1: Extract Product Type and Size Descriptor
    def split_product_desc(desc):
        if pd.isna(desc):
            return pd.NA, pd.NA
        match = re.search(r'(.+?)-\s*([\d\.]+.*)', desc)
        if match:
            return match.group(1).strip(), match.group(2).strip()
        return desc.strip(), pd.NA  # If no match, return full description as product
    
    df[['Product_Type', 'Metric_Tons_HP']] = df['fiProductClassDesc'].apply(lambda x: pd.Series(split_product_desc(x)))
    
    # Step 2: Identify Unit Type
    def extract_unit_type(desc):
        if pd.isna(desc):
            return pd.NA
        if "Metric Tons" in desc:
            return "Metric Tons"
        elif "Horsepower" in desc:
            return "Horsepower"
        elif "Lb Operating Capacity" in desc:
            return "Lb Operating Capacity"
        return pd.NA
    
    df["Unit_Type"] = df["Metric_Tons_HP"].apply(extract_unit_type)
    
    # Step 3: Convert Metric Tons to numerical values
    def extract_numeric_range(value):
        """Extracts the average value from a range like '12.0 to 14.0 Metric Tons'."""
        if pd.isna(value) or not isinstance(value, str):
            return np.nan
        numbers = re.findall(r'[\d\.]+', value)
        if len(numbers) == 2:  # If range exists, take the average
            return (float(numbers[0]) + float(numbers[1])) / 2
        elif len(numbers) == 1:  # If only one number exists, use it
            return float(numbers[0])
        return np.nan
    
    df['Metric_Tons_Value'] = df['Metric_Tons_HP'].apply(lambda x: extract_numeric_range(x) if isinstance(x, str) else np.nan)
    
    # Step 4: Impute missing ProductSize values using Metric Tons
    def impute_product_size(row):
        if pd.isna(row['ProductSize']) and not pd.isna(row['Metric_Tons_Value']) and (row['Metric_Tons_Value']=='Metric Tons'):
            if row['Metric_Tons_Value']  <= 5:
                return 'Mini'
            elif 5 < row['Metric_Tons_Value'] <= 75:
                return 'Compact'
            elif 20 < row['Metric_Tons_Value'] <= 50:
                return 'Large / Medium'
            elif 75 < row['Metric_Tons_Value'] <= 200:
                return 'Medium'
            elif row['Metric_Tons_Value'] > 200:
                return 'Large'
        return row['ProductSize']
    
    df['ProductSize_Imputed'] = df.apply(impute_product_size, axis=1)
    df.drop(columns=['ProductSize','Metric_Tons_Value','Metric_Tons_HP','Unit_Type'],inplace=True)

    
    
    return df
def Enclosure_fun(df):
    list1=['EROPS','OROPS','EROPS AC']
    list2=['NO ROPS','None or Unspecified']
    df['Enclosure_cat']=0
    df.loc[df['Enclosure'].isin(list1), 'Enclosure_cat'] = 1
    df.loc[df['Enclosure'].isin(list2), 'Enclosure_cat'] = 0
    df.loc[df['Enclosure']=='EROPS w AC','Enclosure_cat']=2
    df.drop(columns='Enclosure',inplace=True)
    return df
# Define categories based on observed price trends
def categorize_hydraulics(value):
    if value in ["Missing", "Auxiliary", "Standard"]:
        return "Basic"
    elif value in ["2 Valve", "None or Unspecified", "3 Valve"]:
        return "Mid-Level"
    else:
        return "Advanced"



# Reload the dataset
    

#df=df[['SalesID','saledate',"Enclosure","YearMade","ModelID","MachineID","Ripper","ProductGroup","ProductSize","ProductGroupDesc","Tire_Size","MachineHoursCurrentMeter","Blade_Type","ProductGroupDesc","Travel_Controls","auctioneerID","datasource","Drive_System","Pushblock","Tire_Size","Ride_Control","UsageBand","Coupler","Engine_Horsepower_Imputed"]].copy()
def saledate(df):
    # Convert 'saledate' to datetime and extract year, month, and day
    df['saledate'] = pd.to_datetime(df['saledate'], errors='coerce')
    df['sale_year'] = df['saledate'].dt.year
    df['sale_month'] = df['saledate'].dt.month
    df['sale_day'] = df['saledate'].dt.day
    return df



def preprocess_data(df,modelid_to_category, model_category_mapping,product_group_avg_price):
    # Apply categorization
    df["Hydraulics_Category"] = df["Hydraulics"].apply(categorize_hydraulics)
    # Apply categorization
    #df["Steering_Controls_Category"] = df["Steering_Controls"].apply(categorize_steering_controls)
    # Apply categorization
    df["Transmission_Category"] = df["Transmission"].apply(categorize_transmission)
    
    # Apply categorization
    df["Steering_Controls_Category"] = df["Steering_Controls"].apply(categorize_steering_controls_final)
    # Apply categorization to test data
    # Apply categorization to test dat
    df["Predicted_Model_Category"] = categorize_model_id(df, modelid_to_category, model_category_mapping) # need to upload the modelid_to_category, model_category_mapping in the test
    df["Track_Type_Category"] = df["Track_Type"].apply(categorize_track_type)
    # Map each ProductGroup to a price category
    df["ProductGroup_Category"] = df["ProductGroup"].map(product_group_avg_price).apply(categorize_product_group) # need to upload the categorize_product_group in the test
    # Apply categorization




    return df

# Define function to update 'YearMade' and create 'YearMade_Bucket'
def update_YearMade(df):
    print("update_YearMade START")
    
    # Compute median YearMade for each ModelID
    model_medians = df.loc[df['YearMade'] > 1000].groupby('ModelID')['YearMade'].median()
    
    # Update YearMade where it is 1000 using ModelID median
    df.loc[df['YearMade'] == 1000, 'YearMade'] = df['ModelID'].map(model_medians)
    
    # Compute overall median YearMade excluding 1000 values
    overall_median = df.loc[df['YearMade'] > 1000, 'YearMade'].median()
    
    # Replace any remaining 1000 values with overall median
    df['YearMade'].fillna(overall_median, inplace=True)
        
    # Bucketize YearMade
    bins = [0, 1980, 1995, 2005, 2010, 2025]
    labels = ["Before 1980", "1980-1995", "1996-2005", "2006-2010", "2011-2025"]
    df['YearMade_Bucket'] = pd.cut(df['YearMade'], bins=bins, labels=labels, right=True)

    df["YearMade_Bucket"] = df["YearMade_Bucket"].astype(str).fillna("Unknown")
    
    # Define category order and apply Ordinal Encoding
    year_bucket_encoder = OrdinalEncoder(
        categories=[["Before 1980", "1980-1995", "1996-2005", "2006-2010", "2011-2025", "Unknown"]],
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )
    
    df["YearMade_Bucket"] = year_bucket_encoder.fit_transform(df[["YearMade_Bucket"]])
    
    print("update_YearMade END")
    return df
def pre_train(df,file_path):
    model_avg_price = df.groupby("fiModelDesc")["SalePrice"].mean()
    product_group_avg_price = df.groupby("ProductGroup")["SalePrice"].mean()

    model_avg_price.to_csv(file_path+"\model_avg_price.csv", header=True)
    product_group_avg_price.to_csv(file_path+"\product_group_avg_price.csv", header=True)

    

    return model_avg_price,product_group_avg_price

def Pre_train_test_model(df,train:str=True):
    # Filter dataset to only include the last 5 years
    recent_years = df['sale_year'].dropna().unique()
    recent_years.sort()
    #selected_years = recent_years[-5:]
    #df_filtered = df[df['sale_year'].isin(selected_years)]
    df_filtered=df.copy()
    
    # Drop irrelevant columns: 'SalesID', 'saledate'
    high_cardinality_cols = [col for col in df_filtered.select_dtypes(include=['object']).columns if df_filtered[col].nunique() > 50]
    df_filtered = df_filtered.drop(columns=['SalesID', 'saledate'] + high_cardinality_cols, errors='ignore')
    
    # Identify categorical columns
    categorical_cols = df_filtered.select_dtypes(include=['object']).columns
    
    
    
    # Identify high, moderate, and low cardinality categorical columns
    high_cardinality_cols = [col for col in categorical_cols if df_filtered[col].nunique() >= 50]
    moderate_cardinality_cols = [col for col in categorical_cols if 10 <= df_filtered[col].nunique() < 50]
    low_cardinality_cols = [col for col in categorical_cols if df_filtered[col].nunique() < 10]
    
    # Fill missing categorical values with "Unknown"
    for col in categorical_cols:
        df_filtered[col] = df_filtered[col].fillna("Unknown")
    
    # Encoding strategy
    
    # Apply One-Hot Encoding (OHE) to low-cardinality categories
    df_encoded = pd.get_dummies(df_filtered, columns=low_cardinality_cols, drop_first=True)
    
    # Apply Ordinal Encoding to moderate-cardinality categories
    if moderate_cardinality_cols:
        ord_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        df_encoded[moderate_cardinality_cols] = ord_encoder.fit_transform(df_encoded[moderate_cardinality_cols])
    
    # Apply Frequency Encoding to high-cardinality categories
    for col in high_cardinality_cols:
        freq_encoding = df_encoded[col].value_counts().to_dict()
        df_encoded[col] = df_encoded[col].map(freq_encoding)
    
    # Drop rows with missing SalePrice (target variable)
    if train==True:
        df_encoded = df_encoded.dropna(subset=['SalePrice'])
        
    

    return df_encoded
def train_run(df_encoded,median_price_map):
    #!wandb login --relogin

    # Initialize a new W&B run
    #wandb.init(
     #   project="Predict-heavy-machinery-price"
    #)


    # Split into features and target
    X = df_encoded.drop(columns=['SalePrice'])
    y = df_encoded['SalePrice']
    
    # Split data into train and test sets
    from sklearn.model_selection import train_test_split
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
   # Merge `median_price_by_year` back into X_test for conversion
    X_test = X_test.merge(median_price_map, on='sale_year', how='left')

    # Train a Random Forest model
    best_parameter = {'n_estimators': 100, 
                      'min_samples_split': 10,  
                      'min_samples_leaf': 3,   
                      'max_features': 0.5,
                      'max_depth': 100}

    rf_model = RandomForestRegressor(**best_parameter, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    
    # Predict adjusted sale prices
    y_pred_adjusted = rf_model.predict(X_test.drop(columns=['median_price_by_year']))

    # Convert adjusted predictions and actual values back to real SalePrice
    y_pred_real = y_pred_adjusted * X_test['median_price_by_year']
    y_test_real = y_test * X_test['median_price_by_year']

    # Calculate RMSE in real SalePrice terms
    mae = mean_absolute_error(y_test_real, y_pred_real)
    mse = mean_squared_error(y_test_real, y_pred_real)
    rmse = mse ** 0.5

    # Display model performance
    print(f"MAE: {mae:.2f}")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    return rf_model

def function_preperation(df,file_path:str,train:str=True): # add new function to this part
# Apply YearMade update
    df = update_YearMade(df)
    df= saledate(df)
    df=winter_month(df) # winter sales
    # Apply the function to the dataset
    df = create_machine_age_category(df)
    if train==True:
        model_avg_price,product_group_avg_price=pre_train(df,file_path)
    else:
        model_avg_price=pd.read_csv(file_path+"/model_avg_price.csv", index_col=0)
        product_group_avg_price=pd.read_csv(file_path+"/product_group_avg_price.csv", index_col=0)
        # Convert to Series explicitly (if needed)
        model_avg_price = model_avg_price.iloc[:, 0]  # Extract first column as Series
        product_group_avg_price = product_group_avg_price.iloc[:, 0]
    # Create mappings from training data only in saleprice 
    modelid_to_category, model_category_mapping = create_model_category_mapping(df,model_avg_price) # need to upload the modelid_to_category, model_category_mapping in the test
    
    
    df=preprocess_data(df,modelid_to_category, model_category_mapping,product_group_avg_price)
    df=Enclosure_fun(df)
    df=extract_horsepower(df)
    
    # Map each ProductGroup to a price category
    df=preprocess_product_size(df)
    
    # Apply the function to the dataset
    df = categorize_us_states_by_agriculture(df)
    
    df.drop(columns='state',inplace=True)

    return df
#-------------------------------------------------------------------------------------------------


In [15]:
#def main():
file_path=r"C:\Users\eitanb\Documents\DS\ML\ML_project\DATA"


df = pd.read_csv(file_path+"/Train.csv")[['SalesID', 'SalePrice',  'ModelID',
        'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'saledate', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroup', 
       'Drive_System', 'Enclosure', 
        'Transmission', 'Turbocharged',  'Engine_Horsepower', 'Hydraulics',
         'Tire_Size',
       'Track_Type',
       'Travel_Controls', 'Differential_Type', 'Steering_Controls','fiModelDesc']]

df=function_preperation(df,file_path,True)


# Compute the average price for each ProductGroup
# Apply categorization


df=Pre_train_test_model(df,True)


### 📌 1️⃣ Inflation Adjustment Before Training ###
# Compute yearly median price
df['median_price_by_year'] = df.groupby('sale_year')['SalePrice'].transform('median')

# Save median prices separately for post-processing
median_price_map = df[['sale_year', 'median_price_by_year']].drop_duplicates()

# Normalize SalePrice
df['SalePrice'] = df['SalePrice'] / df['median_price_by_year']

# Drop 'median_price_by_year' BEFORE training (avoiding data leakage)
df.drop(columns=['median_price_by_year'], inplace=True)

# Drop irrelevant features
df.drop(columns=['YearMade_Bucket', 'Hydraulics', 'Differential_Type_Locking', 
                 'Steering_Controls_No', 'Steering_Controls_Wheel'], inplace=True)

# Train the model
rf_model = train_run(df,median_price_map)

    

update_YearMade START
update_YearMade END


NameError: name 'RandomForestRegressor' is not defined

In [None]:
# Optional: Log feature importances
importances = rf_model.feature_importances_

# Create a zip of feature names and feature importances
feature_zip = zip(rf_model.feature_names_in_, rf_model.feature_importances_)

# Sort the zip by feature importance in descending order
sorted_feature_zip = sorted(feature_zip, key=lambda x: x[1], reverse=True)

# Print the sorted feature names, importances, and index
for idx, (feat, importance) in enumerate(sorted_feature_zip, 1):
  print({f"feature_{idx}_{feat}_importance" : importance})

In [None]:
import pandas as pd
import numpy as np
file_path=r"C:\Users\eitanb\Documents\DS\ML\ML_project\DATA"
# Load validation data
Valid_df = pd.read_csv(file_path+"/Valid.csv")[[
    'SalesID', 'ModelID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
    'saledate', 'ProductSize', 'fiProductClassDesc', 'state', 'ProductGroup',
    'Drive_System', 'Enclosure', 'Transmission', 'Turbocharged',
    'Engine_Horsepower', 'Hydraulics', 'Tire_Size', 'Track_Type',
    'Travel_Controls', 'Differential_Type', 'Steering_Controls', 'fiModelDesc'
]]

# Preserve SalesID before preprocessing
sales_id_col = Valid_df[['SalesID']].copy()


df=function_preperation(Valid_df,file_path,False)

# Model pre-training process
#df = Pre_train_test_model(df)
# Model pre-training process
df = Pre_train_test_model(df,False)

# Ensure index alignment
df.index = sales_id_col.index  # Keep index same as SalesID

# Load training feature names
train_feature_names = rf_model.feature_names_in_  # Retrieves features seen during model training

# Ensure test data has the same columns as training data
for col in train_feature_names:
    if col not in df.columns:
        df[col] = 0  # Add missing features with default value (zero)

# Drop extra columns in test data
df = df[train_feature_names]

# Make predictions
predictions = rf_model.predict(df)

# Ensure matching row count
if len(predictions) != len(df):
    raise ValueError("Mismatch in row count between processed data and predictions!")

# Construct results DataFrame with correct alignment
results = pd.DataFrame({
    "SalesID": sales_id_col["SalesID"].values,  # Retrieve original SalesID
    "SalePrice": predictions  # Reverse log transformation if needed
})

# Save to Excel
results.to_excel("predictions.xlsx", index=False)

print("Predictions saved successfully.")
# Save to Excel
file_path=r"C:\Users\eitanb\Documents\DS\ML\ML_project\DATA"
results.to_csv(file_path+"\predictions_12_2_1250.csv", index=False)