<a href="https://colab.research.google.com/github/colab-ds18/ML-DS18/blob/main/ML_DS18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
#ofer functions
'''
General Funtions Author OA
Date 29/01/2025
'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.ensemble import RandomForestRegressor  # For the Random Forest regression model
import shap
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_log_error

def show_unique_values(df, column_name):
  '''
  Function to list all unique values in a given column of a DataFrame.
  '''
  if column_name not in df.columns:
    raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
    unique_products = list_unique_values(df, column_name)
    print(unique_products)
    null_stats_per_column(df,column_name)


def list_unique_values(df, column_name):
    """
    Function to list all unique values in a given column of a DataFrame.

    Parameters:
    - df: pandas DataFrame
    - column_name: Name of the column to find unique values

    Returns:
    - A list of unique values in the specified column.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    unique_values = df[column_name].unique()
    return unique_values


def update_NaN_To_None_or_Unspecified(df):
    """
    Updates NaN values to 'None or Unspecified' in columns where the value
    'None or Unspecified' is already present.
    """
    print("update_NaN_To_None_or_Unspecified START")

    update_to = 'None or Unspecified'

    for col in df.columns:
        unique_values = list_unique_values(df, col)  # Get unique values for the column

        # Check if 'None or Unspecified' exists and if there are NaN values
        if update_to in unique_values and pd.isnull(unique_values).any():
            df[col] = df[col].fillna(update_to)  # Fill NaN with the specified value
            #print(f"Updated '{col}' column successfully!")
        #else:
        #    print(f"Column '{col}' does not have the required conditions for update.")

    print("update_NaN_To_None_or_Unspecified END")

    return df

def columns_with_nulls(df,only_numeric_columns=True):
  '''
  Function to list all columns with null values in a given DataFrame.
  '''
  print("columns_with_nulls START/END")
  if only_numeric_columns:
    # Get numeric columns only
    numeric_columns = df.select_dtypes(include=['number'])

    # Find numeric columns with null values
    numeric_columns_with_null = numeric_columns.columns[numeric_columns.isnull().any()]
    return numeric_columns_with_null.tolist()
  else:
    return df.columns[df.isnull().any()]


def null_stats_per_column(df,chosen_column):
  '''
  Function to display the number of null rows in a given column of a DataFrame.
  '''
  print("null_stats_per_column START/END")

  # Count the number of null rows in the chosen column
  null_count = df[chosen_column].isnull().sum()

  # Calculate the percentage of nulls in the chosen column
  null_percentage = (null_count / len(df)) * 100

  # Display the result
  print(f"Column: {chosen_column}")
  print(f"Null Count: {null_count}")
  print(f"Null Percentage: {null_percentage:.2f}%")


def missing_values_histogram(df,columns):
    '''
    Function to plot a histogram of missing values in a given DataFrame.
    '''
    data=[{column:df[column].value_counts().sum()} for column in columns ]
    # Convert the list of dictionaries to a DataFrame
    formatted_data = pd.DataFrame([{name:count for single_data in data for name, count in single_data.items()}]).T.reset_index()
    formatted_data.columns = ['Column', 'Count']
    # Create a bar plot
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x='Column', y='Count', data=formatted_data, hue='Column',legend=False)


    # label count on top of each bar
    for index, row in formatted_data.iterrows():
        ax.text(index, row['Count'] + 10, str(row['Count']), color='black', ha="center", fontsize=10)

    # Add title and axis labels
    plt.title("Column Counts in Dataframe")
    plt.xlabel("Column")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def encode_all_categories(df):
  '''
  Function to convert all string columns in a given DataFrame into categories.
  '''
  print("encode_all_categories START")
  for col in df.select_dtypes(['object']):
    df[col] = df[col].astype('category')

  for col in df.select_dtypes(['category']):
    df[col] = df[col].cat.codes

  print("encode_all_categories END")

  return df

def drop_nulls(df):
  '''
  Function to drop all rows with null values in a given DataFrame.
  '''
  df = df.dropna()
  return df

def RMSLE(y_test, y_pred):
    '''
    RSMLE approximates the percent change
    '''
    return np.sqrt(np.mean((np.log(y_pred) - np.log(y_test))**2))

def RMSE(y_, y_pred_):
    '''
    RSME
    '''
    return ((y_ - y_pred_) ** 2).mean() ** 0.5

def train_model(df,column_to_predict,test_size_value=0.3,random_state_value=42):
    '''
    Function to train a Random Forest regression model on a given DataFrame.
    '''
    print("train_model START")

    X = df.drop(columns=[column_to_predict])
    y = df[column_to_predict]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_value, random_state=random_state_value)

    # model = DecisionTreeRegressor(
    # min_samples_leaf=16
    # max_depth=3
    # )
    #model = LinearRegression()
    model = RandomForestRegressor()
    model.fit(X_train, y_train)


    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # 7. print the RMSE accuracy of the baseline (std dev)
    print("RMSE Baseline accuracy:", y_test.std())
    print("Train RMSE:", RMSE(y_train, y_train_pred))
    print("Test RMSE:", RMSE(y_test, y_test_pred))
    #print("Test RMSLE:", RMSLE(y_test, y_test_pred))

    print("train_model END")

    #display(pd.Series(model.feature_importances_, model.feature_names_in_).sort_values(ascending=False))
    return model,(X_train, X_test, y_train, y_test)

def feature_importances(model):
  '''
  Function to display the feature importances of a given Random Forest regression model.
  '''
  print("feature_importances START")

  dict(zip(model.feature_names_in_, model.feature_importances_))
  fi = pd.Series(model.feature_importances_, index=model.feature_names_in_)
  fi = fi.sort_values(ascending=False)
  print("feature_importances END")

  return fi

def explain_model_snap(model,X_test,water_fall_row=0):
  '''
  Function to explain the model's predictions using SHAP values.
  '''
  shap.initjs()
  explainer = shap.Explainer(model)
  explanation = explainer(X_test)  # New style
  shap.summary_plot(explanation, X_test)

  shap.plots.waterfall(explanation[water_fall_row])

  shap.plots.waterfall(explanation[water_fall_row+2])

  shap.plots.partial_dependence('fare', model.predict, X_test, feature_names=X_test.columns)

  shap.plots.heatmap(explanation)

  shap.plots.bar(explanation)


def find_outliers(df, column_to_predict, test_size_value=0.3, random_state_value=42):
    """
    Identify outliers using Isolation Forest and add anomaly-related columns to the DataFrame.

    Parameters:
    - df: Input DataFrame.
    - column_to_predict: Column to exclude for training the Isolation Forest.
    - test_size_value: Test size for train-test split.
    - random_state_value: Random state for reproducibility.

    Returns:
    - A new DataFrame with additional columns: 'anomaly_score', 'anomaly', and 'anomaly_label'.
    """
    print("find_outliers START")

    # Create features (X) and target (y)
    X = pd.get_dummies(df.drop(columns=[column_to_predict]), drop_first=True)
    y = df[column_to_predict]

    # Train Isolation Forest on the entire dataset (no splitting)
    iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=random_state_value)
    iso_forest.fit(X)

    # Predict anomaly scores and labels
    df = df.copy()  # Work on a copy of the DataFrame
    df['anomaly_score'] = iso_forest.decision_function(X)  # Quantitative anomaly score
    df['anomaly'] = iso_forest.predict(X)  # Binary anomaly label (1 for normal, -1 for anomaly)
    df['anomaly_label'] = df['anomaly'].map({1: 'Normal', -1: 'Anomaly'})  # Map labels
    print("find_outliers END")

    return df



def convert_date_columns(df, date_column):
    """
    Convert date column into multiple numeric features like year, month, day, etc.
    Handles potential issues like missing date_column or invalid data.
    """
    print("convert_date_columns START")

    # Ensure the date_column exists in the DataFrame
    if date_column not in df.columns:
        raise KeyError(f"Column '{date_column}' does not exist in the DataFrame.")

    # Convert the column to datetime
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')  # Handle invalid date strings

    # Check if any date values are invalid after conversion
    if df[date_column].isnull().all():
        raise ValueError(f"All values in '{date_column}' could not be converted to datetime.")

    # Extract date features
    df['year'] = df[date_column].dt.year
    df['month'] = df[date_column].dt.month
    df['day'] = df[date_column].dt.day
    df['day_of_week'] = df[date_column].dt.dayofweek
    df['is_weekend'] = df['day_of_week'] >= 5  # Saturday=5, Sunday=6

    # Drop the original date column
    df = df.drop(columns=[date_column])

    print("convert_date_columns END")

    return df

#    model = RandomForestRegressor(random_state=42, n_jobs=-1)
#    xs=(X_train, X_test, y_train, y_test)
def perm_importance_model(model, xs):
    '''
    Function to compute Permutation Importance of a given Random Forest regression model.
    '''
    print("perm_importance_model START")

    X_train, X_test, y_train, y_test = xs

    # Compute Permutation Importance
    perm_importance = permutation_importance(
        model, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1
    )

    # Create the results DataFrame
    importance_model = pd.DataFrame({
        "Feature": model.feature_names_in_,
        "Permutation Importance": perm_importance.importances_mean,
        "Permutation Std Deviation": perm_importance.importances_std,
        "Model Importance": model.feature_importances_,
    })

    # Add ranking for permutation and model importances
    importance_model["Permutation Rank"] = importance_model["Permutation Importance"].rank(ascending=False)
    importance_model["Model Rank"] = importance_model["Model Importance"].rank(ascending=False)

    # Sort by Permutation Importance for display
    importance_model = importance_model.sort_values(by="Permutation Importance", ascending=False)

    # Display the DataFrame
    display(importance_model)

    print("perm_importance_model END")

    return importance_model

def load_dataframe_from_drive():
    """
    Function to load a DataFrame from a Google Drive file path.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    # Load the file from Google Drive
    file_path = "/content/drive/My Drive/Train.csv"  # Replace with the saved file path
    df = pd.read_csv(file_path)
    return df

def summarize_null_columns(dataframe):
    """
    Function to find columns with null values and display detailed information
    including column name, total null count, percentage of nulls, and data type.
    """
    # Find total nulls and percentage of nulls for each column
    columns_with_nulls = dataframe.isnull().sum()  # Total nulls per column
    null_percentage = (columns_with_nulls / len(dataframe)) * 100  # Percentage of nulls

    # Iterate through columns and display only those with null values
    for col in dataframe.columns:
        if columns_with_nulls[col] > 0:  # Check if the column has nulls
            col_dtype = dataframe[col].dtype  # Get column data type
            print(f"Column: {col}")
            print(f"  - Data Type: {col_dtype}")
            print(f"  - Total Null Rows: {columns_with_nulls[col]}")
            print(f"  - Percentage of Nulls: {null_percentage[col]:.2f}%")
            print("-" * 40)

def drop_columns_with_high_nulls(dataframe, threshold=0.05):
    """
    Function to drop columns from the DataFrame that have more than 'threshold' null values.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    threshold (int): The maximum allowable null rows for a column. Columns with nulls > threshold will be dropped.

    Returns:
    pd.DataFrame: A new DataFrame with columns dropped based on the condition.
    """
    print("drop_columns_with_high_nulls START")

    # Identify columns with null counts greater than the threshold
    columns_to_drop = dataframe.columns[dataframe.isnull().sum()/dataframe.shape[0] > threshold]

    print("drop_columns_with_high_nulls END")

    return columns_to_drop


def update_MachineHoursCurrentMeter(df):
  print("update_MachineHoursCurrentMeter START")

  # Update the 'MachineHoursCurrentMeter' column: replace NaN or 0 with 0
  column_name = 'MachineHoursCurrentMeter'

  # Check if the column exists
  if column_name in df.columns:
      df[column_name] = df[column_name].fillna(0)  # Replace NaN with 0
      df[column_name] = df[column_name].replace(0, 0)  # Ensure 0 stays 0
      #print(f"Updated '{column_name}' column successfully!")
  #else:
      #print(f"Column '{column_name}' does not exist in the DataFrame.")

  print("update_MachineHoursCurrentMeter END")

  return df

def update_auctioneerID(df):
  print("update_auctioneerID START")

  # Assuming 'df' is your DataFrame
  # Fill null values in the 'auctioneerID' column with 100.0
  if 'auctioneerID' in df.columns:
      df['auctioneerID'] = df['auctioneerID'].fillna(100.0)
      #print("'auctioneerID' null values filled with 100.0 successfully!")
  #else:
  #    print("Column 'auctioneerID' does not exist in the DataFrame.")
  print("update_auctioneerID END")

  return df

def update_Enclosure(df):
  print("update_Enclosure START")

  # Replace null values in the 'Enclosure' column with 'N/A'
  if 'Enclosure' in df.columns:
      df['Enclosure'] = df['Enclosure'].fillna('None or Unspecified')
      #print("Replaced null values in the 'Enclosure' column with 'N/A'.")
  #else:
      #print("Column 'Enclosure' does not exist in the DataFrame.")

  print("update_Enclosure END")

  return df
'''
# Function to calculate YearMade based on ModelID
def calc_YearMade(df,model_id):
    valid_years = df.loc[(df['ModelID'] == model_id) & (df['YearMade'] > 1000), 'YearMade']
    return int(valid_years.mean()) if not valid_years.empty else 1000  # Default if no valid years exist
'''
# Function to calculate YearMade based on ModelID
def calc_YearMade(df, model_id):
    valid_years = df.loc[(df['ModelID'] == model_id) & (df['YearMade'] > 1000), 'YearMade']
    return int(valid_years.mean()) if not valid_years.empty else 1000  # Default if no valid years exist


def count_total_rows_per_column(df,column_name):
  return df[df[column_name] == 1000].shape[0]

def update_YearMade(df):
  # Update YearMade where it is 1000
  df.loc[df['YearMade'] == 1000, 'YearMade'] = df.loc[df['YearMade'] == 1000, 'ModelID'].apply(lambda model_id: calc_YearMade(df, model_id))
  # Compute the mean excluding rows where YearMade == 1000
  mean_yearmade = df.loc[df['YearMade'] != 1000, 'YearMade'].mean()

  # Update all rows where YearMade == 1000 with the calculated mean
  df.loc[df['YearMade'] == 1000, 'YearMade'] = int(round(mean_yearmade))

  return df

#First run OA
def first_run_25012025():
  df=load_dataframe_from_drive()

  missing_values_histogram(df,df.columns)

  #df.info()

  #df.describe()

  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)

  #print("columns with null values:", columns_with_nulls(df,False))
  #summarize_null_columns(df)

  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)
  model,xs=train_model(df=df,column_to_predict='SalePrice')#took 8 min runtime
  return model,xs

#Second run OA remove cols and outliners
def Second_run_26012025():
  df=load_dataframe_from_drive()
  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)
  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)

  sampled_df = df.sample(n=25000, random_state=42)  # Example: 10,000 rows

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  feature_importances(model)

  sampled_df = sampled_df.drop(columns=['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  sampled_df=find_outliers(df=sampled_df,column_to_predict='SalePrice')

  # Assuming df_with_anomalies is the DataFrame containing anomaly labels
  anomalies = sampled_df[sampled_df['anomaly_label'] == 'Anomaly']

  # Display rows where 'anomaly_label' is 'Anomaly'
  print(anomalies)

  # Optionally, display the count of anomalies
  print(f"Number of anomalies detected: {len(anomalies)}")

  # Remove all rows where 'anomaly_label' is 'Anomaly'
  sampled_df = sampled_df[sampled_df['anomaly_label'] != 'Anomaly']

  # Drop the columns 'anomaly_label', 'anomaly_score', and 'anomaly'
  columns_to_drop = ['anomaly_label', 'anomaly_score', 'anomaly']
  sampled_df = sampled_df.drop(columns=columns_to_drop, errors='ignore')

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  sampled_df = sampled_df.drop(columns=['day','month','state','MachineID','SalesID','ProductGroupDesc'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  return model,xs

#First run OA 27012025
def First_run_27012025():
  df=load_dataframe_from_drive()

  df=update_NaN_To_None_or_Unspecified(df)

  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)
  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)

  sampled_df = df.sample(n=25000, random_state=42)  # Example: 10,000 rows

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  feature_importances(model)

  sampled_df = sampled_df.drop(columns=['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  sampled_df=find_outliers(df=sampled_df,column_to_predict='SalePrice')

  # Assuming df_with_anomalies is the DataFrame containing anomaly labels
  anomalies = sampled_df[sampled_df['anomaly_label'] == 'Anomaly']

  # Display rows where 'anomaly_label' is 'Anomaly'
  print(anomalies)

  # Optionally, display the count of anomalies
  print(f"Number of anomalies detected: {len(anomalies)}")

  # Remove all rows where 'anomaly_label' is 'Anomaly'
  sampled_df = sampled_df[sampled_df['anomaly_label'] != 'Anomaly']

  # Drop the columns 'anomaly_label', 'anomaly_score', and 'anomaly'
  columns_to_drop = ['anomaly_label', 'anomaly_score', 'anomaly']
  sampled_df = sampled_df.drop(columns=columns_to_drop, errors='ignore')

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  sampled_df = sampled_df.drop(columns=['day','month','state','MachineID','SalesID','ProductGroupDesc'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  importance_model=perm_importance_model(model,xs)

  # Drop columns where 'Permutation Importance' is less than 0.02
  importance_model_filtered = importance_model[importance_model['Permutation Importance'] >= 0.02]

  # Display the filtered DataFrame
  print(importance_model_filtered)

  # Optionally, list the dropped columns
  dropped_columns = importance_model[importance_model['Permutation Importance'] < 0.02]['Feature']
  print("Dropped columns:", dropped_columns.tolist())
  sampled_df.drop(columns=dropped_columns, inplace=True)

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  return model,xs


#First run OA 27012025
def First_run_02022025():
  pd.set_option('display.expand_frame_repr', False)

  df=load_dataframe_from_drive()
  print(count_total_rows_per_column(df,'YearMade'))
  # Update YearMade where it is 1000
  df.loc[df['YearMade'] == 1000, 'YearMade'] = df.loc[df['YearMade'] == 1000, 'ModelID'].apply(calc_YearMade)
  print(count_total_rows_per_column(df,'YearMade'))

  df=update_NaN_To_None_or_Unspecified(df)

  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)
  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)

  sampled_df = df.sample(n=25000, random_state=42)  # Example: 10,000 rows

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  feature_importances(model)

  sampled_df = sampled_df.drop(columns=['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  sampled_df=find_outliers(df=sampled_df,column_to_predict='SalePrice')

  # Assuming df_with_anomalies is the DataFrame containing anomaly labels
  anomalies = sampled_df[sampled_df['anomaly_label'] == 'Anomaly']

  # Display rows where 'anomaly_label' is 'Anomaly'
  print(anomalies)

  # Optionally, display the count of anomalies
  print(f"Number of anomalies detected: {len(anomalies)}")

  # Remove all rows where 'anomaly_label' is 'Anomaly'
  sampled_df = sampled_df[sampled_df['anomaly_label'] != 'Anomaly']

  # Drop the columns 'anomaly_label', 'anomaly_score', and 'anomaly'
  columns_to_drop = ['anomaly_label', 'anomaly_score', 'anomaly']
  sampled_df = sampled_df.drop(columns=columns_to_drop, errors='ignore')

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  sampled_df = sampled_df.drop(columns=['day','month','state','MachineID','SalesID','ProductGroupDesc'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  importance_model=perm_importance_model(model,xs)

  # Drop columns where 'Permutation Importance' is less than 0.02
  importance_model_filtered = importance_model[importance_model['Permutation Importance'] >= 0.02]

  # Display the filtered DataFrame
  print(importance_model_filtered)

  # Optionally, list the dropped columns
  dropped_columns = importance_model[importance_model['Permutation Importance'] < 0.02]['Feature']
  print("Dropped columns:", dropped_columns.tolist())
  sampled_df.drop(columns=dropped_columns, inplace=True)

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  return model,xs

def prepare_dataframe(df_pre:pd.DataFrame):
  #START V_2_Valiad.csv
  # Replace NaN/None with 1 before applying log (since log(1) = 0)
  #print("price log start")
  #df_pre['SalePrice'] = df_pre['SalePrice'].fillna(1)
  #df_pre['SalePrice']=np.log(df_pre['SalePrice'])
  #print("price log end")
  df_pre=encode_all_categories(df_pre)
  model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  print()
  print()
  df_pre=update_YearMade(df_pre)
  model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  print()
  print()
  df_pre=update_NaN_To_None_or_Unspecified(df_pre)
  model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  print()
  print()

  #Raised RMSE
  #df_pre=update_MachineHoursCurrentMeter(df_pre)
  #model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  #print()
  #print()

  #Raised RMSE
  #df_pre=update_auctioneerID(df_pre)
  #model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  #print()
  #print()

  #Raised RMSE
  #df_pre=update_Enclosure(df_pre)
  #model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  #print()
  #print()
  df_pre=convert_date_columns(df_pre,'saledate')
  model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  print()
  print()

  'feature_importances'
  '''
  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')
  feature_importances(model)
  '''
  #Raised RMSE
  #df_pre = df_pre.drop(columns=['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist
  #model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  #print()
  #print()


  #Raised RMSE
  #df_pre=drop_nulls(df_pre)
  #model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  #print()
  #print()

  #Raised RMSE
  #drop_columns=drop_columns_with_high_nulls(df_pre)
  #print(drop_columns)
  #df_pre = df_pre.drop(columns=[col for col in drop_columns if col in df_pre.columns])
  #model,xs=train_model(df=df_pre,column_to_predict='SalePrice')
  #print()
  #print()

  #feature_importances(model)
  #perm_importance_model(model,xs)
  columns_to_drop = [
    "day", "month", "year", "day_of_week", "is_weekend",  # Completely zero importance
    "Turbocharged", "Thumb", "Pattern_Changer", "datasource",  # Negative importance
    "Pad_Type", "Backhoe_Mounting", "ProductGroup", "Steering_Controls",
    "Track_Type", "Stick", "Hydraulics_Flow", "Differential_Type",
    "ProductGroupDesc", "Grouser_Tracks", "Undercarriage_Pad_Width"
  ]

  df_pre = df_pre.drop(columns=columns_to_drop, errors='ignore')  # `errors='ignore'` avoids errors if some columns are missing

  return df_pre

import os
def test_model(model):
  file_path = "/content/drive/My Drive/Valid.csv"  # Replace with the saved file path
  df_valid = pd.read_csv(file_path)
  df_valid['SalePrice']=np.log1p(1)
  df_valid=prepare_dataframe(df_valid)
  X = df_valid.drop(columns=['SalePrice'])
  y = df_valid['SalePrice']
  y = model.predict(X)

  # Assign predictions to the correct rows in df_valid
  df_valid.loc[X.index, 'SalePrice'] = y
  #df_valid['SalePrice'] = np.exp(df_valid['SalePrice'])

  file_path = "/content/drive/My Drive/output.csv"  # Save to Google Drive root
  df_valid[['SalesID', 'SalePrice']].to_csv(file_path, index=False)

  print(f"File saved successfully at: {file_path}")

#TEST run OA 02022025
def TEST_run_02022025():
  pd.set_option('display.expand_frame_repr', False)
  df=load_dataframe_from_drive()
  df = df.sample(n=20000, random_state=42)
  df=prepare_dataframe(df)
  model,xs=train_model(df=df,column_to_predict='SalePrice')

  return model,xs



In [None]:
# @title

#erez functions1

def add_age_machine(df):
    # Ensure 'saledate' is in datetime format
    df['saledate'] = pd.to_datetime(df['saledate'], errors='coerce')

    # Extract the year from 'saledate'
    df['saledate_year'] = df['saledate'].dt.year

    # Calculate the machine's age
    df['age_machine'] = df['saledate_year'] - df['YearMade']

    return df

# Apply the function
df = add_age_machine(df)

# Display the DataFrame with the new 'age_machine' column
df.info()
print(df[['YearMade', 'saledate', 'age_machine']])

df

NameError: name 'df' is not defined

In [None]:
# @title
#erez functions2
for col in df.select_dtypes(include=['object']).columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Replace NaN with 0
df[col] = df[col].fillna(0).astype(int)

# Or, drop rows with NaN values
df.dropna(subset=[col], inplace=True)

df[col] = df[col].replace({'\$': '', ',': ''}, regex=True).astype(float)

for col in df.select_dtypes(include=['object']).columns:
    if df[col].str.isnumeric().all():  # Check if all values are numeric
        df[col] = df[col].astype(int)
    else:
        print(f"Cannot convert column {col} to int")

# Convert all object columns that can be converted to integers
for col in df.select_dtypes(include=['object']).columns:
    try:
        df[col] = df[col].astype(int)
    except ValueError:
        print(f"Cannot convert column {col} to int")

def robust_convert(df):
    for col in df.select_dtypes(include=['object']).columns:
        try:
            # Step 1: Remove non-numeric symbols (if any, e.g., '$')
            df[col] = df[col].replace({'\$': '', ',': '', '%': ''}, regex=True)

            # Step 2: Convert to numeric, coerce errors to NaN
            df[col] = pd.to_numeric(df[col], errors='coerce')

            # Step 3: Handle NaN values (e.g., replace with 0 or mean of column)
            df[col] = df[col].fillna(0).astype(int)
        except Exception as e:
            print(f"Failed to convert column {col}: {e}")

    return df

# Apply the robust conversion
df = robust_convert(df)
df.info()

# Separate numerical columns and non-numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
non_numerical_cols = df.select_dtypes(exclude=['float64', 'int64']).columns

# Optionally, convert datetime columns to numerical values
if 'saledate' in non_numerical_cols:
    df['saledate'] = (df['saledate'] - df['saledate'].min()) / pd.Timedelta(days=1)

# Now apply KNNImputer only on numerical columns
imputer = KNNImputer(n_neighbors=5)

# Make sure that you're passing the correct DataFrame subset
numerical_data = df[numerical_cols].copy()  # Avoid modifying the original DataFrame directly

# Perform imputation
imputed_data = imputer.fit_transform(numerical_data)

# Convert imputed data back to DataFrame and ensure column names match
df_imputed = pd.DataFrame(imputed_data, columns=numerical_cols)

# Replace the original columns with the imputed ones
df[numerical_cols] = df_imputed

# For categorical columns, you can impute using the mode
for col in non_numerical_cols:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)

# If you converted datetime columns, convert them back to datetime format
df['saledate'] = pd.to_datetime(df['saledate'], unit='D', origin=pd.Timestamp('2020-01-01'))

# Check for any remaining null values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   SalesID                   401125 non-null  int64  
 1   SalePrice                 401125 non-null  int64  
 2   MachineID                 401125 non-null  int64  
 3   ModelID                   401125 non-null  int64  
 4   datasource                401125 non-null  int64  
 5   auctioneerID              380989 non-null  float64
 6   YearMade                  401125 non-null  int64  
 7   MachineHoursCurrentMeter  142765 non-null  float64
 8   UsageBand                 0 non-null       float64
 9   saledate                  401125 non-null  int32  
 10  fiModelDesc               61647 non-null   float64
 11  fiBaseModel               252195 non-null  float64
 12  fiSecondaryDesc           292 non-null     float64
 13  fiModelSeries             25528 non-null   f

UFuncTypeError: ufunc 'divide' cannot use operands with types dtype('int32') and dtype('<m8[ns]')

In [None]:
# @title
#eeitan function
def calculate_machine_age(df):
    # Ensure that 'SaleDate' is in datetime format
    df['saledate'] = pd.to_datetime(df['saledate'])
    # Extract the year from 'SaleDate'
    df['saledate'] = df['saledate'].dt.year

    # Calculate MachineAge by subtracting 'YearMade' from the 'SaleYear'
    df['machineage'] = df['saleyear'] - df['yearmade']

    # Drop the 'SaleYear' column if you don't need it anymore
    df.drop(columns=['saleyear'], inplace=True)

    return df

df=load_dataframe_from_drive()
df['saledate']
# Example usage:
# Assuming your dataframe is named `df`
df = calculate_machine_age(df)

# Display the dataframe with the new 'MachineAge' column
print(df[['YearMade', 'SaleDate', 'MachineAge']].head())


Mounted at /content/drive


  df = pd.read_csv(file_path)


KeyError: 'saleyear'

In [None]:
#sorel functions

In [None]:
#model,xs=train_model(df=df,column_to_predict='SalePrice')#took 8 min runtime

RMSE Baseline accuracy: 22932.4005340408 Train RMSE: 2864.123815046 Test RMSE: 7633.339248575835 0 Enclosure 0.240869 YearMade 0.147308 fiProductClassDesc 0.117304 fiModelDesc 0.099289 ModelID 0.093105 year 0.080002 SalesID 0.043689 fiBaseModel 0.033948 ProductGroup 0.032850 ProductGroupDesc 0.029777 MachineID 0.023082 day 0.013131 month 0.013108 state 0.012552 auctioneerID 0.006424 day_of_week 0.006194 MachineHoursCurrentMeter 0.005897 datasource 0.000885 is_weekend 0.000586

In [24]:
##ofer runtime
#model,xs=first_run_25012025()
#model,xs=Second_run_26012025()
#model,xs=First_run_27012025()
#model,xs=First_run_02022025()
model,xs=TEST_run_02022025()
print('END TRAIN')
test_model(model)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file_path)


encode_all_categories START
encode_all_categories END
train_model START
RMSE Baseline accuracy: 23250.388216928186
Train RMSE: 4254.52043045425
Test RMSE: 11191.414112496974
train_model END


train_model START
RMSE Baseline accuracy: 23250.388216928186
Train RMSE: 4231.48619601696
Test RMSE: 11043.289510198698
train_model END


update_NaN_To_None_or_Unspecified START
update_NaN_To_None_or_Unspecified END
train_model START
RMSE Baseline accuracy: 23250.388216928186
Train RMSE: 4226.195962309244
Test RMSE: 11088.978659419283
train_model END


convert_date_columns START
convert_date_columns END
train_model START
RMSE Baseline accuracy: 23250.388216928186
Train RMSE: 4238.376599228977
Test RMSE: 11063.030255206631
train_model END


feature_importances START
feature_importances END
perm_importance_model START




Unnamed: 0,Feature,Permutation Importance,Permutation Std Deviation,Model Importance,Permutation Rank,Model Rank
13,ProductSize,0.629484,0.017346,0.22518,1.0,2.0
5,YearMade,0.629215,0.008789,0.242248,2.0,1.0
0,SalesID,0.119427,0.00636,0.08849,3.0,3.0
10,fiSecondaryDesc,0.081704,0.001741,0.061258,4.0,4.0
37,Coupler_System,0.061404,0.003449,0.027951,5.0,9.0
14,fiProductClassDesc,0.037664,0.001418,0.027519,6.0,10.0
27,Blade_Width,0.027998,0.00231,0.038361,7.0,6.0
2,ModelID,0.014808,0.001666,0.034733,8.0,7.0
1,MachineID,0.012681,0.002542,0.043015,9.0,5.0
9,fiBaseModel,0.009772,0.00077,0.03075,10.0,8.0


perm_importance_model END
train_model START
RMSE Baseline accuracy: 23250.388216928186
Train RMSE: 4250.887742530549
Test RMSE: 11058.2615569221
train_model END
END TRAIN
encode_all_categories START
encode_all_categories END
train_model START
RMSE Baseline accuracy: 8.994101826297921e-15
Train RMSE: 3.397282455352979e-14
Test RMSE: 3.397282455352979e-14
train_model END


train_model START
RMSE Baseline accuracy: 8.994101826297921e-15
Train RMSE: 3.441691376337985e-14
Test RMSE: 3.441691376337985e-14
train_model END


update_NaN_To_None_or_Unspecified START
update_NaN_To_None_or_Unspecified END
train_model START
RMSE Baseline accuracy: 8.994101826297921e-15
Train RMSE: 3.419486915845482e-14
Test RMSE: 3.419486915845482e-14
train_model END


convert_date_columns START
convert_date_columns END
train_model START
RMSE Baseline accuracy: 8.994101826297921e-15
Train RMSE: 3.375077994860476e-14
Test RMSE: 3.375077994860476e-14
train_model END


feature_importances START
feature_importances END

Unnamed: 0,Feature,Permutation Importance,Permutation Std Deviation,Model Importance,Permutation Rank,Model Rank
0,SalesID,0.0,0.0,0.0,28.5,28.5
1,MachineID,0.0,0.0,0.0,28.5,28.5
30,Hydraulics,0.0,0.0,0.0,28.5,28.5
31,Pushblock,0.0,0.0,0.0,28.5,28.5
32,Ripper,0.0,0.0,0.0,28.5,28.5
33,Scarifier,0.0,0.0,0.0,28.5,28.5
34,Tip_Control,0.0,0.0,0.0,28.5,28.5
35,Tire_Size,0.0,0.0,0.0,28.5,28.5
36,Coupler,0.0,0.0,0.0,28.5,28.5
37,Coupler_System,0.0,0.0,0.0,28.5,28.5


perm_importance_model END
File saved successfully at: /content/drive/My Drive/output.csv


In [35]:
# @title
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.ensemble import RandomForestRegressor  # For the Random Forest regression model
import shap
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_log_error


file_path = "/content/drive/My Drive/Valid.csv"  # Replace with the saved file path
df_valid = pd.read_csv(file_path)
df_valid['SalePrice']=None
X = df_valid.drop(columns=['SalePrice'])
y = df_valid['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

y_test_pred = model.predict(X_test)
# 7. print the RMSE accuracy of the baseline (std dev)
print("RMSE Baseline accuracy:", y_test.std())
print("Train RMSE:", RMSE(y_train, y_train_pred))
print("Test RMSE:", RMSE(y_test, y_test_pred))



ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Differential_Type
- Drive_System
- Engine_Horsepower
- Grouser_Type
- Hydraulics
- ...
Feature names seen at fit time, yet now missing:
- year


In [None]:
#erez runtime

In [None]:
#eitan runtime

In [None]:
#sorel runtime