<a href="https://colab.research.google.com/github/colab-ds18/ML-DS18/blob/main/ML_DS18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
#ofer functions
'''

General Funtions Author OA
Date 29/01/2025
'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.ensemble import RandomForestRegressor  # For the Random Forest regression model
import shap
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_log_error

def fill_missing_values(df):
    for col in df.select_dtypes(include=['number']):
        df[col] = df[col].fillna(df[col].median())  # Use assignment instead of inplace=True
    for col in df.select_dtypes(include=['object']):
        df[col] = df[col].fillna('None or Unspecified')  # Use assignment instead of inplace=True
    return df  # Return modified DataFrame

def save_dataframe(df,file_name):
  from google.colab import drive
  drive.mount('/content/drive')
  file_path = "/content/drive/My Drive/"+file_name+".csv"
  df.to_csv(file_path, index=False)
  print(f"File saved successfully at: {file_path}")

def load_dataframe_from_drive(file_name='Train'):
    """
    Function to load a DataFrame from a Google Drive file path.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    # Load the file from Google Drive
    file_path = "/content/drive/My Drive/"+file_name+".csv"  # Replace with the saved file path
    df = pd.read_csv(file_path)
    return df

def save_model(model,model_name):
  from google.colab import drive
  drive.mount('/content/drive')

  import joblib

  # Assuming 'rf_model' is your trained RandomForestRegressor
  model_path = '/content/drive/My Drive/'+model_name+'.pkl'
  joblib.dump(model, model_path)

  print(f"Model saved at {model_path}")

def load_model(model_name):
  import joblib

  model = joblib.load('/content/drive/My Drive/'+model_name+'.pkl')
  print("Model loaded successfully!")
  return model

def generate_X_Y_params(df,column_to_predict,test_size_value=0.3,random_state_value=42):
    X = df.drop(columns=[column_to_predict])
    y = df[column_to_predict]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_value, random_state=random_state_value)
    return X_train, X_test, y_train, y_test

def show_unique_values(df, column_name):
  '''
  Function to list all unique values in a given column of a DataFrame.
  '''
  if column_name not in df.columns:
    raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
    unique_products = list_unique_values(df, column_name)
    print(unique_products)
    null_stats_per_column(df,column_name)


def list_unique_values(df, column_name):
    """
    Function to list all unique values in a given column of a DataFrame.

    Parameters:
    - df: pandas DataFrame
    - column_name: Name of the column to find unique values

    Returns:
    - A list of unique values in the specified column.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    unique_values = df[column_name].unique()
    return unique_values


def update_NaN_To_None_or_Unspecified(df):
    """
    Updates NaN values to 'None or Unspecified' in columns where the value
    'None or Unspecified' is already present.
    """
    print("update_NaN_To_None_or_Unspecified START")

    update_to = 'None or Unspecified'

    for col in df.columns:
        unique_values = list_unique_values(df, col)  # Get unique values for the column

        # Check if 'None or Unspecified' exists and if there are NaN values
        if update_to in unique_values and pd.isnull(unique_values).any():
            df[col] = df[col].fillna(update_to)  # Fill NaN with the specified value

    print("update_NaN_To_None_or_Unspecified END")

    return df

def columns_with_nulls(df,only_numeric_columns=True):
  '''
  Function to list all columns with null values in a given DataFrame.
  '''
  print("columns_with_nulls START/END")
  if only_numeric_columns:
    # Get numeric columns only
    numeric_columns = df.select_dtypes(include=['number'])

    # Find numeric columns with null values
    numeric_columns_with_null = numeric_columns.columns[numeric_columns.isnull().any()]
    return numeric_columns_with_null.tolist()
  else:
    return df.columns[df.isnull().any()]


def null_stats_per_column(df,chosen_column):
  '''
  Function to display the number of null rows in a given column of a DataFrame.
  '''
  print("null_stats_per_column START/END")

  # Count the number of null rows in the chosen column
  null_count = df[chosen_column].isnull().sum()

  # Calculate the percentage of nulls in the chosen column
  null_percentage = (null_count / len(df)) * 100

  # Display the result
  print(f"Column: {chosen_column}")
  print(f"Null Count: {null_count}")
  print(f"Null Percentage: {null_percentage:.2f}%")


def missing_values_histogram(df,columns):
    '''
    Function to plot a histogram of missing values in a given DataFrame.
    '''
    data=[{column:df[column].value_counts().sum()} for column in columns ]
    # Convert the list of dictionaries to a DataFrame
    formatted_data = pd.DataFrame([{name:count for single_data in data for name, count in single_data.items()}]).T.reset_index()
    formatted_data.columns = ['Column', 'Count']
    # Create a bar plot
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x='Column', y='Count', data=formatted_data, hue='Column',legend=False)


    # label count on top of each bar
    for index, row in formatted_data.iterrows():
        ax.text(index, row['Count'] + 10, str(row['Count']), color='black', ha="center", fontsize=10)

    # Add title and axis labels
    plt.title("Column Counts in Dataframe")
    plt.xlabel("Column")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def encode_all_categories(df):
  '''
  Function to convert all string columns in a given DataFrame into categories.
  '''
  print("encode_all_categories START")
  for col in df.select_dtypes(['object']):
    df[col] = df[col].astype('category')

  for col in df.select_dtypes(['category']):
    df[col] = df[col].cat.codes

  print("encode_all_categories END")

  return df

def drop_nulls(df):
  '''
  Function to drop all rows with null values in a given DataFrame.
  '''
  df = df.dropna()
  return df

def RMSLE(y_test, y_pred):
    '''
    RSMLE approximates the percent change
    '''
    return np.sqrt(np.mean((np.log(y_pred) - np.log(y_test))**2))

def RMSE(y_, y_pred_):
    '''
    RSME
    '''
    return ((y_ - y_pred_) ** 2).mean() ** 0.5

def train_model(df,column_to_predict,test_size_value=0.3,random_state_value=42):
    '''
    Function to train a Random Forest regression model on a given DataFrame.
    '''
    print("train_model START")

    X = df.drop(columns=[column_to_predict])
    y = df[column_to_predict]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_value, random_state=random_state_value)

    # model = DecisionTreeRegressor(
    # min_samples_leaf=16
    # max_depth=3
    # )
    #model = LinearRegression()
    model = RandomForestRegressor()
    model.fit(X_train, y_train)


    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # 7. print the RMSE accuracy of the baseline (std dev)
    print("RMSE Baseline accuracy:", y_test.std())
    print("Train RMSE:", RMSE(y_train, y_train_pred))
    print("Test RMSE:", RMSE(y_test, y_test_pred))
    #print("Test RMSLE:", RMSLE(y_test, y_test_pred))

    print("train_model END")

    #display(pd.Series(model.feature_importances_, model.feature_names_in_).sort_values(ascending=False))
    return model,(X_train, X_test, y_train, y_test)

def feature_importances(model):
  '''
  Function to display the feature importances of a given Random Forest regression model.
  '''
  print("feature_importances START")

  dict(zip(model.feature_names_in_, model.feature_importances_))
  fi = pd.Series(model.feature_importances_, index=model.feature_names_in_)
  fi = fi.sort_values(ascending=False)
  print("feature_importances END")

  return fi

def explain_model_snap(model,X_test,water_fall_row=0):
  '''
  Function to explain the model's predictions using SHAP values.
  '''
  shap.initjs()
  explainer = shap.Explainer(model)
  explanation = explainer(X_test)  # New style
  shap.summary_plot(explanation, X_test)

  shap.plots.waterfall(explanation[water_fall_row])

  shap.plots.waterfall(explanation[water_fall_row+2])

  shap.plots.partial_dependence('fare', model.predict, X_test, feature_names=X_test.columns)

  shap.plots.heatmap(explanation)

  shap.plots.bar(explanation)

from sklearn.ensemble import IsolationForest
import pandas as pd

def detect_and_filter_anomalies(df, contamination=0.05, random_state=42):
    """
    Detects anomalies in a DataFrame using IsolationForest and returns only normal data.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        contamination (float): The proportion of anomalies (default: 5%).
        random_state (int): Random state for reproducibility.

    Returns:
        normal_df (pd.DataFrame): DataFrame containing only normal rows, without anomaly columns.
    """
    df_outliers = df.copy()  # Copy original DataFrame

    # Select only numerical columns for anomaly detection
    X = df_outliers.select_dtypes(include=['number']).copy()

    # Handle missing values (important to uncomment if NaNs exist)
    X.fillna(X.median(), inplace=True)

    # Initialize IsolationForest
    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
    iso_forest.fit(X)

    # Add anomaly scores and labels
    df_outliers['anomaly_score'] = iso_forest.decision_function(X)  # Quantitative weirdness
    df_outliers['anomaly'] = iso_forest.predict(X)  # Binary anomaly label
    df_outliers['anomaly_label'] = df_outliers['anomaly'].map({1: 'Normal', -1: 'Anomaly'})

    # Keep only normal data
    normal_df = df_outliers[df_outliers['anomaly'] == 1].drop(columns=['anomaly_score', 'anomaly', 'anomaly_label'])

    return normal_df

def convert_date_columns(df, date_column):
    """
    Convert date column into multiple numeric features like year, month, day, etc.
    Handles potential issues like missing date_column or invalid data.
    """
    print("convert_date_columns START")

    # Ensure the date_column exists in the DataFrame
    if date_column not in df.columns:
        raise KeyError(f"Column '{date_column}' does not exist in the DataFrame.")

    # Convert the column to datetime
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')  # Handle invalid date strings

    # Check if any date values are invalid after conversion
    if df[date_column].isnull().all():
        raise ValueError(f"All values in '{date_column}' could not be converted to datetime.")

    # Extract date features
    df['year'] = df[date_column].dt.year
    df['month'] = df[date_column].dt.month
    df['day'] = df[date_column].dt.day
    df['day_of_week'] = df[date_column].dt.dayofweek
    df['is_weekend'] = df['day_of_week'] >= 5  # Saturday=5, Sunday=6

    # Drop the original date column
    df = df.drop(columns=[date_column])

    print("convert_date_columns END")

    return df

#    model = RandomForestRegressor(random_state=42, n_jobs=-1)
#    xs=(X_train, X_test, y_train, y_test)
def perm_importance_model(model, xs):
    '''
    Function to compute Permutation Importance of a given Random Forest regression model.
    '''
    print("perm_importance_model START")

    X_train, X_test, y_train, y_test = xs

    # Compute Permutation Importance
    perm_importance = permutation_importance(
        model, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1
    )

    # Create the results DataFrame
    importance_model = pd.DataFrame({
        "Feature": model.feature_names_in_,
        "Permutation Importance": perm_importance.importances_mean,
        "Permutation Std Deviation": perm_importance.importances_std,
        "Model Importance": model.feature_importances_,
    })

    # Add ranking for permutation and model importances
    importance_model["Permutation Rank"] = importance_model["Permutation Importance"].rank(ascending=False)
    importance_model["Model Rank"] = importance_model["Model Importance"].rank(ascending=False)

    # Sort by Permutation Importance for display
    importance_model = importance_model.sort_values(by="Permutation Importance", ascending=False)

    # Display the DataFrame
    #display(importance_model)

    print("perm_importance_model END")

    return importance_model



def summarize_null_columns(dataframe):
    """
    Function to find columns with null values and display detailed information
    including column name, total null count, percentage of nulls, and data type.
    """
    # Find total nulls and percentage of nulls for each column
    columns_with_nulls = dataframe.isnull().sum()  # Total nulls per column
    null_percentage = (columns_with_nulls / len(dataframe)) * 100  # Percentage of nulls

    # Iterate through columns and display only those with null values
    for col in dataframe.columns:
        if columns_with_nulls[col] > 0:  # Check if the column has nulls
            col_dtype = dataframe[col].dtype  # Get column data type
            print(f"Column: {col}")
            print(f"  - Data Type: {col_dtype}")
            print(f"  - Total Null Rows: {columns_with_nulls[col]}")
            print(f"  - Percentage of Nulls: {null_percentage[col]:.2f}%")
            print("-" * 40)

def drop_columns_with_high_nulls(dataframe, threshold=0.05):
    """
    Function to drop columns from the DataFrame that have more than 'threshold' null values.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame.
    threshold (int): The maximum allowable null rows for a column. Columns with nulls > threshold will be dropped.

    Returns:
    pd.DataFrame: A new DataFrame with columns dropped based on the condition.
    """
    print("drop_columns_with_high_nulls START")

    # Identify columns with null counts greater than the threshold
    columns_to_drop = dataframe.columns[dataframe.isnull().sum()/dataframe.shape[0] > threshold]

    print("drop_columns_with_high_nulls END")

    return columns_to_drop


def update_MachineHoursCurrentMeter(df):
  print("update_MachineHoursCurrentMeter START")

  # Update the 'MachineHoursCurrentMeter' column: replace NaN or 0 with 0
  column_name = 'MachineHoursCurrentMeter'

  # Check if the column exists
  if column_name in df.columns:
      df[column_name] = df[column_name].fillna(0)  # Replace NaN with 0
      df[column_name] = df[column_name].replace(0, 0)  # Ensure 0 stays 0
      #print(f"Updated '{column_name}' column successfully!")
  #else:
      #print(f"Column '{column_name}' does not exist in the DataFrame.")

  print("update_MachineHoursCurrentMeter END")

  return df

def update_auctioneerID(df):
  print("update_auctioneerID START")

  # Assuming 'df' is your DataFrame
  # Fill null values in the 'auctioneerID' column with 100.0
  if 'auctioneerID' in df.columns:
      df['auctioneerID'] = df['auctioneerID'].fillna(100.0)
      #print("'auctioneerID' null values filled with 100.0 successfully!")
  #else:
  #    print("Column 'auctioneerID' does not exist in the DataFrame.")
  print("update_auctioneerID END")

  return df

def update_Enclosure(df):
  print("update_Enclosure START")

  # Replace null values in the 'Enclosure' column with 'N/A'
  if 'Enclosure' in df.columns:
      df['Enclosure'] = df['Enclosure'].fillna('None or Unspecified')
      #print("Replaced null values in the 'Enclosure' column with 'N/A'.")
  #else:
      #print("Column 'Enclosure' does not exist in the DataFrame.")

  print("update_Enclosure END")

  return df
'''
# Function to calculate YearMade based on ModelID
def calc_YearMade(df,model_id):
    valid_years = df.loc[(df['ModelID'] == model_id) & (df['YearMade'] > 1000), 'YearMade']
    return int(valid_years.mean()) if not valid_years.empty else 1000  # Default if no valid years exist
'''
# Function to calculate YearMade based on ModelID
def calc_YearMade(df, model_id):
    valid_years = df.loc[(df['ModelID'] == model_id) & (df['YearMade'] > 1000), 'YearMade']
    return int(valid_years.mean()) if not valid_years.empty else 1000  # Default if no valid years exist


def count_total_rows_per_column(df,column_name):
  return df[df[column_name] == 1000].shape[0]

def update_YearMade(df):
  # Update YearMade where it is 1000
  df.loc[df['YearMade'] == 1000, 'YearMade'] = df.loc[df['YearMade'] == 1000, 'ModelID'].apply(lambda model_id: calc_YearMade(df, model_id))
  # Compute the mean excluding rows where YearMade == 1000
  mean_yearmade = df.loc[df['YearMade'] != 1000, 'YearMade'].mean()

  # Update all rows where YearMade == 1000 with the calculated mean
  df.loc[df['YearMade'] == 1000, 'YearMade'] = int(round(mean_yearmade))

  return df

#First run OA
def first_run_25012025():
  df=load_dataframe_from_drive()

  missing_values_histogram(df,df.columns)

  #df.info()

  #df.describe()

  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)

  #print("columns with null values:", columns_with_nulls(df,False))
  #summarize_null_columns(df)

  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)
  model,xs=train_model(df=df,column_to_predict='SalePrice')#took 8 min runtime
  return model,xs

#Second run OA remove cols and outliners
def Second_run_26012025():
  df=load_dataframe_from_drive()
  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)
  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)

  sampled_df = df.sample(n=25000, random_state=42)  # Example: 10,000 rows

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  feature_importances(model)

  sampled_df = sampled_df.drop(columns=['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  sampled_df=find_outliers(df=sampled_df,column_to_predict='SalePrice')

  # Assuming df_with_anomalies is the DataFrame containing anomaly labels
  anomalies = sampled_df[sampled_df['anomaly_label'] == 'Anomaly']

  # Display rows where 'anomaly_label' is 'Anomaly'
  print(anomalies)

  # Optionally, display the count of anomalies
  print(f"Number of anomalies detected: {len(anomalies)}")

  # Remove all rows where 'anomaly_label' is 'Anomaly'
  sampled_df = sampled_df[sampled_df['anomaly_label'] != 'Anomaly']

  # Drop the columns 'anomaly_label', 'anomaly_score', and 'anomaly'
  columns_to_drop = ['anomaly_label', 'anomaly_score', 'anomaly']
  sampled_df = sampled_df.drop(columns=columns_to_drop, errors='ignore')

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  sampled_df = sampled_df.drop(columns=['day','month','state','MachineID','SalesID','ProductGroupDesc'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  return model,xs

#First run OA 27012025
def First_run_27012025():
  df=load_dataframe_from_drive()

  df=update_NaN_To_None_or_Unspecified(df)

  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)
  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)

  sampled_df = df.sample(n=25000, random_state=42)  # Example: 10,000 rows

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  feature_importances(model)

  sampled_df = sampled_df.drop(columns=['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  sampled_df=find_outliers(df=sampled_df,column_to_predict='SalePrice')

  # Assuming df_with_anomalies is the DataFrame containing anomaly labels
  anomalies = sampled_df[sampled_df['anomaly_label'] == 'Anomaly']

  # Display rows where 'anomaly_label' is 'Anomaly'
  print(anomalies)

  # Optionally, display the count of anomalies
  print(f"Number of anomalies detected: {len(anomalies)}")

  # Remove all rows where 'anomaly_label' is 'Anomaly'
  sampled_df = sampled_df[sampled_df['anomaly_label'] != 'Anomaly']

  # Drop the columns 'anomaly_label', 'anomaly_score', and 'anomaly'
  columns_to_drop = ['anomaly_label', 'anomaly_score', 'anomaly']
  sampled_df = sampled_df.drop(columns=columns_to_drop, errors='ignore')

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  sampled_df = sampled_df.drop(columns=['day','month','state','MachineID','SalesID','ProductGroupDesc'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  importance_model=perm_importance_model(model,xs)

  # Drop columns where 'Permutation Importance' is less than 0.02
  importance_model_filtered = importance_model[importance_model['Permutation Importance'] >= 0.02]

  # Display the filtered DataFrame
  print(importance_model_filtered)

  # Optionally, list the dropped columns
  dropped_columns = importance_model[importance_model['Permutation Importance'] < 0.02]['Feature']
  print("Dropped columns:", dropped_columns.tolist())
  sampled_df.drop(columns=dropped_columns, inplace=True)

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  return model,xs


#First run OA 27012025
def First_run_02022025():
  pd.set_option('display.expand_frame_repr', False)

  df=load_dataframe_from_drive()
  print(count_total_rows_per_column(df,'YearMade'))
  # Update YearMade where it is 1000
  df.loc[df['YearMade'] == 1000, 'YearMade'] = df.loc[df['YearMade'] == 1000, 'ModelID'].apply(calc_YearMade)
  print(count_total_rows_per_column(df,'YearMade'))

  df=update_NaN_To_None_or_Unspecified(df)

  df=update_MachineHoursCurrentMeter(df)
  df=update_auctioneerID(df)
  df = drop_columns_with_high_nulls(df, threshold=500)
  df=update_Enclosure(df)
  df=convert_date_columns(df,'saledate')
  df=encode_all_categories(df)

  sampled_df = df.sample(n=25000, random_state=42)  # Example: 10,000 rows

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  feature_importances(model)

  sampled_df = sampled_df.drop(columns=['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  sampled_df=find_outliers(df=sampled_df,column_to_predict='SalePrice')

  # Assuming df_with_anomalies is the DataFrame containing anomaly labels
  anomalies = sampled_df[sampled_df['anomaly_label'] == 'Anomaly']

  # Display rows where 'anomaly_label' is 'Anomaly'
  print(anomalies)

  # Optionally, display the count of anomalies
  print(f"Number of anomalies detected: {len(anomalies)}")

  # Remove all rows where 'anomaly_label' is 'Anomaly'
  sampled_df = sampled_df[sampled_df['anomaly_label'] != 'Anomaly']

  # Drop the columns 'anomaly_label', 'anomaly_score', and 'anomaly'
  columns_to_drop = ['anomaly_label', 'anomaly_score', 'anomaly']
  sampled_df = sampled_df.drop(columns=columns_to_drop, errors='ignore')

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  perm_importance_model(model,xs)

  sampled_df = sampled_df.drop(columns=['day','month','state','MachineID','SalesID','ProductGroupDesc'], errors='ignore')  # 'errors="ignore"' prevents errors if columns don't exist

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  importance_model=perm_importance_model(model,xs)

  # Drop columns where 'Permutation Importance' is less than 0.02
  importance_model_filtered = importance_model[importance_model['Permutation Importance'] >= 0.02]

  # Display the filtered DataFrame
  print(importance_model_filtered)

  # Optionally, list the dropped columns
  dropped_columns = importance_model[importance_model['Permutation Importance'] < 0.02]['Feature']
  print("Dropped columns:", dropped_columns.tolist())
  sampled_df.drop(columns=dropped_columns, inplace=True)

  model,xs=train_model(df=sampled_df,column_to_predict='SalePrice')

  return model,xs

import pandas as pd

def update_NaN_in_numeric_cols(df, column_name):
    """
    Fill NaN values in a specified column with its median while keeping the original data type.

    Parameters:
    - df: The input DataFrame.
    - column_name: The name of the column to update.

    Returns:
    - Updated DataFrame.
    """
    print("update_NaN_in_numeric_cols START - ", column_name)

    if column_name in df.columns:
        fill_value = pd.to_numeric(df[column_name], errors='coerce').median()  # Get median after coercing
        df[column_name] = df[column_name].fillna(fill_value)  # Fill NaN with median while preserving dtype

    print("update_NaN_in_numeric_cols END - ", column_name)
    return df

def check_convertible_to_numeric(df):
    """
    Identify columns that can be converted to numeric and contain NaN values.

    Parameters:
    - df: The input DataFrame.

    Returns:
    - A list of column names that can be converted to numeric and contain NaNs.
    """
    convertible_cols = []

    for col in df.columns:
        try:
            # Convert column to numeric (coerce errors to NaN)
            converted_col = pd.to_numeric(df[col], errors='coerce')

            # If NaNs appear after conversion, store column name
            if converted_col.isnull().sum() > 0:
                convertible_cols.append(col)

        except Exception:
            pass  # Skip non-convertible columns

    return convertible_cols


def encode_and_impute(df):
    '''
    Function to convert categorical columns into category codes
    and impute missing values in numerical columns.
    '''
    print("Data Preprocessing START")

    # Handling categorical variables
    for col in df.select_dtypes(['object']):
        df[col] = df[col].fillna("Unknown").astype('category')

    for col in df.select_dtypes(['category']):
        df[col] = df[col].cat.codes  # Convert categories to numerical codes

    # Handling numerical missing values
    for col in df.select_dtypes(['number']):
        df[col] = df[col].fillna(df[col].median())  # Fill NaNs with median

    print("Data Preprocessing END")
    return df

import pandas as pd

def analyze_column(df, col_name):
    """
    Analyzes a specified column in a DataFrame.
    - If categorical and unique values < 30, print unique values as a list.
    - If numeric, show the 25th percentile, 75th percentile, and list values below the 25th and above the 75th percentile.
    - Show value counts for all columns.
    - Count NaN values per column.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column.
        col_name (str): The column to analyze.
    """
    if col_name not in df.columns:
        print(f"Column '{col_name}' not found in DataFrame.")
        return

    print(f"Analysis for column: {col_name}\n" + "-" * 40)

    # Count NaN values
    nan_count = df[col_name].isna().sum()
    print(f"NaN count: {nan_count}\n")

    if df[col_name].dtype == 'object' or df[col_name].nunique() < 30:
        # Categorical or few unique values
        unique_values = df[col_name].unique().tolist()
        print(f"Unique values ({len(unique_values)}): {unique_values}\n")
    else:
        # Numeric column
        q25 = df[col_name].quantile(0.15)
        q75 = df[col_name].quantile(0.85)

        print(f"25th percentile: {q25}")
        print(f"75th percentile: {q75}\n")

        low_values = df[df[col_name] < q25][col_name].tolist()
        high_values = df[df[col_name] > q75][col_name].tolist()

        print(f"Values below 25th percentile ({len(low_values)}): {low_values}")
        print(f"Values above 75th percentile ({len(high_values)}): {high_values}\n")

    # Value counts
    print(f"Value counts:\n{df[col_name].value_counts()}\n")

def update_col_NaN(df,column_name,update_to):
   df[column_name].fillna()

def prepare_dataframe(df_pre:pd.DataFrame):
  '''04/02/2025 Start'''
  df_pre=update_YearMade(df_pre)
  print()
  df_pre=update_NaN_To_None_or_Unspecified(df_pre)
  print()

  '''As of 05022025 use update_NaN_in_numeric_cols'''
  #df_pre=update_MachineHoursCurrentMeter(df_pre)
  #print()

  df_pre=update_auctioneerID(df_pre)
  print()

  df_pre=update_Enclosure(df_pre)
  print()

  df_pre=convert_date_columns(df_pre,'saledate')
  print()


  '''Start changes 05-02-2025'''
  update_col_NaN(df_pre,'ProductSize','None or Unspecified')

  df_pre=encode_and_impute(df_pre)
  '''End changes 05-02-2025'''

  #df_pre=encode_all_categories(df_pre)
  '''04/02/2025 End'''

  return df_pre

def run_test(model):
  file_path = "/content/drive/My Drive/Valid.csv"  # Replace with the saved file path
  df_valid = pd.read_csv(file_path)
  df_valid['SalePrice']=None
  df_valid=prepare_dataframe(df_valid)
  X = df_valid.drop(columns=['SalePrice'])
  y = df_valid['SalePrice']
  y = model.predict(X)

  df_valid.loc[X.index, 'SalePrice'] = y

  file_path = "/content/drive/My Drive/output.csv"  # Save to Google Drive root
  df_valid[['SalesID', 'SalePrice']].to_csv(file_path, index=False)

  print(f"File saved successfully at: {file_path}")

def run_train():
  pd.set_option('display.expand_frame_repr', False)

  df=load_dataframe_from_drive()
  #for col in df.columns[56:88]:
  #  analyze_column(df,col)

  #df = df.sample(n=20000, random_state=42)

  df=prepare_dataframe(df)

  #model,xs=train_model(df=df,column_to_predict='SalePrice')


  '''
  df=load_dataframe_from_drive('04022025_df')
  model=load_model('04022025_model')
  xs=generate_X_Y_params(df,'SalePrice')
  '''

  return model,xs


In [45]:
print('Starting.....')
model,xs=run_train()
print('Testing...')
#run_test(model)


Starting.....
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file_path)


Testing...
