<a href="https://colab.research.google.com/github/colab-ds18/ML-DS18/blob/main/ML_DS18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''
General Functions. Author: OA
Date 29/01/2025
'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.ensemble import RandomForestRegressor  # For the Random Forest regression model
import shap
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_log_error

def show_unique_values(df, column_name):
    """
    Print the unique values of a column followed by its null statistics.

    Parameters:
    - df: pandas DataFrame
    - column_name: Name of the column to inspect

    Raises:
    - ValueError: if column_name is not a column of df.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    # The reporting must live outside the `if` body: placed after the `raise`
    # inside it, these statements can never execute.
    unique_products = list_unique_values(df, column_name)
    print(unique_products)
    null_stats_per_column(df, column_name)


def list_unique_values(df, column_name):
    """
    Return the distinct values found in one column of a DataFrame.

    Parameters:
    - df: pandas DataFrame
    - column_name: Name of the column to inspect

    Returns:
    - The result of `df[column_name].unique()` (distinct values of the column).

    Raises:
    - ValueError: if column_name is not a column of df.
    """
    if column_name in df.columns:
        return df[column_name].unique()

    raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")


def update_NaN_To_None_or_Unspecified(df):
    """
    Fill missing values with 'None or Unspecified', but only in columns that
    already use that exact value somewhere.

    The DataFrame is modified in place and also returned. One status line is
    printed per column, whether or not it was updated.
    """
    placeholder = 'None or Unspecified'

    for column in df.columns:
        distinct = list_unique_values(df, column)

        # Skip unless the placeholder is already used AND something is missing
        if placeholder not in distinct or not pd.isnull(distinct).any():
            print(f"Column '{column}' does not have the required conditions for update.")
            continue

        df[column] = df[column].fillna(placeholder)
        print(f"Updated '{column}' column successfully!")

    return df

def columns_with_nulls(df, only_numeric_columns=True):
    """
    List the names of the columns that contain at least one null value.

    Parameters:
    - df: pandas DataFrame
    - only_numeric_columns: when True (default) only numeric columns are
      considered; when False every column is considered.

    Returns:
    - A plain Python list of column names, in DataFrame order. Both modes
      return a list so callers get the same type regardless of the flag.
    """
    candidates = df.select_dtypes(include=['number']) if only_numeric_columns else df
    has_null = candidates.isnull().any()
    return candidates.columns[has_null].tolist()


def null_stats_per_column(df, chosen_column):
    """
    Print the column name, how many of its rows are null, and what
    percentage of all rows that represents.
    """
    missing = df[chosen_column].isnull().sum()
    share = (missing / len(df)) * 100

    report = (
        f"Column: {chosen_column}",
        f"Null Count: {missing}",
        f"Null Percentage: {share:.2f}%",
    )
    print(*report, sep="\n")


def missing_values_histogram(df,columns):
    """
    Draw a bar chart with one bar per requested column.

    Each bar's height is the sum of that column's `value_counts()`, i.e. the
    number of counted (non-missing) entries, so shorter bars indicate columns
    with more missing data. The count is also written above each bar.
    """
    counts_by_column = {name: df[name].value_counts().sum() for name in columns}
    formatted_data = pd.DataFrame({
        'Column': list(counts_by_column),
        'Count': list(counts_by_column.values()),
    })

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=formatted_data, x='Column', y='Count', hue='Column', legend=False)

    # Annotate every bar with its count, slightly above the bar top
    for position, count in enumerate(formatted_data['Count']):
        ax.text(position, count + 10, str(count), color='black', ha="center", fontsize=10)

    plt.title("Column Counts in Dataframe")
    plt.xlabel("Column")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def encode_all_categories(df):
    """
    Replace every object or category column with its integer category codes.

    The DataFrame is modified in place and also returned. Missing values
    receive the code -1 (pandas' code for "no category").
    """
    text_like = df.select_dtypes(include=['object', 'category']).columns
    for name in text_like:
        # astype('category') is a no-op for columns that are already categorical
        df[name] = df[name].astype('category').cat.codes

    return df

def drop_nulls(df):
    """
    Return a new DataFrame without the rows that contain any null value.
    The input DataFrame is left untouched.
    """
    return df.dropna()

def RMSLE(y_test, y_pred):
    """
    Root Mean Squared Logarithmic Error.

    Computed as sqrt(mean((log(1 + y_pred) - log(1 + y_test))**2)); for small
    errors it approximates the relative (percent) difference.

    Parameters:
    - y_test: true target values (array-like, values > -1)
    - y_pred: predicted values (array-like, same length, values > -1)

    Returns:
    - The RMSLE as a float.
    """
    # log1p (log(1 + x)) is the standard RMSLE transform and keeps the metric
    # finite when a true or predicted value is exactly 0.
    log_true = np.log1p(np.asarray(y_test, dtype=float))
    log_pred = np.log1p(np.asarray(y_pred, dtype=float))
    return np.sqrt(np.mean((log_pred - log_true) ** 2))

def RMSE(y_, y_pred_):
    """
    Root Mean Squared Error between true values `y_` and predictions `y_pred_`.
    Inputs must support element-wise arithmetic and `.mean()` (e.g. NumPy
    arrays or pandas Series).
    """
    residuals = y_ - y_pred_
    mean_squared = (residuals ** 2).mean()
    return mean_squared ** 0.5

def train_model(df,column_to_predict,test_size_value=0.3,random_state_value=42):
    """
    Train a Random Forest regressor to predict one column from all the others.

    Parameters:
    - df: DataFrame holding the features and the target column.
    - column_to_predict: name of the target column.
    - test_size_value: share of the rows held out for testing.
    - random_state_value: seed used for both the train/test split and the
      forest, so repeated runs on the same data give the same model.

    Prints the baseline (standard deviation of the test target), the train
    and test RMSE and the test RMSLE.

    Returns:
    - (model, (X_train, X_test, y_train, y_test))
    """
    X = df.drop(columns=[column_to_predict])
    y = df[column_to_predict]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size_value, random_state=random_state_value
    )

    # Seed the forest as well: with only the split seeded, metrics and
    # importances still change from run to run. n_jobs=-1 asks scikit-learn
    # to build the trees on all available cores.
    model = RandomForestRegressor(random_state=random_state_value, n_jobs=-1)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # The std of the target is the RMSE of always predicting the mean,
    # which makes it a natural baseline for the scores below.
    print("RMSE Baseline accuracy:", y_test.std())
    print("Train RMSE:", RMSE(y_train, y_train_pred))
    print("Test RMSE:", RMSE(y_test, y_test_pred))
    print("Test RMSLE:", RMSLE(y_test, y_test_pred))

    return model,(X_train, X_test, y_train, y_test)

def feature_importances(model):
    """
    Print and return the model's feature importances, largest first.

    Reads `model.feature_importances_` and `model.feature_names_in_` and
    returns a pandas Series indexed by feature name.
    """
    ranked = pd.Series(
        model.feature_importances_, index=model.feature_names_in_
    ).sort_values(ascending=False)
    print(ranked)

    return ranked

def explain_model_snap(model,X_test,water_fall_row=0,dependence_feature='fare'):
    """
    Explain the model's predictions on X_test with a series of SHAP plots.

    Parameters:
    - model: fitted model accepted by `shap.Explainer`, exposing `predict`.
    - X_test: DataFrame of rows to explain.
    - water_fall_row: positional row for the first waterfall plot; a second
      waterfall is drawn for row `water_fall_row + 2` when that row exists.
    - dependence_feature: column used for the partial dependence plot. The
      plot is skipped (with a message) when X_test has no such column.

    Plots shown, in order: summary, waterfall(s), partial dependence,
    heatmap, bar.
    """
    shap.initjs()
    explainer = shap.Explainer(model)
    explanation = explainer(X_test)  # New style
    shap.summary_plot(explanation, X_test)

    shap.plots.waterfall(explanation[water_fall_row])

    # Only draw the second waterfall if the row is inside the data
    second_row = water_fall_row + 2
    if second_row < len(X_test):
        shap.plots.waterfall(explanation[second_row])

    # A fixed feature name fails on any dataset without that column, so the
    # feature is configurable and the plot is skipped when it is absent.
    if dependence_feature in X_test.columns:
        shap.plots.partial_dependence(dependence_feature, model.predict, X_test, feature_names=X_test.columns)
    else:
        print(f"Skipping partial dependence plot: column '{dependence_feature}' is not in X_test.")

    shap.plots.heatmap(explanation)

    shap.plots.bar(explanation)


def find_outliers(df, column_to_predict, test_size_value=0.3, random_state_value=42,
                  contamination=0.01, n_estimators=100):
    """
    Identify outliers using Isolation Forest and add anomaly-related columns to the DataFrame.

    Parameters:
    - df: Input DataFrame.
    - column_to_predict: Column excluded from the features the forest is fitted on.
    - test_size_value: Unused (no train/test split is performed); kept so
      existing callers keep working.
    - random_state_value: Random state for reproducibility.
    - contamination: Value passed to IsolationForest's `contamination`
      (expected share of anomalies).
    - n_estimators: Number of trees in the Isolation Forest.

    Returns:
    - A copy of df with additional columns: 'anomaly_score', 'anomaly'
      (1 for normal, -1 for anomaly) and 'anomaly_label' ('Normal' / 'Anomaly').
    """
    # Features: everything except the target, one-hot encoded
    X = pd.get_dummies(df.drop(columns=[column_to_predict]), drop_first=True)

    # Fit on the entire dataset (no splitting)
    iso_forest = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        random_state=random_state_value,
    )
    iso_forest.fit(X)

    # Annotate a copy so the caller's DataFrame is left untouched
    df = df.copy()
    df['anomaly_score'] = iso_forest.decision_function(X)  # Quantitative anomaly score
    df['anomaly'] = iso_forest.predict(X)  # Binary anomaly label (1 for normal, -1 for anomaly)
    df['anomaly_label'] = df['anomaly'].map({1: 'Normal', -1: 'Anomaly'})  # Map labels

    return df



def convert_date_columns(df, date_column):
    """
    Replace a date column with numeric/boolean features derived from it.

    Parameters:
    - df: Input DataFrame. It is NOT modified.
    - date_column: Name of the column holding the dates.

    Returns:
    - A new DataFrame without `date_column` and with the added columns
      'year', 'month', 'day', 'day_of_week' (Monday=0) and 'is_weekend'.
      Values that cannot be parsed as dates become missing in these columns.

    Raises:
    - KeyError: if `date_column` is not a column of df.
    - ValueError: if no value in `date_column` can be parsed as a date.
    """
    if date_column not in df.columns:
        raise KeyError(f"Column '{date_column}' does not exist in the DataFrame.")

    # Parse into a separate Series so the caller's DataFrame is not altered;
    # unparseable values become NaT instead of raising.
    dates = pd.to_datetime(df[date_column], errors='coerce')

    if dates.isnull().all():
        raise ValueError(f"All values in '{date_column}' could not be converted to datetime.")

    # drop() returns a new DataFrame, so the assignments below only touch the result
    result = df.drop(columns=[date_column])
    result['year'] = dates.dt.year
    result['month'] = dates.dt.month
    result['day'] = dates.dt.day
    result['day_of_week'] = dates.dt.dayofweek
    result['is_weekend'] = result['day_of_week'] >= 5  # Saturday=5, Sunday=6

    return result

#    model = RandomForestRegressor(random_state=42, n_jobs=-1)
#    xs=(X_train, X_test, y_train, y_test)
def perm_importance_model(model, xs):
    """
    Compare permutation importance (measured on the test split) with the
    model's own feature importances.

    Parameters:
    - model: fitted model exposing `feature_names_in_` and `feature_importances_`.
    - xs: tuple (X_train, X_test, y_train, y_test); only the test part is used.

    Displays and returns a DataFrame sorted by permutation importance
    (descending) that also carries a rank for each of the two measures.
    """
    _, X_test, _, y_test = xs

    result = permutation_importance(
        model, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1
    )

    table = pd.DataFrame({
        "Feature": model.feature_names_in_,
        "Permutation Importance": result.importances_mean,
        "Permutation Std Deviation": result.importances_std,
        "Model Importance": model.feature_importances_,
    })

    # Rank 1 = most important, for each importance measure
    rank_columns = (
        ("Permutation Rank", "Permutation Importance"),
        ("Model Rank", "Model Importance"),
    )
    for rank_name, source_name in rank_columns:
        table[rank_name] = table[source_name].rank(ascending=False)

    table = table.sort_values(by="Permutation Importance", ascending=False)

    display(table)

    return table

def load_dataframe_from_drive(file_path="/content/drive/My Drive/Train.csv",
                              mount_point='/content/drive'):
    """
    Mount Google Drive (Colab only) and load a CSV file from it.

    Parameters:
    - file_path: path of the CSV file inside the mounted drive.
    - mount_point: directory where Google Drive is mounted.

    Returns:
    - The CSV content as a pandas DataFrame.
    """
    # Imported here because google.colab only exists inside Colab runtimes
    from google.colab import drive
    drive.mount(mount_point)

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which avoids mixed-type columns.
    df = pd.read_csv(file_path, low_memory=False)
    return df

def summarize_null_columns(dataframe):
    """
    Print a short report for every column that has at least one null value:
    column name, data type, number of null rows and percentage of null rows.
    Columns without nulls are skipped. Nothing is returned.
    """
    null_counts = dataframe.isnull().sum()
    total_rows = len(dataframe)
    separator = "-" * 40

    for name in dataframe.columns:
        missing = null_counts[name]
        if missing <= 0:
            continue  # nothing to report for fully populated columns

        share = (missing / total_rows) * 100
        print(f"Column: {name}")
        print(f"  - Data Type: {dataframe[name].dtype}")
        print(f"  - Total Null Rows: {missing}")
        print(f"  - Percentage of Nulls: {share:.2f}%")
        print(separator)

def drop_columns_with_high_nulls(dataframe, threshold=500):
    """
    Remove the columns whose number of null rows exceeds `threshold`.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame (left unchanged).
    threshold (int): Largest null count a column may have and still be kept.

    Returns:
    pd.DataFrame: A new DataFrame without the over-threshold columns.
    The names of the dropped columns are printed.
    """
    null_counts = dataframe.isnull().sum()
    too_sparse = null_counts[null_counts > threshold].index

    cleaned_dataframe = dataframe.drop(columns=too_sparse)

    print(f"Dropped columns: {list(too_sparse)}")
    return cleaned_dataframe

def update_MachineHoursCurrentMeter(df):
    """
    Fill missing values in the 'MachineHoursCurrentMeter' column with 0.

    The DataFrame is modified in place and also returned. If the column does
    not exist, a message is printed and the DataFrame is returned unchanged.
    """
    column_name = 'MachineHoursCurrentMeter'

    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the DataFrame.")
        return df

    # Only NaN needs handling: existing zeros are already the target value,
    # so no separate "replace 0 with 0" pass is required.
    df[column_name] = df[column_name].fillna(0)
    print(f"Updated '{column_name}' column successfully!")

    return df

def update_auctioneerID(df):
    """
    Fill missing values in the 'auctioneerID' column with 100.0.

    The DataFrame is modified in place and also returned. If the column does
    not exist, a message is printed and the DataFrame is returned unchanged.
    """
    if 'auctioneerID' not in df.columns:
        print("Column 'auctioneerID' does not exist in the DataFrame.")
        return df

    df['auctioneerID'] = df['auctioneerID'].fillna(100.0)
    print("'auctioneerID' null values filled with 100.0 successfully!")
    return df

def update_Enclosure(df):
    """
    Fill missing values in the 'Enclosure' column with the string 'N/A'.

    The DataFrame is modified in place and also returned. If the column does
    not exist, a message is printed and the DataFrame is returned unchanged.
    """
    if 'Enclosure' not in df.columns:
        print("Column 'Enclosure' does not exist in the DataFrame.")
        return df

    df['Enclosure'] = df['Enclosure'].fillna('N/A')
    print("Replaced null values in the 'Enclosure' column with 'N/A'.")
    return df

#First run OA
def first_run_25012025():
    """
    Baseline experiment: load the data, plot per-column counts, fill or drop
    columns with missing values, expand the sale date, encode categorical
    columns and train a model on the full dataset to predict 'SalePrice'.

    Returns:
    - (model, xs) exactly as returned by train_model.
    """
    data = load_dataframe_from_drive()

    # Overview of how populated each column is before any cleaning
    missing_values_histogram(data, data.columns)

    for fill_step in (update_MachineHoursCurrentMeter, update_auctioneerID):
        data = fill_step(data)

    data = drop_columns_with_high_nulls(data, threshold=500)
    data = update_Enclosure(data)
    data = convert_date_columns(data, 'saledate')
    data = encode_all_categories(data)

    # Training on the full dataset took about 8 minutes when this was written
    return train_model(df=data, column_to_predict='SalePrice')

#Second run OA: remove columns and outliers
def Second_run_26012025():
    """
    Second experiment, run on a 25,000-row sample: train, drop a first set of
    columns, remove Isolation Forest anomalies, then drop a second set of
    columns, retraining after each step and printing/plotting diagnostics.

    Returns:
    - (model, xs) from the final train_model call.
    """
    target = 'SalePrice'

    # --- Cleaning and encoding -------------------------------------------
    frame = load_dataframe_from_drive()
    for fill_step in (update_MachineHoursCurrentMeter, update_auctioneerID):
        frame = fill_step(frame)
    frame = drop_columns_with_high_nulls(frame, threshold=500)
    frame = update_Enclosure(frame)
    frame = convert_date_columns(frame, 'saledate')
    frame = encode_all_categories(frame)

    # Work on a fixed random sample of 25,000 rows to keep training fast
    sample = frame.sample(n=25000, random_state=42)

    # --- Round 1: all remaining columns ----------------------------------
    model, xs = train_model(df=sample, column_to_predict=target)
    feature_importances(model)

    # --- Round 2: without the first set of columns -----------------------
    first_drop = ['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID']
    sample = sample.drop(columns=first_drop, errors='ignore')  # missing columns are skipped
    model, xs = train_model(df=sample, column_to_predict=target)

    # --- Round 3: without anomalous rows ---------------------------------
    sample = find_outliers(df=sample, column_to_predict=target)
    is_anomaly = sample['anomaly_label'] == 'Anomaly'
    anomalies = sample[is_anomaly]
    print(anomalies)
    print(f"Number of anomalies detected: {len(anomalies)}")

    # Keep only normal rows and remove the helper columns find_outliers added
    sample = sample[~is_anomaly]
    sample = sample.drop(columns=['anomaly_label', 'anomaly_score', 'anomaly'], errors='ignore')

    model, xs = train_model(df=sample, column_to_predict=target)
    perm_importance_model(model, xs)

    # --- Round 4: without the second set of columns ----------------------
    second_drop = ['day','month','state','MachineID','SalesID','ProductGroupDesc']
    sample = sample.drop(columns=second_drop, errors='ignore')  # missing columns are skipped

    model, xs = train_model(df=sample, column_to_predict=target)
    perm_importance_model(model, xs)

    return model, xs

#First run OA 27012025
def First_run_27012025():
    """
    Third experiment: same flow as the 26/01 run, preceded by filling NaN
    with 'None or Unspecified' where that value is already used, and followed
    by one more round that drops every feature whose permutation importance
    is below 0.02 before the final training.

    Returns:
    - (model, xs) from the final train_model call.
    """
    target = 'SalePrice'
    min_importance = 0.02

    # --- Cleaning and encoding -------------------------------------------
    frame = load_dataframe_from_drive()
    frame = update_NaN_To_None_or_Unspecified(frame)  # also updates in place
    for fill_step in (update_MachineHoursCurrentMeter, update_auctioneerID):
        frame = fill_step(frame)
    frame = drop_columns_with_high_nulls(frame, threshold=500)
    frame = update_Enclosure(frame)
    frame = convert_date_columns(frame, 'saledate')
    frame = encode_all_categories(frame)

    # Work on a fixed random sample of 25,000 rows to keep training fast
    sample = frame.sample(n=25000, random_state=42)

    # --- Round 1: all remaining columns ----------------------------------
    model, xs = train_model(df=sample, column_to_predict=target)
    feature_importances(model)

    # --- Round 2: without the first set of columns -----------------------
    first_drop = ['is_weekend','datasource','MachineHoursCurrentMeter','day_of_week','auctioneerID']
    sample = sample.drop(columns=first_drop, errors='ignore')  # missing columns are skipped
    model, xs = train_model(df=sample, column_to_predict=target)

    # --- Round 3: without anomalous rows ---------------------------------
    sample = find_outliers(df=sample, column_to_predict=target)
    is_anomaly = sample['anomaly_label'] == 'Anomaly'
    anomalies = sample[is_anomaly]
    print(anomalies)
    print(f"Number of anomalies detected: {len(anomalies)}")

    # Keep only normal rows and remove the helper columns find_outliers added
    sample = sample[~is_anomaly]
    sample = sample.drop(columns=['anomaly_label', 'anomaly_score', 'anomaly'], errors='ignore')

    model, xs = train_model(df=sample, column_to_predict=target)
    perm_importance_model(model, xs)

    # --- Round 4: without the second set of columns ----------------------
    second_drop = ['day','month','state','MachineID','SalesID','ProductGroupDesc']
    sample = sample.drop(columns=second_drop, errors='ignore')  # missing columns are skipped

    model, xs = train_model(df=sample, column_to_predict=target)
    importance = perm_importance_model(model, xs)

    # --- Round 5: keep only features with enough permutation importance --
    scores = importance['Permutation Importance']
    print(importance[scores >= min_importance])

    dropped = importance.loc[scores < min_importance, 'Feature']
    print("Dropped columns:", dropped.tolist())
    sample = sample.drop(columns=dropped)

    model, xs = train_model(df=sample, column_to_predict=target)

    return model, xs


'''
import pandas as pd

# Compute correlation matrix
corr_matrix = df.corr()

# Find highly correlated features (e.g., correlation > 0.85)
high_corr_features = set()
threshold = 0.85

for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            colname = corr_matrix.columns[i]
            high_corr_features.add(colname)

print("Highly correlated features:", high_corr_features)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

corr_matrix = df.corr()
threshold = 0.60

# Keep only correlations whose absolute value is above the threshold (and below 1, which excludes self-correlation)
filtered_corr = corr_matrix[(abs(corr_matrix) > threshold) & (abs(corr_matrix) < 1)]

# Plot heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(filtered_corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Example Correlation Heatmap")
plt.show()
'''

#erez functions


In [None]:
#erez runtime

In [None]:
# Earlier experiments, kept commented out for reference:
#model,xs=first_run_25012025()
#model,xs=Second_run_26012025()
# Run the 27/01/2025 pipeline and keep the final model and its train/test split.
model,xs=First_run_27012025()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file_path)


Column 'SalesID' does not have the required conditions for update.
Column 'SalePrice' does not have the required conditions for update.
Column 'MachineID' does not have the required conditions for update.
Column 'ModelID' does not have the required conditions for update.
Column 'datasource' does not have the required conditions for update.
Column 'auctioneerID' does not have the required conditions for update.
Column 'YearMade' does not have the required conditions for update.
Column 'MachineHoursCurrentMeter' does not have the required conditions for update.
Column 'UsageBand' does not have the required conditions for update.
Column 'saledate' does not have the required conditions for update.
Column 'fiModelDesc' does not have the required conditions for update.
Column 'fiBaseModel' does not have the required conditions for update.
Column 'fiSecondaryDesc' does not have the required conditions for update.
Column 'fiModelSeries' does not have the required conditions for update.
Column 

Unnamed: 0,Feature,Permutation Importance,Permutation Std Deviation,Model Importance,Permutation Rank,Model Rank
3,YearMade,0.6979017,0.01029412,0.190094,1.0,1.0
36,year,0.2188379,0.004261894,0.070269,2.0,6.0
6,fiProductClassDesc,0.2114733,0.006271672,0.103779,3.0,2.0
2,ModelID,0.1175073,0.003234003,0.092384,4.0,4.0
10,Enclosure,0.1092078,0.003038284,0.098388,5.0,3.0
28,Hydraulics_Flow,0.09267955,0.00452606,0.049389,6.0,8.0
4,fiModelDesc,0.07271174,0.002666029,0.072292,7.0,5.0
5,fiBaseModel,0.02965264,0.001503859,0.040042,8.0,10.0
9,ProductGroupDesc,0.02744949,0.0008275675,0.02591,9.0,11.0
0,SalesID,0.02520727,0.001048444,0.044204,10.0,9.0


RMSE Baseline accuracy: 22559.50925365081
Train RMSE: 3765.833770990849
Test RMSE: 10154.28819989472
Test RMSLE: 0.29776872539681026


Unnamed: 0,Feature,Permutation Importance,Permutation Std Deviation,Model Importance,Permutation Rank,Model Rank
1,YearMade,0.8412692,0.01572714,0.211053,1.0,1.0
32,year,0.32611,0.003919204,0.09715,2.0,5.0
4,fiProductClassDesc,0.2271862,0.005351673,0.119707,3.0,3.0
6,Enclosure,0.128529,0.003790878,0.120061,4.0,2.0
0,ModelID,0.1265337,0.005042767,0.110087,5.0,4.0
24,Hydraulics_Flow,0.09596203,0.004611299,0.049013,6.0,8.0
5,ProductGroup,0.08745581,0.002690573,0.047458,7.0,9.0
2,fiModelDesc,0.08317299,0.00233601,0.083772,8.0,6.0
3,fiBaseModel,0.03610517,0.002084104,0.054481,9.0,7.0
15,Hydraulics,0.01905204,0.001012802,0.012808,10.0,11.0


               Feature  Permutation Importance  Permutation Std Deviation  \
1             YearMade                0.841269                   0.015727   
32                year                0.326110                   0.003919   
4   fiProductClassDesc                0.227186                   0.005352   
6            Enclosure                0.128529                   0.003791   
0              ModelID                0.126534                   0.005043   
24     Hydraulics_Flow                0.095962                   0.004611   
5         ProductGroup                0.087456                   0.002691   
2          fiModelDesc                0.083173                   0.002336   
3          fiBaseModel                0.036105                   0.002084   

    Model Importance  Permutation Rank  Model Rank  
1           0.211053               1.0         1.0  
32          0.097150               2.0         5.0  
4           0.119707               3.0         3.0  
6           0.120

In [None]:
#model,xs=train_model(df=df,column_to_predict='SalePrice')#took 8 min runtime

RMSE Baseline accuracy: 22932.4005340408 Train RMSE: 2864.123815046 Test RMSE: 7633.339248575835 0 Enclosure 0.240869 YearMade 0.147308 fiProductClassDesc 0.117304 fiModelDesc 0.099289 ModelID 0.093105 year 0.080002 SalesID 0.043689 fiBaseModel 0.033948 ProductGroup 0.032850 ProductGroupDesc 0.029777 MachineID 0.023082 day 0.013131 month 0.013108 state 0.012552 auctioneerID 0.006424 day_of_week 0.006194 MachineHoursCurrentMeter 0.005897 datasource 0.000885 is_weekend 0.000586

In [None]:
#erez1

In [None]:
#erez2