# Automatic data engineering with functions and models

In [77]:
import logging

import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from skrub import TableReport

import data_eng
import utils

In [9]:
# Unpack the three objects returned by the project helper.
# NOTE(review): the names suggest training features, training target and test
# features — confirm against utils.get_and_process_data.
# NOTE(review): the pandas chained-assignment warnings shown in this cell's
# output are raised inside the helper, not by this line.
X, y, X_test = utils.get_and_process_data()

  (non_nan_values - value).abs().argmin()
  (non_nan_values - value).abs().argmin()
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["log_bike_count"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [98]:
# Work on a copy so the encoding/dropping steps below leave X itself unchanged
X_boruta = X.copy()

In [99]:
# TableReport(X_boruta)

In [100]:
# Collect the names of every column whose dtype is not numeric; later cells
# reuse non_numeric_columns to encode and drop these columns
non_numeric_columns = X_boruta.select_dtypes(exclude=["number"]).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['counter_id', 'counter_name', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id'],
      dtype='object')


In [101]:
# Replace each non-numeric column with integer codes, fitting a fresh
# LabelEncoder per column.
# NOTE(review): the next cell drops these very columns from X_boruta, so the
# encoded values are never used by the Boruta fit — confirm whether the
# intent is to encode or to drop, and remove the other step.
for col in non_numeric_columns:
    le = LabelEncoder()
    X_boruta[col] = le.fit_transform(X_boruta[col])

In [102]:
# Remove every column listed in non_numeric_columns, leaving only the columns
# that were numeric in the original frame
X_boruta = X_boruta.drop(columns=non_numeric_columns)

In [103]:
# Preview the first rows of the frame that will be passed to Boruta
X_boruta.head()

Unnamed: 0,site_id,latitude,longitude,Station_Number,Sea_Level_Pressure_(hPa),Pressure_Tendency_(hPa/3h),Pressure_Tendency_Code,Wind_Direction_(°),Wind_Speed_(m/s),Air_Temperature_(°C),...,year,month,weekday,day,hour,is_weekend,is_school_holiday,road_work,confinement,couvre_feu
0,100049407,48.840801,2.333233,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,...,2020,9,1,1,1,0,0,0,0,0
1,100049407,48.840801,2.333233,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,...,2020,9,1,1,1,0,0,0,0,0
2,100036719,48.85372,2.35702,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,...,2020,9,1,1,1,0,0,0,0,0
3,100036719,48.85372,2.35702,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,...,2020,9,1,1,1,0,0,0,0,0
4,100063175,48.88529,2.32666,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,...,2020,9,1,1,1,0,0,0,0,0


In [None]:
# BorutaPy accepts numpy arrays only, hence the .values attribute
X_boruta_values = X_boruta.values
# Flatten the target's underlying numpy array. Calling .ravel() on the pandas
# object itself is deprecated (it triggered the warning previously shown in
# this cell's output) and made the preceding `y.values` assignment dead code.
y_values = y.values.ravel()

# define a shallow random forest regressor that uses all available cores
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=5)

# define Boruta feature selection method; with n_estimators="auto" Boruta
# chooses the number of trees itself instead of using the value set on rf
feat_selector = BorutaPy(rf, n_estimators="auto", verbose=2, random_state=42)

# find all relevant features
feat_selector.fit(X_boruta_values, y_values)

  y_values = y.ravel()


Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	52
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	52
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	52
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	52
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	52
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	52
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	52
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	14
Tentative: 	12
Rejected: 	26
Iteration: 	9 / 100
Confirmed: 	14
Tentative: 	12
Rejected: 	26
Iteration: 	10 / 100
Confirmed: 	14
Tentative: 	12
Rejected: 	26
Iteration: 	11 / 100
Confirmed: 	14
Tentative: 	12
Rejected: 	26
Iteration: 	12 / 100
Confirmed: 	14
Tentative: 	8
Rejected: 	30
Iteration: 	13 / 100
Confirmed: 	14
Tentative: 	8
Rejected: 	30
Iteration: 	14 / 100
Confirmed: 	14
Tentative: 	8
Rejected: 	30
Iteration: 	15 / 100
Confirmed: 	14
Tentative: 	8
Rejected: 	30
Iteration: 	16 / 100
Confirmed: 	14
Tentative: 	7
Re

In [108]:
# Boolean mask aligned with X_boruta's columns: True marks a feature that
# Boruta selected (it is used below to index X_boruta.columns)
feat_selector.support_

array([ True,  True,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True,  True, False,
       False, False, False, False, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [109]:
# Per-feature ranking aligned with X_boruta's columns. Per the BorutaPy
# documentation, selected features get rank 1 and tentative ones rank 2;
# larger values belong to rejected features.
feat_selector.ranking_

array([ 1,  1,  1, 34, 15, 10, 21, 12,  8,  1,  8,  3, 11, 22, 29, 25, 15,
       29, 25, 29, 25, 32, 15, 19,  7,  2, 34,  5, 34, 34,  1,  6, 25,  1,
        1, 20, 17, 29,  4, 13, 25, 18,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1])

In [111]:
# Translate the boolean support mask into the corresponding column names
X_boruta.columns[feat_selector.support_].tolist()

['site_id',
 'latitude',
 'longitude',
 'Air_Temperature_(°C)',
 'New_Snowfall_Duration_(hours)',
 'Rainfall_(6h,_mm)',
 'Rainfall_(12h,_mm)',
 'year',
 'month',
 'weekday',
 'day',
 'hour',
 'is_weekend',
 'is_school_holiday',
 'road_work',
 'confinement',
 'couvre_feu']

In [114]:
def selecting_features_with_correlations(
    df: pd.DataFrame, features, target: str | None
) -> list:
    """
    Select feature based on correlations.
    Keep features that are correlated with the target.
    Remove features that are correlated with other features.

    Args:
        df (pd.DataFrame): Dataframe with features and target.
        features (List[str] | None): List of features.
        target (str | None, optional): Target column. Defaults to TARGET.

    Returns:
        list: Selected features with correlations.
    """
    # Remove features that not correlated with the target
    correlation_with_label = (
        df[features].corr(method="spearman")[target].drop(target)
    )
    threshold_label = 0.05
    selected_features = correlation_with_label[
        correlation_with_label.abs() > threshold_label
    ].index
    correlation_matrix = df[selected_features].corr()
    threshold_features = 0.95
    to_drop = set()
    # Remove features that are correlated with other features
    # Keep the one that is more correlated with the target
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold_features:
                feature_to_drop = (
                    correlation_matrix.columns[i]
                    if abs(correlation_with_label[correlation_matrix.columns[i]])
                    < abs(correlation_with_label[correlation_matrix.columns[j]])
                    else correlation_matrix.columns[j]
                )
                to_drop.add(feature_to_drop)
    selected_features = [f for f in selected_features if f not in to_drop]
    return selected_features

In [116]:
# Attach the target to the numeric features by joining on the index, so the
# correlation helper can read features and target from a single frame.
# NOTE(review): an index-based merge duplicates rows when index labels repeat —
# assumes the index of X_boruta and y is unique; TODO confirm.
X_corr = X_boruta.merge(y, left_index=True, right_index=True)

In [122]:
# Run the correlation-based selection over every column of X_corr, using
# "log_bike_count" as the target column
corr_features = selecting_features_with_correlations(X_corr, X_corr.columns, "log_bike_count")

In [None]:
# Show the feature names kept by the correlation-based selection
print(corr_features)

['site_id', 'latitude', 'Air_Temperature_(°C)', 'Dew_Point_Temperature_(°C)', 'Relative_Humidity_(%)', 'Lowest_Cloud_Base_Height_(m)', 'Low_Cloud_Type', '10min_Max_Wind_Gust_(m/s)', 'Ground_State', 'Layer_1_Cloud_Cover_(oktas)', 'Layer_1_Cloud_Type', 'Layer_2_Cloud_Cover_(oktas)', 'Layer_2_Cloud_Base_Height_(m)', 'weekday', 'hour', 'is_weekend', 'road_work', 'confinement', 'couvre_feu']


In [121]:
def selecting_features_with_random_columns(
    df: pd.DataFrame, features, target
):
    """
    Select features that have less importance than random ones.

    Args:
        df (pd.DataFrame): Dataframe with features and target.
        features (List[str]): List of features.
        target (str): Target column.

    Returns:
        List[str]: Selected features with random columns.
    """
    # Create 5 random columns
    for i in range(5):
        df[f"random_{i}"] = np.random.uniform(
            low=0,
            high=1,
            size=df.shape[0],
        )
    # Combine original features with random features
    all_features = features + [f"random_{i}" for i in range(5)]
    # Initialize and fit the RandomForestClassifier
    rf_classifier = RandomForestRegressor(
        n_estimators=100,
        random_state=42,
    )
    rf_classifier.fit(df[all_features].values, df[target].values)
    # Get feature importances
    feature_importances = rf_classifier.feature_importances_
    feature_importances_df = pd.DataFrame(
        {"feature": all_features, "importance": feature_importances}
    )
    # Select features with importance greater than the maximum random importance
    max_random_importance = feature_importances_df[
        feature_importances_df["feature"].str.contains("random")
    ]["importance"].max()
    selected_features = feature_importances_df[
        feature_importances_df["importance"] > max_random_importance
    ]["feature"].tolist()
    selected_features = [
        feature for feature in selected_features if not feature.startswith("random")
    ]
    logging.info(f"Selected features with random columns : { selected_features }")
    return selected_features