# XGBoost baselines on the housing and French motor datasets

In [1]:
import pandas as pd 
import os
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics

# Feature selection strategies
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Scale feature scores
from sklearn.preprocessing import MinMaxScaler

# SKLearn estimators list
from sklearn.utils import all_estimators

from importlib import import_module

def _vote_with_stat_filters(X, y, stat_filters, k):
    """Return {stat_name: [selected feature names]} for each statistical filter."""
    # chi2 only accepts non-negative inputs, so it is fed |X|; every other
    # score function receives the untouched feature values.
    requires_abs = {'chi2'}

    # Built up-front so that an unknown score-function name fails loudly
    # instead of being swallowed by the per-filter error handling below.
    selectors = {
        stat_name: SelectKBest(score_func=create_class(f'sklearn.feature_selection.{stat_name}'), k=k)
        for stat_name in stat_filters}

    votes = {}
    for stat_name, selector in selectors.items():
        try:
            features = abs(X) if stat_name in requires_abs else X
            selector.fit(features, y)
            votes[stat_name] = X.columns[selector.get_support()].tolist()
        except Exception as e:
            # Best effort: a filter that cannot be computed simply casts no votes.
            print(f"Couldn't calculate {stat_name} because of: {e}")
    return votes


def _build_filter_models(model_filters):
    """Instantiate the estimators described by 'model_filters' as {model_name: estimator}."""
    import json

    if not model_filters:
        return {}

    all_sklearn_estimators = dict(all_estimators())
    selected_models = {}
    for model_name, model in model_filters.items():
        if isinstance(model, str) and '.json' in model:
            # Path to a json file holding the model description
            with open(model, 'r') as model_file:
                current_model = json.load(model_file)
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif isinstance(model, str) and model in all_sklearn_estimators:
            # Plain sklearn estimator name; look it up by the estimator name,
            # not by the (arbitrary) key the caller used in 'model_filters'.
            selected_models[model_name] = all_sklearn_estimators[model]()
        else:
            # Formalized json given either as a string or as an already parsed dict
            try:
                current_model = json.loads(model) if isinstance(model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except Exception as e:
                # Best effort: a model that cannot be built is skipped.
                print(f'unable to load {model}: {e}')
    return selected_models


def _vote_with_model_filters(X, y, selected_models):
    """Return {model_name: [selected feature names]} using SelectFromModel on each estimator."""
    votes = {}
    for model_name, model in selected_models.items():
        if model_name == 'LogisticRegression':
            model.set_params(solver='liblinear')

        # Train the model and keep the features it considers important
        select_from_model = SelectFromModel(model).fit(X, y)
        votes[model_name] = X.columns[select_from_model.get_support()].tolist()
    return votes


def feature_selection(df,
                       k: int = 5,
                       min_votes: float = 0.5,
                       label_column: str = None,
                       stat_filters: list = None,
                       model_filters: dict = None,
                       max_scaled_scores: bool = True,
                       sample_ratio: float = None,
                       ignore_type_errors: bool = False):

    """Applies selected feature selection statistical functions
    or models on 'df'.
    Each statistical function or model votes for its selected features.
    If a feature has >= 'min_votes' votes, it will be selected.
    The input 'df' is not modified.

    :param df:                 DataFrame holding the features and the label column.

    :param k:                  number of top features to select from each statistical
                               function.

    :param min_votes:          minimal number of votes (from a model or by statistical
                               function) needed for a feature to be selected.
                               An int is taken as an absolute number of votes, any other
                               value as a fraction (clipped to [0, 1]) of the number of
                               requested filters.

    :param label_column:       name of the ground-truth (y) column in 'df'.

    :param stat_filters:       statistical functions to apply to the features
                               (from sklearn.feature_selection).
                               Defaults to ['f_classif', 'f_regression'].

    :param model_filters:      models to use for feature evaluation, can be specified by
                               model name (ex. LinearSVC), formalized json (contains 'CLASS',
                               'FIT', 'META') or a path to such json file.
                               Defaults to LinearRegression and GradientBoostingRegressor.

    :param max_scaled_scores:  kept for backward compatibility. The scaled score table
                               was never part of the returned value, so this flag has no
                               effect on the result.

    :param sample_ratio:       fraction of the dataset (sampled per label value) the user
                               wishes to compute the feature selection process on.
                               Rows containing NaN are dropped after sampling.

    :param ignore_type_errors: when False (default), raise if a feature column is not
                               numeric; when True, skip that check.

    :returns: DataFrame with the selected feature columns followed by the label column.

    :raises ValueError: if 'k' is smaller than 1 or bigger than the number of features,
                        or if a non-numeric feature is found and 'ignore_type_errors'
                        is False.
    """

    if stat_filters is None:
        stat_filters = ['f_classif', 'f_regression']
    if model_filters is None:
        model_filters = {'LinearRegression': 'LinearRegression',
                         'GradientBoostingRegressor': 'GradientBoostingRegressor'}

    # Ensure k is not bigger than the total number of features
    # (the label column is not a feature, so it is excluded from the count)
    n_features = df.shape[1] - int(label_column in df.columns)
    if k > n_features:
        raise ValueError(
            f'K cannot be bigger than the total number of features ({n_features}). Please choose a smaller K.')
    elif k < 1:
        raise ValueError('K cannot be smaller than 1. Please choose a bigger K.')

    # Create a sample dataframe of the original feature vector.
    # GroupBy.sample keeps the label column in the result, unlike
    # GroupBy.apply which newer pandas versions run without the grouping column.
    if sample_ratio:
        df = (df.groupby(label_column, group_keys=False)
                .sample(frac=sample_ratio)
                .reset_index(drop=True))
        df = df.dropna()

    # Set feature vector and labels without mutating the caller's DataFrame
    y = df[label_column]
    X = df.drop(columns=[label_column])

    if not ignore_type_errors:
        non_numeric = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
        if non_numeric:
            raise ValueError(f"{non_numeric} are neither float or int.")

    # Collect the votes of every statistical filter and every model filter
    selected_features_agg = _vote_with_stat_filters(X, y, stat_filters, k)
    selected_features_agg.update(
        _vote_with_model_filters(X, y, _build_filter_models(model_filters)))

    # One 0/1 column per filter; the row sum is the number of votes of a feature
    votes_df = pd.DataFrame(
        {test_name: X.columns.isin(chosen).astype(int)
         for test_name, chosen in selected_features_agg.items()},
        index=X.columns)
    num_votes = votes_df.sum(axis=1)

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    print(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = num_votes[num_votes >= votes_needed].index.tolist()
    final_df = pd.concat([X.loc[:, selected_features], y], axis=1)

    return final_df


def create_class(pkg_class: str):
    """Resolve a dotted "package.module.attribute" string to the object it names.

    :param pkg_class:  full location of the object,
                       e.g. "sklearn.model_selection.GroupKFold"
    :returns: the attribute named by the last dotted component, looked up on
              the module named by everything before it.
    """
    # Everything before the last dot is the module path, the remainder is the attribute.
    module_path, _, attribute_name = pkg_class.rpartition(".")
    module = import_module(module_path)
    return getattr(module, attribute_name)

def OneHotEncoder(x):
    """Encode the values of a pandas Series as integer codes.

    Despite its name this produces one integer per row (label / ordinal
    encoding), not one-hot vectors: each distinct value is mapped to its
    position in order of first appearance.

    :param x:  pandas Series to encode.
    :returns:  list of int codes, one per element of 'x'.
    """
    # Dict lookup keeps the encoding linear in len(x); scanning the list of
    # unique values for every row is quadratic on high-cardinality columns.
    codes = {}
    for position, label in enumerate(x.unique().tolist()):
        codes.setdefault(label, position)
    return [codes[label] for label in x]

## Using generic feature selection as a preprocessing step on our datasets

In [2]:
# Baseline run: fit an XGBRegressor on each dataset using every available feature.
datasets = ['housing', 'freMTPL2freq']
baseline_model_scores = []
for dataset in datasets:
    csv_path = os.path.join(os.getcwd(), 'datasets', dataset + '.csv')
    if dataset != 'housing':
        # Read with pandas' default header handling
        df = pd.read_csv(csv_path)
        target = 'ClaimNb'
    else:
        # Whitespace-delimited file; column names are supplied explicitly
        columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'MEDV']
        df = pd.read_csv(csv_path, names=columns, delimiter=r"\s+")
        target = 'MEDV'

    # Replace every object-typed column with integer codes
    object_columns = [col for col in df.columns if df[col].dtype == object]
    for col in object_columns:
        df[col] = OneHotEncoder(df[col])

    y = df[target]
    X = df.drop(columns=target)

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

    # Fit the regressor on the training split
    xgbr = XGBRegressor()
    xgbr.fit(X_train, y_train)

    # NOTE(review): predictions and all metrics below are computed on the
    # training split; X_test / y_test are created but never used.
    y_pred = xgbr.predict(X_train)

    r2 = metrics.r2_score(y_train, y_pred)
    mse = metrics.mean_squared_error(y_train, y_pred)
    adjusted_r2 = 1 - (1-r2)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)

    # Collect the scores of this dataset
    baseline_model_scores.append({
        'dataset': dataset,
        'dataset shape': df.shape,
        'R^2': r2,
        'Adjusted R^2': adjusted_r2,
        'MAE': metrics.mean_absolute_error(y_train, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
    })

df = pd.DataFrame(data=baseline_model_scores)
df

Unnamed: 0,dataset,dataset shape,R^2,Adjusted R^2,MAE,MSE,RMSE
0,housing,"(506, 12)",0.999995,0.999995,0.013903,0.0004,0.02001
1,freMTPL2freq,"(678013, 12)",0.315809,0.315795,0.070972,0.039,0.197484


In [3]:
# Same pipeline as the baseline, but the features are first reduced with feature_selection().
feature_selection_model_scores = []
for dataset in datasets:
    csv_path = os.path.join(os.getcwd(), 'datasets', dataset + '.csv')
    if dataset != 'housing':
        # Read with pandas' default header handling
        df = pd.read_csv(csv_path)
        target = 'ClaimNb'
    else:
        # Whitespace-delimited file; column names are supplied explicitly
        columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'MEDV']
        df = pd.read_csv(csv_path, names=columns, delimiter=r"\s+")
        target = 'MEDV'

    # Replace every object-typed column with integer codes
    object_columns = [col for col in df.columns if df[col].dtype == object]
    for col in object_columns:
        df[col] = OneHotEncoder(df[col])

    # Each filter votes for up to 80% of the column count; selection runs on a 90% sample
    top_k = int(0.8*len(df.columns))
    df = feature_selection(df, k=top_k, label_column=target, sample_ratio=0.9)

    y = df[target]
    X = df.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

    # Fit the regressor on the training split
    xgbr = XGBRegressor()
    xgbr.fit(X_train, y_train)

    # NOTE(review): predictions and all metrics below are computed on the
    # training split; X_test / y_test are created but never used.
    y_pred = xgbr.predict(X_train)

    r2 = metrics.r2_score(y_train, y_pred)
    mse = metrics.mean_squared_error(y_train, y_pred)
    adjusted_r2 = 1 - (1-r2)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)

    # Collect the scores of this dataset
    feature_selection_model_scores.append({
        'dataset': dataset,
        'dataset shape': df.shape,
        'R^2': r2,
        'Adjusted R^2': adjusted_r2,
        'MAE': metrics.mean_absolute_error(y_train, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
    })

df2 = pd.DataFrame(data=feature_selection_model_scores)
df2

votes needed to be selected: 2
votes needed to be selected: 2


Unnamed: 0,dataset,dataset shape,R^2,Adjusted R^2,MAE,MSE,RMSE
0,housing,"(486, 10)",0.999989,0.999989,0.02062,0.000905,0.030084
1,freMTPL2freq,"(610213, 9)",0.31883,0.318819,0.071106,0.039751,0.199377
