In [1]:
import time
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

warnings.filterwarnings("ignore")

In [None]:
########################################

In [None]:
# Load the training data and drop sparsely populated columns: a column is kept
# only when at least 60% of its rows are non-missing (i.e. variables with more
# than 40% NA values are removed).
# `how` is deliberately not passed: recent pandas versions refuse `how` and
# `thresh` together, and `thresh` alone already expresses the rule.
df = pd.read_csv("SUPCOM_Train.csv")
df = df.dropna(thresh=df.shape[0] * 0.6, axis=1)

# 'id' is the key the submission file is written with, not a predictor, so it is
# removed together with the target before the feature columns are chosen.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Low-cardinality text columns are treated as categorical features.
categorical_cols = [
    colname for colname in X.columns
    if X[colname].dtype == "object" and X[colname].nunique() < 10
]

# Integer and float columns are treated as numerical features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaling', StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # A category that appears only in a validation fold is encoded as all zeros
    # instead of aborting the whole cross-validation run.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Candidate models; every model that accepts a seed gets one so reruns are comparable.
regressors_list = [
    KNeighborsRegressor(5),
    RandomForestRegressor(random_state=0),
    AdaBoostRegressor(random_state=0),
    GradientBoostingRegressor(random_state=0),
    xgb.XGBRegressor(eval_metric=metrics.mean_absolute_error, random_state=0),
]


for regressor in regressors_list:
    mean_rmse = []
    print(regressor)
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", regressor)])
    for train_index, test_index in kf.split(X):
        # KFold yields positional indices, so rows are always selected with .iloc.
        X_train = X.iloc[train_index][my_cols]
        X_test = X.iloc[test_index][my_cols]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]
        model = pipe.fit(X_train, y_train)
        y_v_pred = model.predict(X_test)
        # RMSE is computed explicitly: the `squared=False` shortcut was deprecated
        # and then removed in recent scikit-learn releases.
        mean_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, y_v_pred)))

    print(f"Mean of the Root Mean Square Error Folds on the test set: {np.mean(mean_rmse)}")
    print("###################---###################\n")

Random Forest and XGBRegressor achieve the best (lowest) Root Mean Square Error, so I will combine them.

In [2]:
# Reload the training data and drop sparsely populated columns: a column is kept
# only when at least 40% of its rows are non-missing (i.e. variables with more
# than 60% NA values are removed).
# `how` is deliberately not passed: recent pandas versions refuse `how` and
# `thresh` together.
df = pd.read_csv("SUPCOM_Train.csv")
df = df.dropna(thresh=df.shape[0] * 0.4, axis=1)

# Separate the target; 'id' is dropped as well so it is never used as a feature.
X = df.drop(columns=['target', 'id'])
y = df['target']

# From here on only integer and float columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = numerical_cols

X = X[my_cols]

In [3]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [4]:
# Both preprocessing steps are fitted on the training split only.
mf = SimpleImputer(strategy='most_frequent').fit(X_train)
vt = VarianceThreshold().fit(X_train)


def pre_proc(base):
    """Apply the fitted imputer `mf` and variance filter `vt` to `base`.

    Missing values are replaced by each column's most frequent value, then the
    columns rejected by `vt` are removed.

    Parameters
    ----------
    base : tabular data with the same columns, in the same order, as the data
        `mf` and `vt` were fitted on.

    Returns
    -------
    pandas.DataFrame
        The transformed values with a fresh RangeIndex and positional integer
        column labels (the original index and column names are not carried over).
    """
    imputed = pd.DataFrame(mf.transform(base))
    selected = vt.transform(imputed)
    return pd.DataFrame(selected)
    
    

In [5]:
# Display the training split of the feature matrix.
X_train

Unnamed: 0,BCT_CODBUR,CTR_MATFIS,FJU_CODFJU,CTR_CESSAT,ACT_CODACT,CTR_OBLDIR,CTR_OBLACP,CTR_OBLRES,CTR_OBLFOP,CTR_OBLTFP,...,TVA_MTRESTAX,TVA_CA072018,TVA_CA132018,TVA_CA192018,AX1_BRTIMP,AX2_HONORA,AX2_LOYERS,AX3_REVCAP,AX5_MNTMAR,RES_ANNIMP
16482,122,2287,99,0,6307,4,2,2,2,2,...,,,,,0.0,116580.0,0.0,0.0,0.000000e+00,2015
1606,291,13146,22,0,3604,4,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.575271e+09,2013
10765,217,11502,99,0,4414,4,2,2,1,1,...,,,,,,,,,,2015
19669,251,4361,99,0,4101,4,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4043311.0,0.000000e+00,2016
19449,241,5568,22,0,4204,4,2,2,2,2,...,0.0,0.0,0.0,0.0,,,,,,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,197,7904,21,0,5652,4,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28123820.0,2.482800e+09,2013
19648,601,12772,23,0,4802,4,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,213703200.0,0.0,27887829.0,4.601798e+09,2013
9845,37,1345,99,0,4205,4,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,24000.0,0.0,0.0,0.000000e+00,2016
10799,700,7707,22,0,4221,4,2,2,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1222740.0,0.000000e+00,2016


In [6]:
# Run the training features through pre_proc and display the result.
X_train_pp = pre_proc(X_train)
X_train_pp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,72,73,74,75,76,77,78,79,80,81
0,122.0,2287.0,99.0,0.0,6307.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,116580.0,0.0,0.0,0.000000e+00,2015.0
1,291.0,13146.0,22.0,0.0,3604.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.575271e+09,2013.0
2,217.0,11502.0,99.0,0.0,4414.0,4.0,2.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,2015.0
3,251.0,4361.0,99.0,0.0,4101.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4043311.0,0.000000e+00,2016.0
4,241.0,5568.0,22.0,0.0,4204.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,2015.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15966,197.0,7904.0,21.0,0.0,5652.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28123820.0,2.482800e+09,2013.0
15967,601.0,12772.0,23.0,0.0,4802.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,213703200.0,0.0,27887829.0,4.601798e+09,2013.0
15968,37.0,1345.0,99.0,0.0,4205.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,24000.0,0.0,0.0,0.000000e+00,2016.0
15969,700.0,7707.0,22.0,0.0,4221.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1222740.0,0.000000e+00,2016.0


https://towardsdatascience.com/cross-validation-and-hyperparameter-tuning-how-to-optimise-your-machine-learning-model-13f005af9d7d

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Base learners of the stack.
estimators = [
    ('rf', RandomForestRegressor(n_estimators=200,
                                 min_samples_split=6,
                                 min_impurity_decrease=0.0,
                                 max_features='sqrt',
                                 max_depth=25,
                                 # 'absolute_error' is the current name of the old 'mae'
                                 # criterion, which no longer exists in recent scikit-learn.
                                 criterion='absolute_error',
                                 bootstrap=True,
                                 random_state=0)),
    ('xgb', xgb.XGBRegressor(tree_method='exact',
                             objective='reg:squarederror',
                             n_estimators=1600,
                             min_child_weight=6,
                             max_depth=8,
                             gamma=0,
                             eta=0.1,
                             random_state=0)),
]

# The meta-learner is seeded like the base learners so reruns are comparable.
reg = StackingRegressor(estimators=estimators,
                        final_estimator=GradientBoostingRegressor(random_state=0))

# Preprocessing is built in this cell instead of reusing a `preprocessor` object
# left over from another cell: `my_cols` holds numeric columns only, so the steps
# are the same ones `pre_proc` applies (most-frequent imputation followed by a
# variance filter), refitted inside every training fold.
numeric_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('variance', VarianceThreshold()),
])
pipe = Pipeline(steps=[("preprocessor", numeric_preprocessor), ("regressor", reg)])

features = df[my_cols]
target = df['target']

mean_rmse = []

for train_index, test_index in kf.split(features):
    # KFold yields positional indices, so rows are always selected with .iloc.
    # Fold-local names keep the earlier hold-out split (X_train, X_test, ...) intact.
    X_fold_train = features.iloc[train_index]
    X_fold_test = features.iloc[test_index]
    y_fold_train = target.iloc[train_index]
    y_fold_test = target.iloc[test_index]

    model = pipe.fit(X_fold_train, y_fold_train)
    y_v_pred = model.predict(X_fold_test)
    # RMSE is computed explicitly: the `squared=False` shortcut was deprecated
    # and then removed in recent scikit-learn releases.
    mean_rmse.append(np.sqrt(metrics.mean_squared_error(y_fold_test, y_v_pred)))

print(f"Mean of the Root Mean Square Error Folds on the test set: {np.mean(mean_rmse)}")
print("###################---###################\n")

This seems worse than RandomForest and XGB on their own, but let's try it with a submission on Zindi anyway.

In [None]:
########################################

In [6]:
# Reload the training data and drop sparsely populated columns: a column is kept
# only when at least 40% of its rows are non-missing (i.e. variables with more
# than 60% NA values are removed).
# `how` is deliberately not passed: recent pandas versions refuse `how` and
# `thresh` together.
df = pd.read_csv("SUPCOM_Train.csv")
df = df.dropna(thresh=df.shape[0] * 0.4, axis=1)

# Separate the target; 'id' is dropped as well so it is never used as a feature.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only integer and float columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = numerical_cols

X = X[my_cols]

# Here both preprocessing steps are fitted on the full training set.
mf = SimpleImputer(strategy='most_frequent')
mf.fit(X)

vt = VarianceThreshold()
vt = vt.fit(X)


def pre_proc(base):
    """Apply the fitted imputer `mf` and variance filter `vt` to `base`.

    Missing values are replaced by each column's most frequent value, then the
    columns rejected by `vt` are removed. No dummy encoding is performed: every
    column reaching this function is numeric, so there is nothing to encode.

    Parameters
    ----------
    base : tabular data with the same columns, in the same order, as the data
        `mf` and `vt` were fitted on.

    Returns
    -------
    pandas.DataFrame
        The transformed values with a fresh RangeIndex and positional integer
        column labels (the original index and column names are not carried over).
    """
    base_pp = pd.DataFrame(mf.transform(base))
    return pd.DataFrame(vt.transform(base_pp))


X = pre_proc(X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,72,73,74,75,76,77,78,79,80,81
0,44.0,6210.0,99.0,0.0,3707.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,2950000.0,0.0,0.0,6.027693e+08,2014.0
1,401.0,14383.0,22.0,0.0,6105.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,203353885.0,0.0,0.0,1.478520e+07,2013.0
2,243.0,11555.0,23.0,0.0,5751.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.079280e+07,2013.0
3,72.0,6175.0,99.0,0.0,4402.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,2017.0
4,42.0,1417.0,23.0,0.0,6302.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,2014.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21290,47.0,6766.0,22.0,0.0,4203.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.326544e+09,2014.0
21291,113.0,15006.0,21.0,0.0,5703.0,4.0,2.0,2.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.505822e+08,2016.0
21292,700.0,2429.0,22.0,0.0,4218.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,147828454.0,0.0,0.0,6.404927e+09,2013.0
21293,248.0,1323.0,22.0,0.0,3201.0,4.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.318494e+07,2014.0


In [9]:
%%time

# Reload the training data and drop sparsely populated columns: a column is kept
# only when at least 40% of its rows are non-missing (i.e. variables with more
# than 60% NA values are removed).
# `how` is deliberately not passed: recent pandas versions refuse `how` and
# `thresh` together.
df = pd.read_csv("SUPCOM_Train.csv")
df = df.dropna(thresh=df.shape[0] * 0.4, axis=1)

# Separate the target; 'id' is dropped as well so it is never used as a feature.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only integer and float columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = numerical_cols

X = X[my_cols]

# The test file is read once; the raw frame supplies both the features and the ids.
teste = pd.read_csv("SUPCOM_Test.csv")

# Base learners of the stack.
estimators = [
    ('rf', RandomForestRegressor(n_estimators=200,
                                 min_samples_split=6,
                                 min_impurity_decrease=0.0,
                                 max_features='sqrt',
                                 max_depth=25,
                                 # 'absolute_error' is the current name of the old 'mae'
                                 # criterion, which no longer exists in recent scikit-learn.
                                 criterion='absolute_error',
                                 bootstrap=True,
                                 random_state=0)),
    ('xgb', xgb.XGBRegressor(tree_method='exact',
                             objective='reg:squarederror',
                             n_estimators=1600,
                             min_child_weight=6,
                             max_depth=8,
                             gamma=0,
                             eta=0.1,
                             random_state=0)),
]

# The meta-learner is seeded like the base learners so the submission is reproducible.
reg = StackingRegressor(estimators=estimators,
                        final_estimator=GradientBoostingRegressor(random_state=0))

# Most-frequent imputation and the variance filter live in the same pipeline as
# the model, so they are fitted on the training features and re-applied to the
# test features without relying on helper objects defined in other cells.
regressor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('variance', VarianceThreshold()),
    ('regressor', reg),
]).fit(X, y)

y_v_pred = regressor.predict(teste[my_cols])

# One row per test id, in the order of the test file.
predictions = pd.DataFrame({'id': teste['id'], 'target': y_v_pred})
predictions.to_csv("submission.csv", index=False)

Wall time: 1h 23min 8s


RandomForest and XGBoost stacking, final_estimator GradientBoosting: 5.504050840720225

https://www.kaggle.com/code/orhankaramancode/ensemble-stacked-regressors-top-3-91-acc