In [41]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

In [42]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# Preview data

In [43]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [44]:
df_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


# Check null values

In [45]:
df_train.isna().sum()

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 378, dtype: int64

In [46]:
df_train.isna().sum().loc[lambda x: x > 0]

Series([], dtype: int64)

In [47]:
df_test.isna().sum().loc[lambda x: x > 0]

Series([], dtype: int64)

# Separate features and label

In [48]:
X = df_train.drop(columns='y')
y = df_train.y

# Seperate columns with number and columns with string

In [49]:
num_cols = X.drop(columns='ID').select_dtypes(include='number').columns
num_cols

Index(['X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=368)

In [50]:
cat_cols = X.select_dtypes(include='object').columns
cat_cols

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype='object')

# Find columns with zero variance

In [51]:
col_with_zero_var = X.var(numeric_only=True).loc[lambda x: x == 0]
col_with_zero_var

X11     0.0
X93     0.0
X107    0.0
X233    0.0
X235    0.0
X268    0.0
X289    0.0
X290    0.0
X293    0.0
X297    0.0
X330    0.0
X347    0.0
dtype: float64

## Remove columns with zero variance

In [52]:
final_num = list(set(num_cols) - set(col_with_zero_var))

# Find unique values for each categorical columns

In [53]:
full_df = pd.concat([df_train[cat_cols], df_test[cat_cols]])

unique_categories = [full_df[x].unique() for x in full_df]
unique_categories

[array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
        'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
        'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
        'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab', 'av', 'ag',
        'an', 'ae', 'p', 'bb'], dtype=object),
 array(['v', 't', 'w', 'b', 'r', 'l', 's', 'aa', 'c', 'a', 'e', 'h', 'z',
        'j', 'o', 'u', 'p', 'n', 'i', 'y', 'd', 'f', 'm', 'k', 'g', 'q',
        'ab'], dtype=object),
 array(['at', 'av', 'n', 'e', 'as', 'aq', 'r', 'ai', 'ak', 'm', 'a', 'k',
        'ae', 's', 'f', 'd', 'ag', 'ay', 'ac', 'ap', 'g', 'i', 'aw', 'y',
        'b', 'ao', 'al', 'h', 'x', 'au', 't', 'an', 'z', 'ah', 'p', 'am',
        'j', 'q', 'af', 'l', 'aa', 'c', 'o', 'ar', 'aj', 'ax', 'ab', 'w',
        'ad', 'u'], dtype=object),
 array(['a', 'e', 'c', 'f', 'd', 'b', 'g'], dtype=object),
 array(['d', 'b', 'c', 'a'], dtype=object),
 array(['u', 'y', 'x', 'h', 'g', 'f', 'j'

# Build pipeline of preprcessor and xgb

In [54]:
col_transformer = ColumnTransformer([
    ('num', 'passthrough', final_num),
    # encode categorical with numbers
    ('cat', OrdinalEncoder(categories=unique_categories), cat_cols)
])

preprocessor = Pipeline([
    # handle categorical features
    ('col_transformer', col_transformer),
    # scale before PCA
    ('scaler', StandardScaler()),
    # perform PCA
    ('pca', PCA())
])

pipeline = Pipeline([
    ('preprecessor', preprocessor),
    ('xgb', XGBRegressor())
])

# Use GridSearchCV to find the best hyperparameters

In [55]:
param_grid = {
    "preprecessor__pca__n_components": [10, 30, 50],
    "xgb__n_estimators": [100, 200],
}
search = GridSearchCV(pipeline, param_grid, n_jobs=-1)

In [56]:
search.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('preprecessor',
                                        Pipeline(steps=[('col_transformer',
                                                         ColumnTransformer(transformers=[('num',
                                                                                          'passthrough',
                                                                                          ['X367',
                                                                                           'X268',
                                                                                           'X274',
                                                                                           'X197',
                                                                                           'X130',
                                                                                           'X167',
                                                                        

In [57]:
search.best_params_

{'preprecessor__pca__n_components': 50, 'xgb__n_estimators': 200}

In [58]:
search.best_score_

0.42159784763107877

In [59]:
search.cv_results_

{'mean_fit_time': array([2.26327405, 3.17262979, 3.79465537, 5.37360435, 4.68527303,
        7.00861597]),
 'std_fit_time': array([0.05069475, 0.05152528, 0.03847524, 0.06346877, 0.05166405,
        0.05976569]),
 'mean_score_time': array([0.01405606, 0.01429968, 0.01119952, 0.01069951, 0.00969987,
        0.01040115]),
 'std_score_time': array([0.00084422, 0.00172013, 0.00143505, 0.00050961, 0.00024485,
        0.00019963]),
 'param_preprecessor__pca__n_components': masked_array(data=[10, 10, 30, 30, 50, 50],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_xgb__n_estimators': masked_array(data=[100, 200, 100, 200, 100, 200],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'preprecessor__pca__n_components': 10, 'xgb__n_estimators': 100},
  {'preprecessor__pca__n_components': 10, 'xgb__n_estimators': 200},
  {'preprecessor__pca__n_com

# Predict test data with best estimator

In [60]:
search.predict(df_test)

array([ 97.46143, 135.52559,  92.88888, ...,  93.11132, 113.01195,
        92.0481 ], dtype=float32)