In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import cupy as cp

In [5]:
# Baseline experiment: ordinal-encode the categorical columns and train one
# XGBoost regressor per fold, reporting the validation RMSE of each fold.
test_df = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
df = pd.read_csv("/kaggle/input/30-days-of-ml-5-folds/train_folds.csv")

# Every column except the bookkeeping / label columns is a model input.
features = [col for col in df.columns if col not in ('id', 'kfold', 'target')]
object_cols = [col for col in features if 'cat' in col]
xtest_df = test_df[features].copy()

final_predictions = []  # one test-set prediction vector per fold
scores = []             # one validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    train_part = df.loc[df.kfold != fold].reset_index(drop=True)
    valid_part = df.loc[df.kfold == fold].reset_index(drop=True)
    xtest = xtest_df.copy()
    ytrain = train_part.target
    yvalid = valid_part.target

    # Explicit copies: the encoded columns are written back into these frames,
    # and assigning into a column selection can trigger SettingWithCopyWarning.
    xtrain = train_part[features].copy()
    xvalid = valid_part[features].copy()

    print("Encoding")
    # The encoder is fitted on the training part only and then re-used.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    print("Training")
    model = xgb.XGBRegressor(random_state=fold, tree_method='hist', device='cuda')
    # Training data is handed to XGBoost as CuPy arrays.
    X_train_gpu = cp.array(xtrain)
    y_train_gpu = cp.array(ytrain)

    model.fit(X_train_gpu, y_train_gpu)
    # The validation / test frames are pandas objects, so switch the model to
    # the CPU before predicting on them.
    if model.device != 'cpu':
        model.set_params(device='cpu')

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)
    # RMSE = sqrt(MSE). Computed explicitly instead of passing the deprecated
    # `squared=False` keyword; arguments are in (y_true, y_pred) order.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(f"Fold_{fold+1} |", "RMSE:", rmse)

print("Mean RMSE:", np.mean(scores), "| Scores STD:", np.std(scores))

Encoding
Training
Fold_1 | RMSE: 0.7245302356488946
Encoding
Training
Fold_2 | RMSE: 0.7241902754354826
Encoding
Training
Fold_3 | RMSE: 0.726511525789112
Encoding
Training
Fold_4 | RMSE: 0.7269540048296184
Encoding
Training
Fold_5 | RMSE: 0.7257602217605351
Mean RMSE: 0.7255892526927286 | Scores STD: 0.0010789829421290003


In [14]:
# Standardization
# Same pipeline as the baseline, with the `cont*` columns additionally
# standardised by a StandardScaler fitted on the training part of each fold.
test_df = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
df = pd.read_csv("/kaggle/input/30-days-of-ml-5-folds/train_folds.csv")

features = [col for col in df.columns if col not in ('id', 'kfold', 'target')]
object_cols = [col for col in features if 'cat' in col]
numerical_cols = [col for col in features if col.startswith('cont')]

xtest_df = test_df[features].copy()

final_predictions = []  # one test-set prediction vector per fold
scores = []             # one validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    train_part = df.loc[df.kfold != fold].reset_index(drop=True)
    valid_part = df.loc[df.kfold == fold].reset_index(drop=True)
    xtest = xtest_df.copy()
    ytrain = train_part.target
    yvalid = valid_part.target

    # Explicit copies: transformed columns are written back into these frames.
    xtrain = train_part[features].copy()
    xvalid = valid_part[features].copy()

    print("Encoding")
    # Referenced through `preprocessing`: the bare name `OrdinalEncoder` is
    # never imported in this notebook and raises NameError on a fresh kernel.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Scaler statistics come from the training part only.
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    print("Training")
    model = xgb.XGBRegressor(random_state=fold, tree_method='hist', device='cuda')
    # Training data is handed to XGBoost as CuPy arrays.
    X_train_gpu = cp.array(xtrain)
    y_train_gpu = cp.array(ytrain)

    model.fit(X_train_gpu, y_train_gpu)
    # The validation / test frames are pandas objects, so switch the model to
    # the CPU before predicting on them.
    if model.device != 'cpu':
        model.set_params(device='cpu')

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)
    # RMSE = sqrt(MSE). Computed explicitly instead of passing the deprecated
    # `squared=False` keyword; arguments are in (y_true, y_pred) order.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(f"Fold_{fold+1} |", "RMSE:", rmse)

print("Mean RMSE:", np.mean(scores), "| Scores STD:", np.std(scores))

Encoding
Training
Fold_1 | RMSE: 0.7245302262120346
Encoding
Training
Fold_2 | RMSE: 0.7241323810345601
Encoding
Training
Fold_3 | RMSE: 0.7267897803225617
Encoding
Training
Fold_4 | RMSE: 0.7268506858678104
Encoding
Training
Fold_5 | RMSE: 0.7262141472591808
Mean RMSE: 0.7257034441392294 | Scores STD: 0.001149068069984286


In [15]:
# log transformation
# Same pipeline as the baseline, with log1p applied to every `cont*` column of
# both the training and the test data before the fold loop.
test_df = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
df = pd.read_csv("/kaggle/input/30-days-of-ml-5-folds/train_folds.csv")

features = [col for col in df.columns if col not in ('id', 'kfold', 'target')]
object_cols = [col for col in features if 'cat' in col]
numerical_cols = [col for col in features if col.startswith('cont')]

# Element-wise log(1 + x) on all continuous columns at once. Both frames were
# just read from disk above, so re-running this cell does not stack transforms.
df[numerical_cols] = np.log1p(df[numerical_cols])
test_df[numerical_cols] = np.log1p(test_df[numerical_cols])

xtest_df = test_df[features].copy()

final_predictions = []  # one test-set prediction vector per fold
scores = []             # one validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    train_part = df.loc[df.kfold != fold].reset_index(drop=True)
    valid_part = df.loc[df.kfold == fold].reset_index(drop=True)
    xtest = xtest_df.copy()
    ytrain = train_part.target
    yvalid = valid_part.target

    # Explicit copies: encoded columns are written back into these frames.
    xtrain = train_part[features].copy()
    xvalid = valid_part[features].copy()

    print("Encoding")
    # Referenced through `preprocessing`: the bare name `OrdinalEncoder` is
    # never imported in this notebook and raises NameError on a fresh kernel.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    print("Training")
    model = xgb.XGBRegressor(random_state=fold, tree_method='hist', device='cuda')
    # Training data is handed to XGBoost as CuPy arrays.
    X_train_gpu = cp.array(xtrain)
    y_train_gpu = cp.array(ytrain)

    model.fit(X_train_gpu, y_train_gpu)
    # The validation / test frames are pandas objects, so switch the model to
    # the CPU before predicting on them.
    if model.device != 'cpu':
        model.set_params(device='cpu')

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)
    # RMSE = sqrt(MSE). Computed explicitly instead of passing the deprecated
    # `squared=False` keyword; arguments are in (y_true, y_pred) order.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(f"Fold_{fold+1} |", "RMSE:", rmse)

print("Mean RMSE:", np.mean(scores), "| Scores STD:", np.std(scores))

Encoding
Training
Fold_1 | RMSE: 0.7245294851363442
Encoding
Training
Fold_2 | RMSE: 0.7244413738935425
Encoding
Training
Fold_3 | RMSE: 0.7267363418452818
Encoding
Training
Fold_4 | RMSE: 0.7268505017167104
Encoding
Training
Fold_5 | RMSE: 0.725967857757496
Mean RMSE: 0.725705112069875 | Scores STD: 0.0010414912821805108


In [45]:
# polynomial features
# Replaces the `cont*` columns with their degree-3 interaction-only polynomial
# expansion, ordinal-encodes the categorical columns, and trains per fold.
test_df = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
df = pd.read_csv("/kaggle/input/30-days-of-ml-5-folds/train_folds.csv")

object_cols = [col for col in df.columns if 'cat' in col]
numerical_cols = [col for col in df.columns if col.startswith('cont')]

poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# Re-use the transformer fitted on the training data rather than re-fitting it
# on the test set.
test_poly = poly.transform(test_df[numerical_cols])
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_names)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names)

# Polynomial columns first, followed by everything that is not a raw `cont*`.
final_df = pd.concat([df_poly, df.drop(numerical_cols, axis=1)], axis=1)
features = [col for col in final_df.columns if col not in ('id', 'kfold', 'target')] # contains object + poly columns
xtest_df = pd.concat([df_test_poly, test_df.drop(numerical_cols, axis=1)], axis=1)[features].copy()

final_predictions = []  # one test-set prediction vector per fold
scores = []             # one validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    train_part = final_df.loc[final_df.kfold != fold].reset_index(drop=True)
    valid_part = final_df.loc[final_df.kfold == fold].reset_index(drop=True)
    xtest = xtest_df.copy()
    ytrain = train_part.target
    yvalid = valid_part.target

    # Explicit copies: encoded columns are written back into these frames.
    xtrain = train_part[features].copy()
    xvalid = valid_part[features].copy()

    print("Encoding")
    # Referenced through `preprocessing`: the bare name `OrdinalEncoder` is
    # never imported in this notebook and raises NameError on a fresh kernel.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    print("Training")
    model = xgb.XGBRegressor(random_state=fold, tree_method='hist', device='cuda')
    # Training data is handed to XGBoost as CuPy arrays.
    X_train_gpu = cp.array(xtrain)
    y_train_gpu = cp.array(ytrain)

    model.fit(X_train_gpu, y_train_gpu)
    # The validation / test frames are pandas objects, so switch the model to
    # the CPU before predicting on them.
    if model.device != 'cpu':
        model.set_params(device='cpu')

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)
    # RMSE = sqrt(MSE). Computed explicitly instead of passing the deprecated
    # `squared=False` keyword; arguments are in (y_true, y_pred) order.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(f"Fold_{fold+1} |", "RMSE:", rmse)

print("Mean RMSE:", np.mean(scores), "| Scores STD:", np.std(scores))

Encoding
Training
Fold_1 | RMSE: 0.7284822358101676
Encoding
Training
Fold_2 | RMSE: 0.7282244135361841
Encoding
Training
Fold_3 | RMSE: 0.730583235442689
Encoding
Training
Fold_4 | RMSE: 0.7290808787512799
Encoding
Training
Fold_5 | RMSE: 0.7300971300182351
Mean RMSE: 0.7292935787117111 | Scores STD: 0.0009116514614242901


In [37]:
# OneHotEncoder
# One-hot encodes the categorical columns (instead of ordinal encoding), keeps
# the `cont*` columns unchanged, and trains one XGBoost regressor per fold.
test_df = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
df = pd.read_csv("/kaggle/input/30-days-of-ml-5-folds/train_folds.csv")

features = [col for col in df.columns if col not in ('id', 'kfold', 'target')]
object_cols = [col for col in features if 'cat' in col]
numerical_cols = [col for col in features if col.startswith('cont')]
xtest_df = test_df[features].copy()

final_predictions = []  # one test-set prediction vector per fold
scores = []             # one validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    train_part = df.loc[df.kfold != fold].reset_index(drop=True)
    valid_part = df.loc[df.kfold == fold].reset_index(drop=True)
    xtest = xtest_df.copy()
    ytrain = train_part.target
    yvalid = valid_part.target

    xtrain = train_part[features]
    xvalid = valid_part[features]

    print("Encoding")
    # Fitted on the training part only; categories unseen during fit are
    # encoded as all-zero rows because of handle_unknown='ignore'.
    onehot = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    train_ohe = onehot.fit_transform(xtrain[object_cols])
    valid_ohe = onehot.transform(xvalid[object_cols])
    test_ohe = onehot.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # column count and therefore one list of column names.
    ohe_names = [f"ohe_{i}" for i in range(train_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(train_ohe, columns=ohe_names)
    xvalid_ohe = pd.DataFrame(valid_ohe, columns=ohe_names)
    xtest_ohe = pd.DataFrame(test_ohe, columns=ohe_names)

    # Continuous columns first, one-hot columns after; the frames on both sides
    # of each concat carry a fresh 0..n-1 index, so rows line up positionally.
    final_xtrain = pd.concat([xtrain[numerical_cols], xtrain_ohe], axis=1)
    final_xvalid = pd.concat([xvalid[numerical_cols], xvalid_ohe], axis=1)
    final_xtest = pd.concat([xtest[numerical_cols], xtest_ohe], axis=1)

    print("Training")
    model = xgb.XGBRegressor(random_state=fold, tree_method='hist', device='cuda')
    # Training data is handed to XGBoost as CuPy arrays.
    X_train_gpu = cp.array(final_xtrain)
    y_train_gpu = cp.array(ytrain)

    model.fit(X_train_gpu, y_train_gpu)
    # The validation / test frames are pandas objects, so switch the model to
    # the CPU before predicting on them.
    if model.device != 'cpu':
        model.set_params(device='cpu')

    preds_valid = model.predict(final_xvalid)
    preds_test = model.predict(final_xtest)
    final_predictions.append(preds_test)
    # RMSE = sqrt(MSE). Computed explicitly instead of passing the deprecated
    # `squared=False` keyword; arguments are in (y_true, y_pred) order.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(f"Fold_{fold+1} |", "RMSE:", rmse)

print("Mean RMSE:", np.mean(scores), "| Scores STD:", np.std(scores))

Encoding
Training
Fold_1 | RMSE: 0.7240222419972228
Encoding
Training
Fold_2 | RMSE: 0.7244751419373514
Encoding
Training
Fold_3 | RMSE: 0.7268093267896698
Encoding
Training
Fold_4 | RMSE: 0.7270873671980351
Encoding
Training
Fold_5 | RMSE: 0.7258384152651467
Mean RMSE: 0.7256464986374851 | Scores STD: 0.0012227337947936345


In [46]:
# OneHotEncoder + Polynomial Features
# Combines the degree-3 interaction-only polynomial expansion of the `cont*`
# columns with one-hot encoding of the categorical columns.
test_df = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
df = pd.read_csv("/kaggle/input/30-days-of-ml-5-folds/train_folds.csv")

object_cols = [col for col in df.columns if 'cat' in col]
numerical_cols = [col for col in df.columns if col.startswith('cont')]

poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# Re-use the transformer fitted on the training data rather than re-fitting it
# on the test set.
test_poly = poly.transform(test_df[numerical_cols])
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_names)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names)

# Polynomial columns first, followed by everything that is not a raw `cont*`.
final_df = pd.concat([df_poly, df.drop(numerical_cols, axis=1)], axis=1)
features = [col for col in final_df.columns if col not in ('id', 'kfold', 'target')] # contains object + poly columns
xtest_df = pd.concat([df_test_poly, test_df.drop(numerical_cols, axis=1)], axis=1)[features].copy()

final_predictions = []  # one test-set prediction vector per fold
scores = []             # one validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    train_part = final_df.loc[final_df.kfold != fold].reset_index(drop=True)
    valid_part = final_df.loc[final_df.kfold == fold].reset_index(drop=True)
    xtest = xtest_df.copy()
    ytrain = train_part.target
    yvalid = valid_part.target

    xtrain = train_part[features]
    xvalid = valid_part[features]

    print("Encoding")
    # Fitted on the training part only; categories unseen during fit are
    # encoded as all-zero rows because of handle_unknown='ignore'.
    onehot = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    train_ohe = onehot.fit_transform(xtrain[object_cols])
    valid_ohe = onehot.transform(xvalid[object_cols])
    test_ohe = onehot.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # column count and therefore one list of column names.
    ohe_names = [f"ohe_{i}" for i in range(train_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(train_ohe, columns=ohe_names)
    xvalid_ohe = pd.DataFrame(valid_ohe, columns=ohe_names)
    xtest_ohe = pd.DataFrame(test_ohe, columns=ohe_names)

    # Swap the raw categorical columns for their one-hot expansion.
    final_xtrain = pd.concat([xtrain.drop(object_cols, axis=1), xtrain_ohe], axis=1)
    final_xvalid = pd.concat([xvalid.drop(object_cols, axis=1), xvalid_ohe], axis=1)
    final_xtest = pd.concat([xtest.drop(object_cols, axis=1), xtest_ohe], axis=1)

    print("Training")
    model = xgb.XGBRegressor(random_state=fold, tree_method='hist', device='cuda')
    # Training data is handed to XGBoost as CuPy arrays.
    X_train_gpu = cp.array(final_xtrain)
    y_train_gpu = cp.array(ytrain)

    model.fit(X_train_gpu, y_train_gpu)
    # The validation / test frames are pandas objects, so switch the model to
    # the CPU before predicting on them.
    if model.device != 'cpu':
        model.set_params(device='cpu')

    preds_valid = model.predict(final_xvalid)
    preds_test = model.predict(final_xtest)
    final_predictions.append(preds_test)
    # RMSE = sqrt(MSE). Computed explicitly instead of passing the deprecated
    # `squared=False` keyword; arguments are in (y_true, y_pred) order.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(f"Fold_{fold+1} |", "RMSE:", rmse)

print("Mean RMSE:", np.mean(scores), "| Scores STD:", np.std(scores))

Encoding
Training
Fold_1 | RMSE: 0.7290120913100917
Encoding
Training
Fold_2 | RMSE: 0.7288331308135844
Encoding
Training
Fold_3 | RMSE: 0.7308963909587015
Encoding
Training
Fold_4 | RMSE: 0.7300750952652681
Encoding
Training
Fold_5 | RMSE: 0.7297792518653154
Mean RMSE: 0.7297191920425923 | Scores STD: 0.0007484874777521981


In [6]:
# Standardization on (OneHotEncoder + Polynomial Features)
# Builds the polynomial + one-hot feature matrix and then standardises every
# column of it with a StandardScaler fitted on the training part of each fold.
test_df = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
df = pd.read_csv("/kaggle/input/30-days-of-ml-5-folds/train_folds.csv")

object_cols = [col for col in df.columns if 'cat' in col]
numerical_cols = [col for col in df.columns if col.startswith('cont')]

poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# Re-use the transformer fitted on the training data rather than re-fitting it
# on the test set.
test_poly = poly.transform(test_df[numerical_cols])
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_names)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names)

# Polynomial columns first, followed by everything that is not a raw `cont*`.
final_df = pd.concat([df_poly, df.drop(numerical_cols, axis=1)], axis=1)
features = [col for col in final_df.columns if col not in ('id', 'kfold', 'target')] # contains object + poly columns
xtest_df = pd.concat([df_test_poly, test_df.drop(numerical_cols, axis=1)], axis=1)[features].copy()

final_predictions = []  # one test-set prediction vector per fold
scores = []             # one validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    train_part = final_df.loc[final_df.kfold != fold].reset_index(drop=True)
    valid_part = final_df.loc[final_df.kfold == fold].reset_index(drop=True)
    xtest = xtest_df.copy()
    ytrain = train_part.target
    yvalid = valid_part.target

    xtrain = train_part[features]
    xvalid = valid_part[features]

    print("Encoding")
    # Fitted on the training part only; categories unseen during fit are
    # encoded as all-zero rows because of handle_unknown='ignore'.
    onehot = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    train_ohe = onehot.fit_transform(xtrain[object_cols])
    valid_ohe = onehot.transform(xvalid[object_cols])
    test_ohe = onehot.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # column count and therefore one list of column names.
    ohe_names = [f"ohe_{i}" for i in range(train_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(train_ohe, columns=ohe_names)
    xvalid_ohe = pd.DataFrame(valid_ohe, columns=ohe_names)
    xtest_ohe = pd.DataFrame(test_ohe, columns=ohe_names)

    # Swap the raw categorical columns for their one-hot expansion.
    final_xtrain = pd.concat([xtrain.drop(object_cols, axis=1), xtrain_ohe], axis=1)
    final_xvalid = pd.concat([xvalid.drop(object_cols, axis=1), xvalid_ohe], axis=1)
    final_xtest = pd.concat([xtest.drop(object_cols, axis=1), xtest_ohe], axis=1)

    # Scale every column (polynomial and one-hot alike). From here on the three
    # matrices are NumPy arrays rather than DataFrames.
    scaler = preprocessing.StandardScaler()
    final_xtrain = scaler.fit_transform(final_xtrain)
    final_xvalid = scaler.transform(final_xvalid)
    final_xtest = scaler.transform(final_xtest)

    print("Training")
    model = xgb.XGBRegressor(random_state=fold, tree_method='hist', device='cuda')
    # Training data is handed to XGBoost as CuPy arrays.
    X_train_gpu = cp.array(final_xtrain)
    y_train_gpu = cp.array(ytrain)

    model.fit(X_train_gpu, y_train_gpu)
    # The validation / test matrices live in host memory, so switch the model
    # to the CPU before predicting on them.
    if model.device != 'cpu':
        model.set_params(device='cpu')

    preds_valid = model.predict(final_xvalid)
    preds_test = model.predict(final_xtest)
    final_predictions.append(preds_test)
    # RMSE = sqrt(MSE). Computed explicitly instead of passing the deprecated
    # `squared=False` keyword; arguments are in (y_true, y_pred) order.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(f"Fold_{fold+1} |", "RMSE:", rmse)

print("Mean RMSE:", np.mean(scores), "| Scores STD:", np.std(scores))

Encoding
Training
Fold_1 | RMSE: 0.7290593366745904
Encoding
Training
Fold_2 | RMSE: 0.7288331464245963
Encoding
Training
Fold_3 | RMSE: 0.7308964625433366
Encoding
Training
Fold_4 | RMSE: 0.7299282527219462
Encoding
Training
Fold_5 | RMSE: 0.729337678799418
Mean RMSE: 0.7296109754327775 | Scores STD: 0.0007398859810732748
