In [4]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

from xgboost import XGBRegressor

In [5]:
# Read the competition CSVs and key each frame by its 'id' column.
# Note: ./data/training_extra.csv is not loaded.
df_train = pd.read_csv('./data/train.csv').set_index('id')
df_test = pd.read_csv('./data/test.csv').set_index('id')

In [6]:
# Preview the first five training rows (features plus the 'Price' target).
df_train.head(n=5)

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [7]:
# Preview the first five test rows (same feature columns, no 'Price').
df_test.head(n=5)

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [8]:
def encode_categorical_features(input: pd.DataFrame, cate_cols: list, encoders: dict = None):
    """Return a copy of ``input`` with the given columns label-encoded to integers.

    The caller's frame is never modified; all work happens on a copy.

    Args:
        input: Frame holding the columns to encode. (The name shadows the
            ``input`` builtin but is kept so existing keyword callers still work.)
        cate_cols: Names of the columns to replace with integer codes.
        encoders: Optional dict mapping column name -> fitted encoder. When
            omitted (the default), every column is fitted and transformed on
            ``input`` itself, exactly as before. When supplied, a column that is
            already a key is transformed with the stored encoder, and a column
            that is missing is fitted on ``input`` and stored in the dict. Passing
            the same dict for the training frame first and for any later frame
            therefore gives both frames identical category -> code mappings.

    Returns:
        A new DataFrame with the same columns, where each column in
        ``cate_cols`` holds integer codes.

    Raises:
        ValueError: Raised by ``LabelEncoder.transform`` when a stored encoder
            meets a value it was not fitted on.
    """
    df = input.copy()

    for col in cate_cols:
        if encoders is not None and col in encoders:
            # Reuse the earlier fit so the codes agree with the frame it came from.
            df[col] = encoders[col].transform(df[col])
            continue

        # One encoder per column, so a stored fit is never overwritten by the
        # next column's fit.
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        if encoders is not None:
            encoders[col] = encoder

    return df

In [9]:
# Columns with object/category dtype are the ones handed to the label encoder.
cate_cols = df_train.select_dtypes(
    include=['object', 'category'],
).columns

# Encode a copy of the training frame; df_train itself is left as loaded.
df_process = encode_categorical_features(input=df_train, cate_cols=cate_cols)
df_process

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,4,1,2,2.0,1,0,1,5,16.643760,39.17320
3,2,2,2,8.0,1,0,1,3,12.937220,80.60793
4,0,0,1,1.0,1,1,1,3,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...
299995,0,1,2,9.0,0,0,2,1,12.730812,129.99749
299996,1,1,0,6.0,0,1,2,1,26.633182,19.85819
299997,3,0,0,9.0,1,1,0,4,11.898250,111.41364
299998,0,2,2,1.0,0,1,2,4,6.175738,115.89080


In [10]:
# Separate the regression target from the feature columns.
target = 'Price'
y = df_process[target]
X = df_process.drop(columns=[target])

# Hold out 20% of the rows for validation; the seed makes the shuffle repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [11]:
def get_model():
    """Build a new, unfitted XGBRegressor with this notebook's fixed hyperparameters.

    Returns:
        XGBRegressor: A freshly constructed estimator; nothing is trained here.
    """
    hyperparams = dict(
        n_estimators=1000,             # number of boosted trees
        learning_rate=0.05,            # step-size shrinkage applied per tree
        max_depth=6,                   # maximum tree depth (deeper may overfit)
        min_child_weight=1,            # minimum sum of instance weight in a child
        subsample=0.8,                 # fraction of training rows sampled per tree
        colsample_bytree=0.8,          # fraction of columns sampled per tree
        gamma=0,                       # minimum loss reduction needed to split
        reg_alpha=0,                   # L1 regularization on weights
        reg_lambda=1,                  # L2 regularization on weights
        objective='reg:squarederror',  # squared-error regression loss
        random_state=42,
        verbosity=1,
    )
    return XGBRegressor(**hyperparams)


In [12]:
# Instantiate a new, unfitted regressor from get_model().
model = get_model()

In [13]:
# Train on the training portion of the split (validation rows are held out).
model.fit(X_train, y_train)

In [14]:
# Predict prices for the held-out validation rows.
y_pred = model.predict(X_valid)

In [15]:
# Root mean squared error of the validation predictions against the true prices.
rmse = root_mean_squared_error(y_true=y_valid, y_pred=y_pred)
rmse

39.041619889816864

In [18]:
# Encode the test frame with encoders fitted on the TRAINING columns, so every
# category gets the same integer code the model was trained on. Refitting on
# df_test would renumber the codes whenever the two frames do not share exactly
# the same set of categories in a column.
df_test_process = df_test.copy()
for col in cate_cols:
    # transform() raises ValueError for a value that never occurs in df_train,
    # which surfaces the mismatch instead of silently mis-encoding it.
    df_test_process[col] = LabelEncoder().fit(df_train[col]).transform(df_test[col])

In [23]:
# Predict prices for the encoded test frame. Its columns are expected to line up
# with the training features (the test file has the training columns minus 'Price').
test_predict = model.predict(df_test_process)

In [25]:
# Pair each test id with its predicted price, as two plain columns.
submissions = pd.DataFrame(
    data=dict(id=df_test.index, Price=test_predict),
)
submissions

Unnamed: 0,id,Price
0,300000,82.426270
1,300001,81.326340
2,300002,87.676292
3,300003,80.839958
4,300004,77.800850
...,...,...
199995,499995,79.235786
199996,499996,81.147209
199997,499997,89.606133
199998,499998,82.665329


In [None]:
# Write the submission to disk. The previous call passed an empty path, which is
# not a writable file name. index=False keeps the frame's positional row index
# out of the file, so it contains only the 'id' and 'Price' columns.
submissions.to_csv('submission.csv', index=False)