In [None]:
!pip install seaborn -q
!pip install catboost -q
!pip install lightgbm -q
!pip install xgboost -q

import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import warnings; warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from lightgbm import Dataset
from lightgbm import train ,LGBMRegressor
from sklearn.metrics import mean_poisson_deviance, mean_squared_error


# --- CatBoost: load the training table and build the feature/target split ---
train_data = pd.read_csv("./datasets/train.csv")

# Column roles: categorical inputs, label columns, and identifier columns
# that are kept out of the feature set.
cat_features = ["model", "car_type", "fuel_type"]
targets = ["target_class", "target"]
features2drop = ["car_id"]

# Every column that is neither a label nor an identifier is a model input;
# the order of train_data.columns is preserved.
filtered_features = [
    name
    for name in train_data.columns
    if not (name in targets or name in features2drop)
]
num_features = [name for name in filtered_features if name not in cat_features]

print("cat features", cat_features)
print("num features", len(num_features))
print("targets", targets)

# Categorical columns are cast to strings before being handed to CatBoost.
train_data = train_data.astype(dict.fromkeys(cat_features, str))

# filtered_features already excludes the targets; the drop is kept as a guard.
X = train_data[filtered_features].drop(columns=targets, errors="ignore")
y = train_data["target"]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

from catboost import CatBoostClassifier, CatBoostRegressor, Pool

# Baseline CatBoost regressor: only the categorical column list and the
# tracked metric (RMSE) are set explicitly.
model = CatBoostRegressor(eval_metric="RMSE", cat_features=cat_features)

# Progress is logged every 500 iterations; no eval_set is supplied here.
model.fit(X_train, y_train, plot=False, verbose=500)

print(model.best_score_)

# Tuned CatBoost regressor: shallow trees, row subsampling and L2 leaf
# regularisation, with the same categorical columns as the baseline.
cbr = CatBoostRegressor(
    cat_features=cat_features,
    iterations=1000,
    learning_rate=0.09,
    depth=4,
    l2_leaf_reg=5,
    subsample=0.5,
    colsample_bylevel=0.99,
    max_bin=190,
)

# The hold-out split is passed as the evaluation set; progress is logged
# every 500 iterations.
cbr.fit(X_train, y_train, eval_set=(X_test, y_test), plot=False, verbose=500)

print(cbr.best_score_)

# Score the test file with the tuned CatBoost model and write result1.csv.
test = pd.read_csv('./datasets/test.csv')

# Same preprocessing as for training: categorical columns become strings.
test = test.astype(dict.fromkeys(cat_features, str))

# Select the training feature columns (label columns are dropped if present).
x_test = test[filtered_features].drop(columns=targets, errors="ignore")

y_pred = cbr.predict(x_test)

# Two-column output: car_id alongside the predicted target.
CatBoostReg_result = test[['car_id']].assign(target=y_pred)
CatBoostReg_result.to_csv('result1.csv', index=False)


# --- LightGBM: reload the training table and integer-encode the categoricals ---
df = pd.read_csv("./datasets/train.csv")
cat_cols = ["car_type", "fuel_type", "model"]
drop_cols = ["car_id", "target", "target_class"]

y = df["target"]
X = df.drop(columns=drop_cols)

from sklearn.preprocessing import LabelEncoder

# One encoder object is re-fitted for each column in turn, so after the loop
# it only holds the classes of the last column in cat_cols.
label_encoder = LabelEncoder()
for name in cat_cols:
    X[name] = label_encoder.fit_transform(X[name])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Wrap both splits in LightGBM Dataset objects that share the same settings
# (named categorical columns, raw data retained).  This rebinds the name
# train_data; neither object is passed to the estimator fits in the visible
# part of this script.
train_data, val_data = (
    Dataset(features, labels, categorical_feature=cat_cols, free_raw_data=False)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Baseline LightGBM regressor: only the tracked metric (RMSE) is set, and the
# hold-out split is the single evaluation set.
reg = LGBMRegressor(metric="RMSE")
reg.fit(X_train, y_train, eval_set=[(X_test, y_test)])

print(reg.best_score_)

# Tuned LightGBM regressor: few, small trees with a moderate learning rate.
# NOTE(review): cat_feature uses positional indices — presumably the three
# categorical columns are the first three columns of X; confirm against the
# CSV column order.
reg = LGBMRegressor(
    metric="RMSE",
    num_leaves=8,
    n_estimators=40,
    learning_rate=0.09,
    cat_feature=[0, 1, 2],
)

# Evaluated on the hold-out split while training.
reg.fit(X_train, y_train, eval_set=[(X_test, y_test)])

print(reg.best_score_)

# Score the test file with the tuned LightGBM model and write result2.csv.
test = pd.read_csv("./datasets/test.csv")

drop_cols = ["car_id", "target_class"]
x_test = test.drop(drop_cols, axis=1)

# Encode the categoricals with the SAME label -> integer mapping the model was
# trained on.  Re-fitting the encoder on the test rows would number the labels
# from the test file's own sorted label set, which silently disagrees with the
# training codes whenever the two files do not contain identical labels.
# `df` is the unmodified training frame, so fitting on its columns reproduces
# the training-time classes for each column.
for col in cat_cols:
    label_encoder.fit(df[col])
    codes = {label: code for code, label in enumerate(label_encoder.classes_)}
    # Labels that never occurred in training have no code; they are mapped to
    # -1 (LightGBM documents negative categorical values as missing) instead
    # of raising, so prediction remains best-effort as before.
    x_test[col] = x_test[col].map(codes).fillna(-1).astype(int)

y_pred = reg.predict(x_test)

# Two-column output: car_id alongside the predicted target.
LGBMReg_result = pd.DataFrame({'car_id': test['car_id'], 'target': y_pred})
LGBMReg_result.to_csv('result2.csv', index=False)

# --- XGBoost: reload the training table and mark categoricals as 'category' ---
xgb_data = pd.read_csv("./datasets/train.csv")
drop_cols = ['car_id', 'target', 'target_class']
cat_cols = ['car_type', 'fuel_type', 'model']

y = xgb_data['target']
X = xgb_data.drop(columns=drop_cols)

# pandas 'category' dtype is what enable_categorical=True consumes below.
X = X.astype(dict.fromkeys(cat_cols, 'category'))

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Baseline XGBoost regressor: histogram tree method with native categorical
# support, monitored with sklearn's mean Poisson deviance on the hold-out split.
reg = xgb.XGBRegressor(
    n_estimators=30,
    n_jobs=-1,
    tree_method="hist",
    enable_categorical=True,
    eval_metric=mean_poisson_deviance,
)

# NOTE(review): passing early_stopping_rounds to fit() is only accepted by
# some xgboost releases — confirm against the installed version.
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=3,
)

# Predict with the trees up to and including the best iteration, then show
# the hold-out RMSE (bare expression: displayed as notebook cell output).
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
mean_squared_error(y_test, y_pred) ** 0.5

# Tuned XGBoost regressor: more rounds than the baseline, coarser histogram
# bins, a larger minimum child weight and strong L1/L2 regularisation.
reg = xgb.XGBRegressor(
    n_estimators=80,
    n_jobs=-1,
    tree_method="hist",
    enable_categorical=True,
    eval_metric=mean_poisson_deviance,
    max_bin=128,
    min_child_weight=16,
    reg_alpha=275,
    reg_lambda=275,
)

# NOTE(review): passing early_stopping_rounds to fit() is only accepted by
# some xgboost releases — confirm against the installed version.
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=3,
)

# Predict with the trees up to and including the best iteration, then show
# the hold-out RMSE (bare expression: displayed as notebook cell output).
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
mean_squared_error(y_test, y_pred) ** 0.5

# Score the test file with the tuned XGBoost model and write result3.csv.
test = pd.read_csv('./datasets/test.csv')

drop_cols = ['car_id', 'target_class']
x_test = test.drop(drop_cols, axis=1)

# Cast each categorical column to the categorical dtype of the training frame
# `X` rather than a fresh 'category' dtype.  A fresh dtype derives its
# categories (and therefore the integer codes the model sees) from the test
# rows alone, so the codes disagree with training whenever the test file holds
# a different set of labels.  Labels absent from training become NaN, i.e. a
# missing value.
for col in cat_cols:
    x_test[col] = x_test[col].astype(X[col].dtype)

y_pred = reg.predict(x_test)

# Two-column output: car_id alongside the predicted target.
XGBReg_result = pd.DataFrame({'car_id': test['car_id'], 'target': y_pred})
XGBReg_result.to_csv('result3.csv', index=False)