In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import optuna

In [4]:
train = pd.read_csv("/kaggle/input/install-future-program-ankara-hackathon/train.csv")
test = pd.read_csv("/kaggle/input/install-future-program-ankara-hackathon/test.csv")

In [5]:
test["CPULOAD"] = np.nan
all_data = pd.concat([train, test], sort=False).drop("index", axis=1)

In [6]:
def create_features(df):
    df = df.copy()
    df["DATETIME"] = pd.to_datetime(df["DATETIME"])
    df["month"] = df["DATETIME"].dt.month
    df["day"] = df["DATETIME"].dt.day
    df["hour"] = df["DATETIME"].dt.hour
    df["minute"] = df["DATETIME"].dt.minute

    return df

In [7]:
def outlier_thresholds(df, col_name, q1=0.2, q3=0.8):
    quartile1 = df[col_name].quantile(q1)
    quartile3 = df[col_name].quantile(q3)
    iqr = quartile3 - quartile1
    up = quartile3 + 1.5 * iqr
    low = quartile1 - 1.5 * iqr
    return low, up

In [8]:
def check_outlier(df, col_name):
    low, up = outlier_thresholds(df, col_name)
    return True if df[(df[col_name] > up) | (df[col_name] < low)].any(axis = None) else False

In [9]:
def replace_with_thresholds(df, veriable):
    low, up = outlier_thresholds(df, veriable)
    df.loc[(df[veriable] < low), veriable] = low
    df.loc[(df[veriable] > up), veriable] = up

In [10]:
def remove_outlier(df):
    df = df.copy()
    server_types = df["SERVER"].unique()
    for server in server_types:
        server_df = df[df["SERVER"] == server]
        cpu_types = server_df["CPU"].unique()
        for cpu in cpu_types:
            cpu_df = server_df[server_df["CPU"] == cpu]
            if check_outlier(cpu_df, "CPULOAD"):
                replace_with_thresholds(cpu_df, "CPULOAD")
            server_df[server_df["CPU"] == cpu] = cpu_df
        df[df["SERVER"] == server] = server_df
    
    return df

In [11]:
all_data = create_features(all_data)

In [12]:
all_data = remove_outlier(all_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  server_df[server_df["CPU"] == cpu] = cpu_df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  server_df[server_df["CPU"] == cpu] = cpu_df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  server_df[server_df["CPU"] == cpu] = cpu_df
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [13]:
all_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CPULOAD,312696.0,10.25925,9.875165,0.06,2.05,6.77,16.03,60.92
month,317304.0,4.334241,0.673215,3.0,4.0,4.0,5.0,5.0
day,317304.0,17.108161,9.126417,1.0,9.0,18.0,25.0,31.0
hour,317304.0,11.511459,6.920314,0.0,6.0,12.0,18.0,23.0
minute,317304.0,22.483549,16.7619,0.0,0.0,15.0,30.0,45.0


In [14]:
def one_hot_encoder(df, categorical_cols, drop_first=True):
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=drop_first)
    return df

In [15]:
all_data = one_hot_encoder(all_data, ["SERVER", "CPU"])

In [16]:
train = all_data[all_data["CPULOAD"].notnull()]
test = all_data[all_data["CPULOAD"].isnull()].drop("CPULOAD", axis=1)

In [17]:
X = train.drop(["CPULOAD", "DATETIME"], axis=1)
y = train["CPULOAD"]

In [18]:
train_size = int(X.shape[0] * 0.8)
X_train = X.iloc[:train_size]
X_val = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_val = y.iloc[train_size:]

In [24]:
def objective(trial):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = CatBoostRegressor(**params, silent=True)
    model.fit(X_train, y_train, cat_features=["month", "day", "hour", "minute"])
    predictions = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    return rmse

In [25]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[I 2023-07-07 20:50:57,859] A new study created in memory with name: no-name-72d3d34e-fc57-45e1-8a00-de953ad10d46
[I 2023-07-07 20:51:11,443] Trial 0 finished with value: 9.964578321325606 and parameters: {'learning_rate': 0.0020157376688085074, 'depth': 1, 'subsample': 0.2250770783096379, 'colsample_bylevel': 0.15396439579280918, 'min_data_in_leaf': 76}. Best is trial 0 with value: 9.964578321325606.
[I 2023-07-07 20:52:30,335] Trial 1 finished with value: 4.858682577607124 and parameters: {'learning_rate': 0.044213323548477214, 'depth': 5, 'subsample': 0.3643132333790203, 'colsample_bylevel': 0.817485204765412, 'min_data_in_leaf': 66}. Best is trial 1 with value: 4.858682577607124.
[I 2023-07-07 20:53:23,874] Trial 2 finished with value: 5.486374015988706 and parameters: {'learning_rate': 0.010689730327514394, 'depth': 4, 'subsample': 0.31212523052186625, 'colsample_bylevel': 0.6087596983439276, 'min_data_in_leaf': 96}. Best is trial 1 with value: 4.858682577607124.
[I 2023-07-07 20:

KeyboardInterrupt: 

In [26]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

Best hyperparameters: {'learning_rate': 0.044213323548477214, 'depth': 5, 'subsample': 0.3643132333790203, 'colsample_bylevel': 0.817485204765412, 'min_data_in_leaf': 66}
Best RMSE: 4.858682577607124


In [28]:
model = CatBoostRegressor(**study.best_params)

In [21]:
# model = CatBoostRegressor()

In [29]:
model.fit(X, y, 
          verbose=False, 
          cat_features=["month", "day", "hour", "minute"])

<catboost.core.CatBoostRegressor at 0x7816e0596380>

In [30]:
val_pred = model.predict(X_val)

In [31]:
np.sqrt(mean_squared_error(y_val, val_pred))

3.6092241549958493

In [32]:
submission = pd.read_csv("/kaggle/input/install-future-program-ankara-hackathon/sample_submission.csv")

In [33]:
pred = model.predict(test.drop("DATETIME", axis=1))

In [34]:
submission["CPULOAD"] = pred

In [36]:
submission.to_csv("submission.csv", index=False)