<a href="https://colab.research.google.com/github/dessqa/wids_2022/blob/main/ml_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Pandas behaviour for this notebook: turn off the chained-assignment
# (setting-a-value-on-a-copy) warning and allow up to 60 columns in displays
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 60)

# Matplotlib for visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Set matplotlib's default font size for all figures in this notebook.
# NOTE(review): sns.set(font_scale = 2) further down this cell also adjusts
# matplotlib settings and may override this value — confirm which one wins.
plt.rcParams['font.size'] = 24

from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns
# Apply seaborn's default theme with font elements scaled by a factor of 2
sns.set(font_scale = 2)

# Imputing missing values and scaling values
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import MinMaxScaler

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
# Load the train/test feature and label tables plus the feature table
# that the final predictions are made on
train_features = pd.read_csv('training_features.csv')
test_features = pd.read_csv('testing_features.csv')
train_labels = pd.read_csv('training_labels.csv')
test_labels = pd.read_csv('testing_labels.csv')
predict_ = pd.read_csv('prediction.csv')

# Report (rows, columns) for every table as a quick sanity check
for caption, table in (
    ('Training Feature Size: ', train_features),
    ('Testing Feature Size:  ', test_features),
    ('Training Labels Size:  ', train_labels),
    ('Testing Labels Size:   ', test_labels),
    ('Predict Feature Size:   ', predict_),
):
    print(caption, table.shape)

Training Feature Size:  (53029, 83)
Testing Feature Size:   (22728, 83)
Training Labels Size:   (53029, 1)
Testing Labels Size:    (22728, 1)
Predict Feature Size:    (9705, 83)


In [3]:
# Median imputer whose statistics are learned from the training features only
imputer = SimpleImputer(strategy='median').fit(train_features)

# Fill the gaps in the training, testing and prediction features
# using those training medians
X, X_test, pred = (
    imputer.transform(train_features),
    imputer.transform(test_features),
    imputer.transform(predict_),
)

In [4]:
# Confirm that imputation left no NaN in any of the three feature matrices
for caption, matrix in (
    ('Missing values in training features: ', X),
    ('Missing values in testing features:  ', X_test),
    ('Missing values in predict features:  ', pred),
):
    print(caption, np.isnan(matrix).sum())


Missing values in training features:  0
Missing values in testing features:   0
Missing values in predict features:   0


In [5]:
# Min-max scaler targeting the 0-1 range, fitted on the training matrix only
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)

# Rescale the training, testing and prediction matrices with the
# parameters learned from the training data
X, X_test, pred = [scaler.transform(matrix) for matrix in (X, X_test, pred)]

In [6]:
# Flatten the label tables into one-dimensional target vectors
y = np.array(train_labels).ravel()
y_test = np.array(test_labels).ravel()

ModuleNotFoundError: ignored

In [14]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [15]:
from catboost import CatBoostRegressor

In [18]:
# CatBoost regressor configuration, grouped by purpose.
# NOTE(review): the fit cell passes no eval_set, so early_stopping_rounds
# presumably never triggers and all iterations run — confirm against the CatBoost docs.
catb1 = CatBoostRegressor(
    # Model capacity / optimisation
    iterations=24000,
    learning_rate=0.022699999,
    max_depth=12,
    # Evaluation and stopping
    eval_metric="RMSE",
    early_stopping_rounds=9,
    # Reproducibility
    random_seed=417,
    # Logging: print a progress line every 1000 iterations
    verbose=1000,
)

In [21]:
# Model 1: the commercial-building model, trained on the imputed and scaled
# training matrix X against the flattened label vector y.
# NOTE(review): no eval_set is passed, so the early_stopping_rounds and eval_metric
# configured on catb1 presumably have no validation data to monitor — confirm.
catb1.fit(X, y)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 58.5643609	total: 226ms	remaining: 1h 30m 33s
1000:	learn: 34.4252314	total: 1m 49s	remaining: 41m 59s
2000:	learn: 29.9400020	total: 3m 38s	remaining: 40m
3000:	learn: 27.3026698	total: 5m 23s	remaining: 37m 45s
4000:	learn: 25.2821732	total: 7m 9s	remaining: 35m 45s
5000:	learn: 23.6577195	total: 8m 54s	remaining: 33m 50s
6000:	learn: 22.2948743	total: 10m 40s	remaining: 32m 2s
7000:	learn: 21.1302464	total: 12m 27s	remaining: 30m 15s
8000:	learn: 20.0835870	total: 14m 15s	remaining: 28m 30s
9000:	learn: 19.1392953	total: 16m 3s	remaining: 26m 45s
10000:	learn: 18.2771641	total: 17m 51s	remaining: 24m 59s
11000:	learn: 17.4757972	total: 19m 38s	remaining: 23m 12s
12000:	learn: 16.7596121	total: 21m 25s	remaining: 21m 25s
13000:	learn: 16.1461448	total: 23m 12s	remaining: 19m 37s
14000:	learn: 15.5793870	total: 25m	remaining: 17m 51s
15000:	learn: 15.0543024	total: 26m 47s	remaining: 16m 4s
16000:	learn: 14.5631666	total: 28m 33s	remaining: 14m 16s
17000:	learn: 14.1256265	t

<catboost.core.CatBoostRegressor at 0x7fb268a0a390>

In [22]:
# Predict on the held-out test features so the model can be scored against y_test
sub_predict1 = catb1.predict(X_test)


In [23]:
from sklearn.metrics import mean_absolute_error


In [24]:
# Mean absolute error of the held-out predictions against the true test labels
mean_absolute_error(y_test, sub_predict1)


19.323311341573746

In [25]:
# Predict on the imputed and scaled features loaded from prediction.csv;
# these values are what gets written to the submission file
sub_predict = catb1.predict(pred)


In [26]:
# Load the sample solution file, used as the template for the submission
submit_sample = pd.read_csv('sample_solution.csv')

In [27]:
# Store the predictions in the template's target column and write the submission.
# Item assignment is used on purpose: attribute assignment (`df.site_eui = ...`)
# only updates a column that already exists. If the column were missing, pandas
# would set a plain Python attribute and the predictions would never reach the CSV.
submit_sample['site_eui'] = sub_predict
submit_sample.to_csv("submission.csv", index=False)