In [None]:
!pip install catboost

In [5]:
import pandas as pd
import numpy as np

import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from catboost import Pool, CatBoostRegressor

In [6]:
RAND = sum(ord(x) for x in 'NEVER SURRENDER')


from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/
from google.colab import output
output.enable_custom_widget_manager()

Mounted at /content/drive/
/content/drive/My Drive


In [7]:
def metrics_(real, pred):
    print(f'RMSE = {mean_squared_error(real, pred)**.5}')
    print(f'R2 = {r2_score(real, pred)}')
    print(f'MAE = {mean_absolute_error(real, pred)}')


In [8]:
df = pd.read_csv('tocolab.csv', index_col=[0])
df.columns = df.columns.str.replace('(.)([A-Z])', r'\1_\2', regex=True).str.lower()

In [9]:
%%time
#http https www
df['full_description'] = df['full_description'].str.replace(r'((https?:\/\/)|w{3}).*?( |$)',' ', regex=True)
df['full_description'] = (df['full_description'].str.replace(r'[^A-Za-z\']',' ', regex=True)
                                                .str.lower()
                                                .str.strip()
)
df['full_description'] = df['full_description'].str.replace(r'\W{2,}',' ', regex=True)

CPU times: user 48.6 s, sys: 943 ms, total: 49.6 s
Wall time: 53.4 s


2000 последних элементов будет "тестовой", чисто для удобства проверять ручками. В этой части он не нужен.

In [10]:
test = df.tail(2000).reset_index(drop=True)
df = df[:-2000].reset_index(drop=True)

In [11]:
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=RAND)

In [12]:
print(df_train.shape,df_valid.shape)

(194214, 4) (48554, 4)


**Тренировка CatBoost**<br>
Предварительно были подобраны параметры, тут тренировалось несколько моделей.

Один из вариантов тренировки ниже, финальная модель получилась с логарифмом целевого.

In [36]:
X_train = df_train[['full_description','location_normalized','category']]
y_train = df_train['salary_normalized']
X_valid = df_valid[['full_description','location_normalized','category']]
y_valid = df_valid['salary_normalized']

In [37]:
X_train[['location_normalized','category']] = X_train[['location_normalized','category']].astype('category')
X_valid[['location_normalized','category']] = X_valid[['location_normalized','category']].astype('category')

In [14]:
X_test = test[['full_description','location_normalized','category']]
y_test = test['salary_normalized']

In [41]:
catboost_params = {
    'iterations': 12000,
    'eval_metric': 'RMSE',
    'verbose': 1000,
    'random_state': RAND,
    'depth': 5, 
    'learning_rate': 0.12
}
#  'one_hot_max_size': 30

In [42]:
text_features = ['full_description']
cat_features = list(X_train.columns)
train_pool = Pool(
    X_train, 
    y_train, 
    cat_features=cat_features, 
    text_features=text_features
)
val_pool = Pool(
    X_valid, 
    y_valid, 
    cat_features=cat_features, 
    text_features=text_features
)

In [43]:
model = CatBoostRegressor(**catboost_params, task_type="GPU")
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=20, use_best_model=True)

0:	learn: 17213.6830871	test: 16988.5418278	best: 16988.5418278 (0)	total: 661ms	remaining: 2h 12m 6s
1000:	learn: 10320.2772946	test: 10715.4936895	best: 10715.4936895 (1000)	total: 6m 58s	remaining: 1h 16m 39s
2000:	learn: 9418.1531415	test: 10269.7582303	best: 10269.7582303 (2000)	total: 13m 30s	remaining: 1h 7m 30s
3000:	learn: 8789.8934867	test: 10005.8453351	best: 10005.8453351 (3000)	total: 20m 12s	remaining: 1h 35s
4000:	learn: 8299.5586608	test: 9829.0440781	best: 9829.0440781 (4000)	total: 26m 41s	remaining: 53m 21s
5000:	learn: 7883.6462771	test: 9682.0078195	best: 9682.0078195 (5000)	total: 33m 50s	remaining: 47m 21s
6000:	learn: 7527.4219027	test: 9571.9861253	best: 9571.9861253 (6000)	total: 41m 2s	remaining: 41m 1s
7000:	learn: 7214.2540584	test: 9478.4154943	best: 9478.4154943 (7000)	total: 48m 17s	remaining: 34m 29s
8000:	learn: 6931.9574192	test: 9397.3923075	best: 9397.3923075 (8000)	total: 55m 26s	remaining: 27m 42s
9000:	learn: 6674.9959013	test: 9324.9507940	best:

<catboost.core.CatBoostRegressor at 0x7f965a9242b0>

In [62]:
model.score(val_pool)

0.7252585305051216

In [61]:
model.get_params()

{'iterations': 12000,
 'learning_rate': 0.12,
 'depth': 5,
 'loss_function': 'RMSE',
 'verbose': 1000,
 'eval_metric': 'RMSE',
 'random_state': 1114}

In [66]:
pred = model.predict(X_test)

In [68]:
metrics_(pred, y_test)

RMSE = 8569.768591965525
R2 = 0.5426865417974165
MAE = 5413.0825636713225


### Кусок подбора параметров.

Сначала был он, основные параметры подобраны. Частично слетели выводы из-за того, что закончилось GPU Colab. Доделывалось уже на том, что выше. 

In [57]:
Xf_train = df[['full_description','location_normalized','category']]
yf_train = np.log(df['salary_normalized'])
Xf_train[['location_normalized','category']] = Xf_train[['location_normalized','category']].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [58]:
trainf_pool = Pool(
    Xf_train, 
    yf_train, 
    cat_features=cat_features, 
    text_features=text_features
)



In [None]:
cat_model = CatBoostRegressor(**catboost_params)  #, task_type="GPU"
grid_search_result = cat_model.grid_search(grid, trainf_pool, cv=2)

0:	learn: -3.5100205	test: -3.5051956	best: -3.5051956 (0)	total: 73.5ms	remaining: 2m 26s
500:	learn: 0.2644826	test: 0.2733541	best: 0.2733541 (500)	total: 24.7s	remaining: 1m 13s
1000:	learn: 0.2675325	test: 0.2743896	best: 0.2743929 (996)	total: 51.1s	remaining: 51s
1500:	learn: 0.2690872	test: 0.2745481	best: 0.2745628 (1485)	total: 1m 15s	remaining: 25.1s
1999:	learn: 0.2703216	test: 0.2744605	best: 0.2745628 (1485)	total: 1m 42s	remaining: 0us

bestTest = 0.2745628119
bestIteration = 1485

0:	loss: 0.2745628	best: 0.2745628 (0)	total: 2m 16s	remaining: 15m 58s
0:	learn: -3.0029149	test: -2.9949758	best: -2.9949758 (0)	total: 70.1ms	remaining: 2m 20s
500:	learn: 0.2693532	test: 0.2745683	best: 0.2746051 (427)	total: 24.5s	remaining: 1m 13s
1000:	learn: 0.2725149	test: 0.2742176	best: 0.2746051 (427)	total: 49.1s	remaining: 49s
1500:	learn: 0.2747217	test: 0.2740276	best: 0.2746051 (427)	total: 1m 13s	remaining: 24.5s
1999:	learn: 0.2764189	test: 0.2737309	best: 0.2746051 (427)	to

In [None]:
grid_search_result['params']

{'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 5.0}

In [None]:
cat_model = CatBoostRegressor(**catboost_params, **grid_search_result['params'], task_type="GPU")
cat_model.fit(trainf_pool)

0:	learn: 17124.2531548	total: 73.6ms	remaining: 3m 40s
500:	learn: 9596.5892555	total: 16.9s	remaining: 1m 24s
1000:	learn: 8372.9043019	total: 32.6s	remaining: 1m 5s
1500:	learn: 7515.4873340	total: 48.3s	remaining: 48.3s
2000:	learn: 6830.7770319	total: 1m 3s	remaining: 31.9s
2500:	learn: 6262.2652315	total: 1m 19s	remaining: 15.9s
2999:	learn: 5785.5069494	total: 1m 38s	remaining: 0us


In [None]:
cat_model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,full_description,85.459559
1,category,8.765777
2,location_normalized,5.774665
