In [1]:
# !pip install catboost==1.2

In [18]:
import pandas as pd
import re
from datetime import datetime as dt
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from catboost.utils import eval_metric
import numpy as np

### Read df

In [3]:
df = pd.read_csv('data/embeddings.csv')

In [4]:
#df = df['text','linkedid','cat','embeddings']
df = df[['text', 'cat', 'embedding']]
df.head(3)

Unnamed: 0,text,cat,embedding
0,центра пётр наталья чем могу помочь,1.0,[ 1.33301303e-01 1.38765112e-01 -5.30672930e-...
1,до свидания,1.0,[-6.47988096e-02 5.10487437e-01 -6.43889159e-...
2,ожидайте звонок в течение часа,1.0,[-1.26493618e-01 3.18099082e-01 -8.92290846e-...


### Create embeddings columns

In [5]:
# Defining a function to extract the float values from the string representation
def extract_values(embedding_str):
    # Using regular expression to find all floating-point numbers in the string
    values = re.findall(r'[-+]?\d*\.\d+e[+-]\d+|[-+]?\d+\.\d+|\d+', embedding_str)
    return [float(value) for value in values]

# Applying the function to the "embedding" column to create a DataFrame with new columns
embedding_values_df = df['embedding'].apply(extract_values).apply(pd.Series)

# Renaming the columns to "emb_0," "emb_1," etc.
embedding_values_df.columns = [f'emb_{i}' for i in range(embedding_values_df.shape[1])]

# Concatenating the new columns with the original DataFrame
final_df = pd.concat([df.drop(columns=['embedding']), embedding_values_df], axis=1)

# Displaying the first few rows of the final DataFrame
final_df.head(3)

Unnamed: 0,text,cat,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,центра пётр наталья чем могу помочь,1.0,0.133301,0.138765,-0.053067,0.216577,-0.115041,0.215542,0.418259,0.268942,...,-0.104017,0.080503,0.31719,0.141871,-0.503967,-0.094222,-0.272026,0.085119,0.067764,-0.176453
1,до свидания,1.0,-0.064799,0.510487,-0.064389,-0.150583,-0.298217,-0.038162,0.586201,0.03576,...,-0.107713,0.58501,0.12722,-0.14059,-0.249223,-0.065519,0.000537,0.334044,-0.096987,-0.46587
2,ожидайте звонок в течение часа,1.0,-0.126494,0.318099,-0.089229,0.02989,-0.0874,0.033627,0.595477,0.404032,...,-0.004513,0.180154,0.365843,-0.024833,-0.129964,-0.091756,-0.099174,0.082055,-0.041417,-0.255874


### CatBoost regression

In [6]:
final_df.to_csv('catboost_in.csv')

In [None]:
# final_df = pd.read_csv('catboost_in.csv')

In [7]:
# define dataset
X = final_df.drop(final_df[['text', 'cat']], axis=1)
y = final_df['cat']

In [8]:
# dataset preparation
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)
eval_dataset = Pool(data=X_validation,
        label=y_validation)

In [9]:
len(y_train), sum(y_train)

(293252, 274880.0)

In [10]:
len(y_validation), sum(y_validation)

(97751, 91603.0)

In [33]:
model = CatBoostRegressor(
            boost_from_average=True,
            one_hot_max_size = 128,
            depth = 12,
            langevin = True,
            verbose=False,
            task_type="GPU"
            )

In [34]:
# train
print(dt.now())
model.fit(X_train, y_train, use_best_model=True, eval_set=eval_dataset)
print(dt.now())

2023-08-26 03:14:37.475400
2023-08-26 03:15:39.884868


In [35]:
# evaluate model score
pred = model.predict(X_validation)
params = model.get_params()
response = ''
response += str(params)+'\n'
response += '\n'+params['loss_function']+' loss: '+ str(eval_metric(y_validation.to_numpy(), pred, params['loss_function']))
response += '\nFitted: '+str(model.is_fitted())
response += '\nModel score:\n'+str(model.score(X,y))
response += '\nFeature importance:'
try:
    importance = model.get_feature_importance()
    for i in range(len(model.feature_names_)):
        response += '\n'+str(np.round(importance[i],2)) + ' ' + model.feature_names_[i]
except Exception as e:
    response += '\n'+str(e)       

response = str(eval_metric(y_validation.to_numpy(), pred, params['loss_function'])[0]) + '\n' + response
print(response)

0.23723375851366227
{'depth': 12, 'loss_function': 'RMSE', 'verbose': False, 'one_hot_max_size': 128, 'task_type': 'GPU', 'langevin': True, 'boost_from_average': True}

RMSE loss: [0.23723375851366227]
Fitted: True
Model score:
0.2083137711417694
Feature importance:
0.25 emb_0
2.04 emb_1
0.09 emb_2
0.1 emb_3
0.65 emb_4
0.26 emb_5
0.17 emb_6
0.09 emb_7
0.3 emb_8
0.0 emb_9
0.44 emb_10
0.19 emb_11
0.75 emb_12
0.47 emb_13
1.63 emb_14
0.29 emb_15
0.13 emb_16
0.16 emb_17
0.68 emb_18
0.8 emb_19
0.07 emb_20
0.21 emb_21
0.4 emb_22
0.08 emb_23
0.05 emb_24
0.83 emb_25
0.58 emb_26
0.22 emb_27
0.14 emb_28
0.22 emb_29
0.29 emb_30
0.21 emb_31
0.49 emb_32
0.27 emb_33
0.36 emb_34
0.17 emb_35
0.05 emb_36
0.33 emb_37
0.27 emb_38
0.31 emb_39
0.06 emb_40
0.51 emb_41
0.04 emb_42
0.4 emb_43
0.26 emb_44
0.55 emb_45
0.44 emb_46
0.25 emb_47
0.03 emb_48
0.26 emb_49
0.07 emb_50
0.03 emb_51
0.3 emb_52
0.21 emb_53
0.2 emb_54
0.33 emb_55
0.14 emb_56
0.06 emb_57
0.06 emb_58
0.09 emb_59
0.01 emb_60
0.17 emb_61
0.08 em

In [36]:
model_name = 'catboost.model'
# save model
model.save_model('data/'+model_name)

In [38]:
# Sort the feature importance
sorted_importance = sorted(zip(model.feature_names_, importance), key=lambda x: x[1], reverse=True)

# Append sorted feature importance to response
response = '\nSorted Feature importance:'
for i, (feature, imp) in enumerate(sorted_importance):
    response += f'\n{np.round(imp, 2)} {feature}'

print(response)


Sorted Feature importance:
2.23 emb_105
2.04 emb_1
1.63 emb_14
1.47 emb_134
1.27 emb_156
1.0 emb_133
0.97 emb_271
0.97 emb_109
0.92 emb_126
0.9 emb_207
0.85 emb_348
0.85 emb_247
0.84 emb_230
0.83 emb_25
0.82 emb_280
0.8 emb_19
0.76 emb_97
0.75 emb_12
0.71 emb_79
0.68 emb_18
0.65 emb_174
0.65 emb_192
0.65 emb_4
0.64 emb_63
0.62 emb_335
0.62 emb_273
0.59 emb_279
0.58 emb_26
0.58 emb_196
0.57 emb_262
0.55 emb_45
0.55 emb_283
0.51 emb_41
0.5 emb_296
0.49 emb_191
0.49 emb_32
0.47 emb_151
0.47 emb_277
0.47 emb_350
0.47 emb_13
0.46 emb_380
0.45 emb_227
0.45 emb_89
0.44 emb_182
0.44 emb_10
0.44 emb_312
0.44 emb_46
0.43 emb_177
0.43 emb_281
0.42 emb_292
0.42 emb_129
0.42 emb_221
0.42 emb_342
0.42 emb_122
0.41 emb_334
0.41 emb_194
0.4 emb_22
0.4 emb_218
0.4 emb_43
0.4 emb_369
0.4 emb_327
0.39 emb_188
0.38 emb_255
0.37 emb_143
0.37 emb_185
0.37 emb_345
0.36 emb_113
0.36 emb_233
0.36 emb_146
0.36 emb_266
0.36 emb_34
0.36 emb_138
0.35 emb_103
0.35 emb_364
0.35 emb_320
0.35 emb_94
0.35 emb_298
0.35

### Sort original DF by the most important feature

In [41]:
important_df = final_df[['text', 'cat', 'emb_105']]
important_df.head(3)

Unnamed: 0,text,cat,emb_105
0,центра пётр наталья чем могу помочь,1.0,-0.016394
1,до свидания,1.0,-0.25124
2,ожидайте звонок в течение часа,1.0,-0.022775


In [48]:
len(important_df)

391003

In [49]:
# Remove duplicates
important_df = important_df.drop_duplicates()
len(important_df)

268074

In [50]:
# Sort the DataFrame
sorted_df = important_df.sort_values(by='emb_105', ascending=True)

In [51]:
for idx, row in sorted_df[:500].iterrows():
    print(row['cat'], row.emb_105, row.text)

1.0 -0.734595835 джанан
1.0 -0.73031199 хан
1.0 -0.726594269 банан
1.0 -0.726593971 банан
1.0 -0.721832454 аж
1.0 -0.718275011 ферран
1.0 -0.718231976 кран
1.0 -0.693701863 кан
1.0 -0.660895944 в клане
1.0 -0.650688469 брант
0.0 -0.65068841 брант
1.0 -0.641317904 планшет
1.0 -0.640630782 фр
1.0 -0.625479281 манжет
1.0 -0.622695923 келлер
1.0 -0.612567544 план
1.0 -0.608062208 канеш
1.0 -0.597115993 жан ван
1.0 -0.593762934 лиран
1.0 -0.59192276 атлант
0.0 -0.59192276 атлант
1.0 -0.591922462 атлант
0.0 -0.591922462 атлант
1.0 -0.591922402 атлант
1.0 -0.589240015 банка
1.0 -0.585074425 планерная
1.0 -0.585074186 планерная
1.0 -0.584420383 ван
1.0 -0.584419966 ван
1.0 -0.583566368 а брант
1.0 -0.582050383 коран
1.0 -0.579773605 барант
1.0 -0.57703954 пан
1.0 -0.574379206 ветеран
1.0 -0.569002926 дан
1.0 -0.569002807 дан
1.0 -0.569002628 дан
1.0 -0.566635132 алан
1.0 -0.566634953 алан
1.0 -0.566634834 алан
1.0 -0.566634655 алан
1.0 -0.566287637 ваня
0.0 -0.562460959 кит
1.0 -0.562460482 ки

In [54]:
for idx, row in sorted_df[-500:].iterrows():
    print(row['cat'], row.emb_105, row.text)

1.0 0.360162169 я думаю что нет
0.0 0.360170752 ну мы
1.0 0.360357225 до ремонта что
1.0 0.360465735 гости да
1.0 0.360755473 ну да где работа
1.0 0.361055344 икону
1.0 0.361508578 снег
1.0 0.361684591 газовая да
1.0 0.361923039 давно дом
1.0 0.361973405 на песню
1.0 0.362042546 вы галочкой
1.0 0.362440616 на русском да
1.0 0.36248672 ну что ж понятно
0.0 0.362572193 где
1.0 0.362572193 где
1.0 0.362572223 где
0.0 0.362572283 где
1.0 0.362572402 где
0.0 0.362572402 где
1.0 0.362572432 где
1.0 0.3625727 где
1.0 0.362597764 ты чудо я могу
1.0 0.36284554 ты чудо
1.0 0.36284712 ну что да
1.0 0.362917125 поэтому точнее
1.0 0.363129228 до ясенева
1.0 0.363221377 гостя
1.0 0.363276452 по-домашнему
1.0 0.363340765 да память девичья
1.0 0.363551766 потом я
1.0 0.363562852 по звонку да
1.0 0.363610536 госет
1.0 0.363696814 голос
1.0 0.364066184 дом два
1.0 0.364066362 дом два
1.0 0.364066541 дом два
1.0 0.364462465 да ну там
1.0 0.36452648 геннадий георгиевич да
1.0 0.364578873 гаснет пламя да
1