In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
import numpy as np

In [2]:
regular = pd.read_csv('regular.csv', header = None)
regular.columns = ['name']
regular['spam'] = 0
spam = pd.read_csv('spam.csv', header = None)
spam.columns = ['name']
spam['spam'] = 1

In [3]:
def data_augmentation(df):
    df['latin'] = df['name'].apply(lambda x: len(re.findall(r'[a-zA-Z]', x)))
    df['cyrilic'] = df['name'].apply(lambda x: len(re.findall(r'[а-яА-Я]', x)))
    df['numbers'] = df['name'].apply(lambda x: len(re.findall(r'[0-9]', x)))
    df['others'] = df['name'].apply(lambda x: len(re.findall(r'[^\x00-\x7Fа-яА-ЯёЁ0-9]', x)))
    df['others_l'] = df['name'].apply(lambda x: len(re.findall(r'[^\x00-\x7Fа-яА-ЯёЁ0-9]',x[:int(len(x)/2)])))
    df['others_r'] = df['name'].apply(lambda x: len(re.findall(r'[^\x00-\x7Fа-яА-ЯёЁ0-9]',x[int(len(x)/2):])))
    df['lc'] = df['cyrilic']/df['latin']
    return df

In [4]:
spam = data_augmentation(spam)
regular = data_augmentation(regular)

In [5]:
spam.head(3)

Unnamed: 0,name,spam,latin,cyrilic,numbers,others,others_l,others_r,lc
0,💋💋💋 ПOUСК_SЕКS_PАРТНЕРOV 💋💋💋,1,7,11,0,6,3,3,1.571429
1,❤️❤️❤️ КRASИBЫE ДЕВУШКИ 💋💋💋,1,5,10,0,9,6,3,2.0
2,❤️❤️❤️ ПОИSК_ПАPЫ_HA_ВE4ЕR ❤️❤️❤️,1,6,9,1,12,6,6,1.5


In [6]:
regular.head(3)

Unnamed: 0,name,spam,latin,cyrilic,numbers,others,others_l,others_r,lc
0,Yerzhan Auyezov,0,14,0,0,0,0,0,0.0
1,Даниил Панченко,0,0,14,0,0,0,0,inf
2,Алексей Бурков,0,0,13,0,0,0,0,inf


### Prepare data

In [7]:
X = pd.concat([spam,regular])
y = X.spam
X.drop(['spam'], axis=1, inplace = True)
X.head(3)

Unnamed: 0,name,latin,cyrilic,numbers,others,others_l,others_r,lc
0,💋💋💋 ПOUСК_SЕКS_PАРТНЕРOV 💋💋💋,7,11,0,6,3,3,1.571429
1,❤️❤️❤️ КRASИBЫE ДЕВУШКИ 💋💋💋,5,10,0,9,6,3,2.0
2,❤️❤️❤️ ПОИSК_ПАPЫ_HA_ВE4ЕR ❤️❤️❤️,6,9,1,12,6,6,1.5


In [8]:
y.head(3)

0    1
1    1
2    1
Name: spam, dtype: int64

In [9]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

In [10]:
cat_features = ['name']

In [11]:
eval_dataset = Pool(
    data=X_validation,
    label=y_validation,
    cat_features=cat_features
)

### Train

In [12]:
model = CatBoostClassifier()
model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=eval_dataset
)

Learning rate set to 0.015617
0:	learn: 0.6716372	test: 0.6754698	best: 0.6754698 (0)	total: 51ms	remaining: 51s
1:	learn: 0.6519503	test: 0.6566047	best: 0.6566047 (1)	total: 52.1ms	remaining: 26s
2:	learn: 0.6270837	test: 0.6320545	best: 0.6320545 (2)	total: 53.5ms	remaining: 17.8s
3:	learn: 0.6123507	test: 0.6193833	best: 0.6193833 (3)	total: 55ms	remaining: 13.7s
4:	learn: 0.5986890	test: 0.6043490	best: 0.6043490 (4)	total: 55.7ms	remaining: 11.1s
5:	learn: 0.5837916	test: 0.5878318	best: 0.5878318 (5)	total: 56.5ms	remaining: 9.36s
6:	learn: 0.5660057	test: 0.5692111	best: 0.5692111 (6)	total: 57.5ms	remaining: 8.16s
7:	learn: 0.5476810	test: 0.5505424	best: 0.5505424 (7)	total: 58.4ms	remaining: 7.24s
8:	learn: 0.5260069	test: 0.5279370	best: 0.5279370 (8)	total: 59ms	remaining: 6.5s
9:	learn: 0.5094461	test: 0.5120357	best: 0.5120357 (9)	total: 59.7ms	remaining: 5.91s
10:	learn: 0.4957713	test: 0.5050756	best: 0.5050756 (10)	total: 60.6ms	remaining: 5.45s
11:	learn: 0.4803451	t

273:	learn: 0.0163586	test: 0.0346863	best: 0.0346863 (273)	total: 253ms	remaining: 671ms
274:	learn: 0.0162844	test: 0.0345772	best: 0.0345772 (274)	total: 254ms	remaining: 670ms
275:	learn: 0.0162101	test: 0.0344256	best: 0.0344256 (275)	total: 255ms	remaining: 668ms
276:	learn: 0.0161294	test: 0.0343314	best: 0.0343314 (276)	total: 255ms	remaining: 667ms
277:	learn: 0.0160703	test: 0.0342867	best: 0.0342867 (277)	total: 256ms	remaining: 665ms
278:	learn: 0.0159824	test: 0.0340927	best: 0.0340927 (278)	total: 257ms	remaining: 664ms
279:	learn: 0.0158976	test: 0.0339201	best: 0.0339201 (279)	total: 258ms	remaining: 662ms
280:	learn: 0.0158275	test: 0.0337911	best: 0.0337911 (280)	total: 258ms	remaining: 661ms
281:	learn: 0.0157445	test: 0.0336900	best: 0.0336900 (281)	total: 259ms	remaining: 659ms
282:	learn: 0.0156811	test: 0.0336160	best: 0.0336160 (282)	total: 260ms	remaining: 658ms
283:	learn: 0.0156164	test: 0.0335229	best: 0.0335229 (283)	total: 260ms	remaining: 656ms
284:	learn

562:	learn: 0.0067763	test: 0.0169738	best: 0.0169738 (562)	total: 433ms	remaining: 336ms
563:	learn: 0.0067621	test: 0.0169412	best: 0.0169412 (563)	total: 433ms	remaining: 335ms
564:	learn: 0.0067476	test: 0.0169064	best: 0.0169064 (564)	total: 434ms	remaining: 334ms
565:	learn: 0.0067331	test: 0.0168718	best: 0.0168718 (565)	total: 434ms	remaining: 333ms
566:	learn: 0.0067191	test: 0.0168396	best: 0.0168396 (566)	total: 435ms	remaining: 332ms
567:	learn: 0.0067052	test: 0.0168076	best: 0.0168076 (567)	total: 436ms	remaining: 331ms
568:	learn: 0.0066913	test: 0.0167757	best: 0.0167757 (568)	total: 436ms	remaining: 330ms
569:	learn: 0.0066779	test: 0.0167466	best: 0.0167466 (569)	total: 437ms	remaining: 330ms
570:	learn: 0.0066642	test: 0.0167150	best: 0.0167150 (570)	total: 437ms	remaining: 329ms
571:	learn: 0.0066509	test: 0.0166862	best: 0.0166862 (571)	total: 438ms	remaining: 328ms
572:	learn: 0.0066373	test: 0.0166548	best: 0.0166548 (572)	total: 438ms	remaining: 327ms
573:	learn

874:	learn: 0.0041636	test: 0.0109375	best: 0.0109375 (874)	total: 613ms	remaining: 87.6ms
875:	learn: 0.0041584	test: 0.0109256	best: 0.0109256 (875)	total: 614ms	remaining: 86.8ms
876:	learn: 0.0041544	test: 0.0109158	best: 0.0109158 (876)	total: 614ms	remaining: 86.1ms
877:	learn: 0.0041493	test: 0.0109039	best: 0.0109039 (877)	total: 615ms	remaining: 85.4ms
878:	learn: 0.0041442	test: 0.0108920	best: 0.0108920 (878)	total: 615ms	remaining: 84.7ms
879:	learn: 0.0041402	test: 0.0108823	best: 0.0108823 (879)	total: 616ms	remaining: 83.9ms
880:	learn: 0.0041351	test: 0.0108705	best: 0.0108705 (880)	total: 616ms	remaining: 83.2ms
881:	learn: 0.0041300	test: 0.0108587	best: 0.0108587 (881)	total: 617ms	remaining: 82.5ms
882:	learn: 0.0041249	test: 0.0108470	best: 0.0108470 (882)	total: 617ms	remaining: 81.8ms
883:	learn: 0.0041199	test: 0.0108352	best: 0.0108352 (883)	total: 618ms	remaining: 81.1ms
884:	learn: 0.0041159	test: 0.0108256	best: 0.0108256 (884)	total: 618ms	remaining: 80.3ms

<catboost.core.CatBoostClassifier at 0x7f3470f93370>

In [13]:
model.save_model('catboost_spam.model')

In [14]:
model = CatBoostClassifier()
model.load_model('catboost_spam.model')

<catboost.core.CatBoostClassifier at 0x7f3470f22130>

### predict

In [15]:
name = '💥💥💥 ФЛIPТ БЕ3 РAMOK 🌶🌶🌶'
validation = data_augmentation(pd.DataFrame([name], columns = ['name']))
pred = model.predict(validation)
spam = pred[0]
spam

1

In [16]:
name = 'Yuriy 👤 Grinev'
validation = data_augmentation(pd.DataFrame([name], columns = ['name']))
pred = model.predict(validation)
spam = pred[0]
spam

0