In [103]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import optuna
from tqdm import tqdm

import pymorphy3
from catboost import CatBoostClassifier, Pool, cv
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import stop_words
import string
import re

import warnings
warnings.filterwarnings('ignore')

random_state = 52

%matplotlib inline

sns.set(style='darkgrid')

In [104]:
train_df = pd.read_csv('train.csv')

train_df.head()

Unnamed: 0,ID,url,title,label
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,0
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0
3,3,colorbox.spb.ru,Не Беси Меня Картинки,0
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0


In [105]:
train_df.drop(columns=['ID'], inplace=True)

In [106]:
russian_stopwords = stop_words.get_stop_words('ru')

def remove_punctuation(text):
    return ''.join([ch if ch not in string.punctuation else ' ' for ch in text])
    
def remove_numbers(text):
    return ''.join([i if not i.isdigit() else ' ' for i in text])

def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text, flags=re.I)

prep_title = [remove_multiple_spaces(remove_punctuation(text.lower())) for text in train_df['title'].astype('str')]
train_df['text_title'] = prep_title
train_df['text_url'] = train_df['url'].apply(lambda x: ' '.join(x.split('.')))

train_df.head()

Unnamed: 0,url,title,label,text_title,text_url
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0,экс министр экономики молдовы главе мидэи цель...,m kp md
1,www.kp.by,Эта песня стала известна многим телезрителям б...,0,эта песня стала известна многим телезрителям б...,www kp by
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0,банши 4 сезон 2 серия бремя красоты смотреть о...,fanserials tv
3,colorbox.spb.ru,Не Беси Меня Картинки,0,не беси меня картинки,colorbox spb ru
4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0,в новомосковске сыграют следж хоккеисты алекси...,tula-sport ru


In [107]:
stop_words = ['porn', 'sex', 'erotic', 'adults', 'xxx', 'naked', 'intimate', 'hardcore', 'lesbians', 'fetish', 'masturbation', 
            'anal', 'oral', 'pornstars', 'sexual', 'sperm', 'swingers', 'bondage', 'bdsm', 'vibrator', 'cunnilingus', 'dildo',
            'ejaculation', 'hot', 'guy', 'xvideos', 'blowjob', 'cumshot', 'incest', 'tits', 'fuck',
            'порно', 'секс', 'эротика', 'взрослый', 'голый', 'интим', 'жесткий', 'лесбиянка', 'фетиш', 'дрочка', 'анал',
            'оральный', 'порнозвезда', 'сперма', 'мастурбация', 'групповой', 'свингеры', 'бондаж', 'бдсм', 'вибратор', 'кунилингус',
            'фото', 'инцест', 'волосатый', 'минет', 'домашний', 'мамочка', 'зрелый', 'молоденький', 'сучка', 'групповуха',
            'хардкор', 'попка', 'сосать', 'milfs', 'moms', 'cocks', 'pussy', 'nudes', 'sexsy', 'mommy', 'videos',
             'хуй', 'nude', 'video', 'sexy', 'porno', 'кончать', 'член', 'оргазм', 'привязать', '24xxx', 'грудастый', 'por', 'sexo',
             'sweet', 'camshooker', 'долбиться', 'блондинка', 'gay', 'big', 'butts', 'teens', 'pornhub', '18yo', 'creampied', 'тёлка',
             'раздеться', 'красотка', 'страпонить', 'горячий', 'biqle', 'девка', 'писька', 'задница', 'хоумвидео', 'шлюшка', 'трахаться',
             'трахать', 'эротический', 'эскорт', 'пизда', 'дрючить', 'небритый', 'дрочить', 'клитор', 'стояк', 'отсосать', 'шлюха',
             'бритый', 'жопа']

morph = pymorphy3.MorphAnalyzer()

stop_words = [morph.parse(word)[0].normal_form for word in stop_words]

In [108]:
train_df['url_len'] = train_df.url.str.len()
train_df['title_len'] = train_df.title.str.len()

In [109]:
def generate_trigrams(text):
    trigrams = [text[i:i+3] for i in range(len(text) - 2)]
    return trigrams

In [112]:
def vectorize_title(train_df, train=True):
    morph = pymorphy3.MorphAnalyzer()
    
    lemm_texts_list = []
    lemm_trigram = []
    for text in tqdm(train_df['text_title']):
        text_lem = [morph.parse(word)[0].normal_form for word in text.split(' ') if word not in russian_stopwords]

        
        # if train and len(text_lem) <= 0:
        #     lemm_texts_list.append('')
        #     continue
        lemm_trigram.append(generate_trigrams(text))   
        lemm_texts_list.append(' '.join(text_lem))

        
        
    train_df['text_lemm'] = lemm_texts_list
    # if train:
    #     train_df = train_df[train_df['text_lemm'] != '']
        
    train_df['trigram_title'] = lemm_trigram
    train_df['stop_count_title'] = train_df['text_lemm'].apply(lambda x: sum(1 for word in x.split(' ') if word in stop_words))
    train_df['stop_count_title'] += train_df['trigram_title'].apply(lambda x: sum(1 for word in x if word in ['sex', 'por', 'xxx',
                                                                                                         'hdx', 'orn', 'gir',
                                                                                                         'irl', 'rls', 'vka',
                                                                                                         'dev', 'rno', 'sek'
                                                                                                         'kis','adu', 'dul',
                                                                                                         'ebu', 'boo', 'oob',
                                                                                                         'obs', 'пор', 'орн',
                                                                                                         'рно', 'сек', 'екс']))
    
    train_df.drop(columns=['text_title'], inplace=True)
    train_df.drop(columns=['title'], inplace=True)
    train_df.drop(columns=['trigram_title'], inplace=True)
    
    return train_df

In [113]:
def vectorize_url(train_df):
    lemm_texts_list = []
    lemm_trigram = []
    for text in tqdm(train_df['text_url']):
        text_lem = [word for word in text.split(' ') if word not in ['www', 'by', 'ru', 'com', 'ua', 'net', 'org']]
        
        lemm_texts_list.append(' '.join(text_lem))

        lemm_trigram.append(generate_trigrams(text))
        
    train_df['lemm_url'] = lemm_texts_list
    train_df['trigram_url'] = lemm_trigram
    train_df['stop_count_url'] = train_df['trigram_url'].apply(lambda x: sum(1 for word in x for s in word.split('-') if s in 
                                                               ['sex', 'por', 'xxx',
                                                                'hdx', 'orn', 'gir',
                                                                'irl', 'rls', 'vka',
                                                                'dev', 'rno', 'sek'
                                                                'kis', '365',
                                                                'adu', 'dul', 'ebu',
                                                                'ful', 'boo', 'oob',
                                                                'obs', 'lov', 'ove']))
    train_df.drop(columns=['text_url'], inplace=True)
    train_df.drop(columns=['url'], inplace=True)
    train_df.drop(columns=['trigram_url'], inplace=True)

    return train_df

In [114]:
X = train_df
y = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, test_size=0.25, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=random_state, test_size=0.25, stratify=y_train)

In [115]:
X_f = pd.concat([X_train, X_val])
y_f = pd.concat([y_train, y_val])
X_f = vectorize_title(X_f)

X_train = vectorize_title(X_train)
X_val = vectorize_title(X_val, train=False)
X_test = vectorize_title(X_test, train=False)

100%|█████████████████████████████████| 101481/101481 [01:22<00:00, 1224.36it/s]
100%|███████████████████████████████████| 76110/76110 [01:04<00:00, 1182.49it/s]
100%|███████████████████████████████████| 25371/25371 [00:22<00:00, 1141.30it/s]
100%|███████████████████████████████████| 33828/33828 [00:27<00:00, 1244.01it/s]


In [116]:
y_val = X_val['label']
X_val = vectorize_url(X_val).drop(columns=['label'])

y_train = X_train['label']
X_train = vectorize_url(X_train).drop(columns=['label'])

y_test = X_test['label']
X_test = vectorize_url(X_test).drop(columns=['label'])

y_f = X_f['label']
X_f = vectorize_url(X_f).drop(columns=['label'])

100%|█████████████████████████████████| 25371/25371 [00:00<00:00, 244247.91it/s]
100%|██████████████████████████████████| 76110/76110 [00:01<00:00, 55244.88it/s]
100%|█████████████████████████████████| 33828/33828 [00:00<00:00, 251636.37it/s]
100%|███████████████████████████████| 101481/101481 [00:00<00:00, 296396.84it/s]


In [117]:
def fit_model(train_pool, test_pool, **kwargs):
    model = CatBoostClassifier(random_seed=random_state, iterations=1000,
                               eval_metric='TotalF1', **kwargs)
    
    return model.fit(train_pool, eval_set=test_pool, 
                     verbose=300, use_best_model=True)

In [118]:
train_pool = Pool(data=X_train, label=y_train, 
                  text_features=['text_lemm', 'lemm_url',])
valid_pool = Pool(data=X_val, label=y_val, 
                  text_features=['text_lemm', 'lemm_url',])
test_pool = Pool(data=X_test, label=y_test, 
                  text_features=['text_lemm', 'lemm_url',])

Xf_pool = Pool(data=X_f, label=y_f, 
                  text_features=['text_lemm', 'lemm_url',])

In [19]:
model = fit_model(train_pool, valid_pool, learning_rate=0.3,
                  dictionaries = [{
                      'dictionary_id':'Word',
                      'max_dictionary_size': '50000'
                  }],
                 feature_calcers = ['BoW:top_tokens_count=10000'])
y_pred = model.predict(test_pool)
f1_score(y_test, y_pred)

0:	learn: 0.9355192	test: 0.9321189	best: 0.9321189 (0)	total: 214ms	remaining: 3m 34s
300:	learn: 0.9948734	test: 0.9871869	best: 0.9872833 (289)	total: 1m 14s	remaining: 2m 53s
600:	learn: 0.9983336	test: 0.9880139	best: 0.9880139 (596)	total: 2m 30s	remaining: 1m 39s
900:	learn: 0.9995842	test: 0.9883040	best: 0.9883500 (878)	total: 3m 46s	remaining: 24.9s
999:	learn: 0.9995842	test: 0.9884959	best: 0.9884959 (999)	total: 4m 12s	remaining: 0us

bestTest = 0.9884958653
bestIteration = 999



0.9567634342186534

In [119]:
model = fit_model(Xf_pool, test_pool, learning_rate=0.48,
                  dictionaries = [{
                      'dictionary_id':'Word',
                      'max_dictionary_size': '50000'
                  }],
                 feature_calcers = ['BoW:top_tokens_count=10000'])
y_pred = model.predict(test_pool)
f1_score(y_test, y_pred)

0:	learn: 0.9536559	test: 0.9549840	best: 0.9549840 (0)	total: 297ms	remaining: 4m 56s
300:	learn: 0.9973711	test: 0.9935138	best: 0.9935138 (296)	total: 1m 37s	remaining: 3m 46s
600:	learn: 0.9993292	test: 0.9941445	best: 0.9941445 (599)	total: 3m 12s	remaining: 2m 7s
900:	learn: 0.9995267	test: 0.9944123	best: 0.9944718 (712)	total: 4m 47s	remaining: 31.6s
999:	learn: 0.9995563	test: 0.9943541	best: 0.9944718 (712)	total: 5m 19s	remaining: 0us

bestTest = 0.9944717698
bestIteration = 712

Shrink model to first 713 iterations.


0.9774654712866488

In [122]:
X_train

Unnamed: 0,url_len,title_len,text_lemm,stop_count_title,lemm_url,stop_count_url
126744,7,83.0,ответ mail ru индивидуализация помочь пожалуйс...,0,mail,0
50620,12,88.0,вакуумный усилитель тормоз toyota gaia acm10 a...,0,baza drom,0
90787,10,95.0,роздравнадзор сообщать отмена государственный ...,0,gmpnews,0
134973,14,45.0,мягкий игрушка джек рассесть терьер babytoy,0,babytoy,0
94159,30,141.0,электробатарея экономный отопление продажа цен...,0,ekonomnoe-otoplenie uaprom,0
...,...,...,...,...,...,...
4324,15,23.0,are not gay porn please,4,kindprotect xyz,0
9116,18,67.0,вакансия ведущий разработчик пермь работа ивс ...,0,ekaterinburg hh,0
1172,17,54.0,attractive demise 4 vore fan page 9 eggporncomics,2,eggporncomics,2
79031,11,36.0,тоо quantom проверка контрагент,0,pk uchet kz,0


In [120]:
test_df = pd.read_csv('test.csv')

test_df.head()
test_df = test_df 

In [121]:
morph = pymorphy3.MorphAnalyzer()

prep_title = [remove_multiple_spaces(remove_numbers(remove_punctuation(text.lower()))) for text in test_df['title'].astype('str')]
test_df['text_title'] = prep_title
test_df['text_url'] = test_df['url'].apply(lambda x: ' '.join(x.split('.')))

test_df['url_len'] = test_df.url.str.len()
test_df['title_len'] = test_df.title.str.len()
#test_df['count_word'] = test_df['text_title'].apply(lambda x: len(x.split(' ')))
lemm_trigram_t = []
lemm_texts_list = []
for text in tqdm(test_df['text_title']):
    text_lem = [morph.parse(word)[0].normal_form for word in text.split(' ') if word not in russian_stopwords]
    lemm_texts_list.append(' '.join(text_lem))
    lemm_trigram_t.append(generate_trigrams(text))

test_df['text_lemm'] = lemm_texts_list
#test_df = test_df[test_df['text_lemm'] != '']

lemm_texts_list = []
lemm_trigram = []
for text in tqdm(test_df['text_url']):
    text_lem = [word for word in text.split(' ') if word not in ['www', 'by', 'ru', 'com', 'ua', 'net', 'org']]
        
    lemm_texts_list.append(' '.join(text_lem))
    trigram = lemm_trigram.append(generate_trigrams(text))

test_df['trigram_title'] = lemm_trigram
test_df['lemm_url'] = lemm_texts_list
test_df['trigram_url'] = lemm_trigram
test_df['stop_count_title'] = test_df['text_lemm'].apply(lambda x: sum(1 for word in x.split(' ') if word in stop_words))

test_df['stop_count_title'] += test_df['trigram_title'].apply(lambda x: sum(1 for word in x if word in ['sex', 'por', 'xxx',
                                                                                                         'hdx', 'orn', 'gir',
                                                                                                         'irl', 'rls', 'vka',
                                                                                                         'dev', 'rno', 'sek'
                                                                                                         'kis','adu', 'dul',
                                                                                                         'ebu', 'boo', 'oob',
                                                                                                         'obs', 'пор', 'орн',
                                                                                                         'рно', 'сек', 'екс']))
test_df['stop_count_url'] = test_df['trigram_url'].apply(lambda x: sum(1 for word in x for s in word.split('-') if s in 
                                                               ['sex', 'por', 'xxx',
                                                                'hdx', 'orn', 'gir',
                                                                'irl', 'rls', 'vka',
                                                                'dev', 'rno', 'sek'
                                                                'kis', '365',
                                                                'adu', 'dul', 'ebu',
                                                                'ful', 'boo', 'oob',
                                                                'obs', 'lov', 'ove']))


100%|█████████████████████████████████| 165378/165378 [02:17<00:00, 1198.82it/s]
100%|███████████████████████████████| 165378/165378 [00:01<00:00, 106827.55it/s]


In [49]:
id = test_df['ID']
id

0         135309
1         135310
2         135311
3         135312
4         135313
           ...  
165373    300682
165374    300683
165375    300684
165376    300685
165377    300686
Name: ID, Length: 165378, dtype: int64

In [123]:
test_df = test_df[['url_len', 'title_len', 'text_lemm', 'stop_count_title', 'lemm_url', 'stop_count_url',]]
test_pool = Pool(data=test_df, text_features=['text_lemm', 'lemm_url',])

In [124]:
test_df['label'] = model.predict(test_pool)

In [125]:
answer = pd.merge(test_df, id, left_index=True, right_index=True)

In [126]:
answer[['ID', 'label']].to_csv('test_pred.csv', index=False)

!cat ml_baseline.csv | head

cat: ml_baseline.csv: No such file or directory
