In [2]:
import pandas as pd
import numpy as np
import warnings
import joblib
import gc
import re
import nltk


warnings.filterwarnings(action="ignore")


from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [None]:
nltk.download("stopwords")

## исходные данные

In [5]:
train_avito_data = pd.read_csv("data/train.csv")

In [3]:
val_avito_data = pd.read_csv("data/val.csv")

## сэмплирование train

In [242]:
sampler = RandomUnderSampler(random_state=0)

X_train = train_avito_data.drop("is_bad", axis=1).copy()
y_train = train_avito_data.is_bad.copy()

X_res, y_res = sampler.fit_resample(X_train, y_train)

In [244]:
train_avito_data_res = X_res.copy()
train_avito_data_res["is_bad"] = y_res.copy()

# Подготовка данных

1) выделим параметры `phone_number_NOT_given_with_digits`, `contains_phone_number`, `contains_link`, `contains_vk`, `contains_fb`, `contains_instagram`. Затем сохраним те, которые будут соответствовать фродовому случаю.

2) По-моему мнению, в контексте авито, чтобы не портить опыт пользователей продавцов (которые приносят основный доход) в нашем случае лучше недоловить фродстеров (мошенников), чем переловить даже неважные кейсы. Другими словами помечать только те кейсы, где есть явный паттерн нарушения.


- добавил вайбер `contains_viber`

In [271]:
#Create lemmatizer and stopwords list
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

#Preprocess function
def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text

# для подсчета цифр в тексте
def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text


def contains_contact_info_minimal(df):

    df["description_title"] = df["description"] + " " + df["title"]
    
    # слова-цифры
    words_numbers = ["ноль","один","два", "три", "четыре", "пять", "шесть", "семь","девять", "десять",
                    "одиннадцать", "двенадцать", "тринадцать", "четырнадцать", "пятнадцать", "шесть"
                    "шестнадцать","семнадцать","восемнадцать","девятнадцать","двадцать","тридцать",
                    "сорок","пятьдесят","шестьдесят","семьдесят","восемьдесят","девяносто",
                    "сто","двести","триста","четыреста","пятьсот","шестьсот","семьсот","восемьсот","девятьсот"]
    
    df["contains_word_number"] = df["description_title"].apply(lambda x: \
                                    any(word_number in x for word_number in words_numbers))
        
    df["contains_link"] = df["description_title"].str.contains("https://") |\
                                df["description_title"].str.contains(".ru")

    df["contains_vk"] = df["description_title"].str.contains("vk") | \
                                df["description_title"].str.contains("вконтакте") | \
                                df["description_title"].str.contains("в контакте")

    df["contains_inst"] = df["description_title"].str.contains("instagram") | \
                                df["description_title"].str.contains("инстагр")

    df["contains_fb"] = df["description_title"].str.contains("facebook") | \
                                df["description_title"].str.contains("фейсбук")
    
    df["contains_mail"] = df["description_title"].str.contains("mail") | \
                                df["description_title"].str.contains("@")
    
    df["contains_viber"] = df["description_title"].str.contains("viber") | \
                                df["description_title"].str.contains("вайбер")
    
    df["contains_whatsapp"] = df["description_title"].str.contains("whatsapp") | \
                                df["description_title"].str.contains("ватсап")
    
    #с наиб. вероятностью содержится номер телефона
    df["contains_phone_number"] = df["description_title"]\
                                    .str.contains("^((8|\+7)[\- ]?)?(\(?\d{3}\)?[\- ]?)?[\d\- ]{7,10}$")
    
    # еще одна проверка на наличие телефона
    df["description_numbers"] = df["description"].apply(lambda x: "".join(re.findall(r'(\d+)', x)))
    
    df["contains_phone_number_in_extracted_numbers"] = df["description_numbers"]\
                                    .str.contains("^((8|\+7)[\- ]?)?(\(?\d{3}\)?[\- ]?)?[\d\- ]{7,10}$")
    
    return df


def count_digits(string):
    return sum(item.isdigit() for item in string)


def prepare_data(train_data=None, val_data=None):
    
    
    if train_data is not None and val_data is None:
        df = train_data.copy()
    if val_data is not None and train_data is not None:
        df = val_data.copy()
    
    #проведем лематизацию title и description
    df["description_lem"] = df["description"].apply(lambda x: preprocess_text(x))
    df["title_lem"] = df["title"].apply(lambda x: preprocess_text(x))
    
    #кол-во пересечений по словам в сырой и лем версии
    # гипотеза - цель фродстера пропиариться в описании, а не продать что-то, поэтому пересечений в его случае
    # должно быть меньше
    df["intersection_title_desc"] = [len(set(a).intersection(b)) for a, b in zip(df.title, df.description)]
    df["intersection_title_desc_lem"] = [len(set(a).intersection(b)) for a, b \
                                         in zip(df.title_lem, df.description_lem)]
    

    
    # улучшим разметку фрода в train
    if train_data is not None and val_data is None:
        
        fraud = df[df["is_bad"]==1].copy()
        
        fraud = contains_contact_info_minimal(fraud)
        
        developed_fraud = fraud.query("contains_word_number == True or \
                contains_link == True or contains_vk == True or contains_inst == True or \
                contains_fb == True or contains_phone_number==True or contains_mail==True or\
                contains_viber == True or contains_phone_number_in_extracted_numbers==True or\
                contains_whatsapp == True").copy()

        df.loc[df.index.isin(developed_fraud.index), "fraud_developed"] = 1
        df.loc[~(df.index.isin(developed_fraud.index)), "fraud_developed"] = 0
        
        # вернем старое название
        df = df.drop("is_bad", axis=1).rename(columns={"fraud_developed": "is_bad"})
        
        del fraud
        del developed_fraud
        
        #gc.collect()
    
    
    
    print("step_1 completed")
    
    
    
    # из EDA получилось выделить слова-тригеры, здесь использован эвристический способ расчета  
    # метрики задевания слов тригеров
    counter = Counter()

    df['title_lem'].apply(lambda x: counter.update(x.split(" ")))
    
    words_triggers_title = ['м²', 'эта', 'квартира','участок','сотня','продавать','дом','ваз','5','6',
                            '4','3','2','1','iphone','lada','ижс','новый','гараж','шина','котенок',
                            'дача','коляска','комната','платье','колесо','7','продаваться','ремонт',
                            'рука','диван','велосипед','priora','8','газ','кровать','детский','диск',
                            'отдавать','зимний','газель','работа','samara','студия','щенок','дверь',
                            'детская','samsung', 'маникюр','toyota','запчасть','днп','снт','резина',
                            'наушник','репетитор','машина', 'корова']
    
    df_words_triggers_title = pd.DataFrame(counter.most_common(500))

    df_words_triggers_title[0] = df_words_triggers_title[0].apply(preprocess_text)

    df_words_triggers_title[1] = df_words_triggers_title[1]/ 1000

    df_words_triggers_title = df_words_triggers_title[df_words_triggers_title[0].str.strip()!=""]

    df_words_triggers_title = df_words_triggers_title.rename(columns={0:"word", 1:"count"})\
                                    .groupby("word").apply(sum).sort_values("count", ascending=False)\
                                    ["count"].reset_index()
    
    df_words_triggers_title = df_words_triggers_title[df_words_triggers_title["word"]\
                                    .isin(words_triggers_title)]
    
    df["title_lem_proba_fraud"] = df["title_lem"].str.lower().apply(lambda x: df_words_triggers_title\
                            [df_words_triggers_title["word"].isin(x.split(" "))]["count"].sum())
    
    
    counter = Counter()

    df['description_lem'].apply(lambda x: counter.update(x.split(" ")))
    
    words_triggers_description = ['телефон', 'звонить', 'состояние', 'номер', 'вопрос', 'продавать','цена', 
                                  'квартира','хороший', '8','—\n','торг','это','работа','год','6','5','3',
                                  '4','1','продаваться','2','любой','дом', 'наш','7','очень','магазин',
                                  'отличный','новый','тело','–\n','мочь','большой','наличие','доставка',
                                  'г','связь','обращаться','дома','россия','писать','весь','запчасть','сайт',
                                  '↓','руб','заказ','9']
    
    df_words_triggers_description = pd.DataFrame(counter.most_common(500))

    df_words_triggers_description[0] = df_words_triggers_description[0].apply(preprocess_text)

    df_words_triggers_description[1] = df_words_triggers_description[1]/ 1000

    df_words_triggers_description = df_words_triggers_description[df_words_triggers_description[0].str.strip()!=""]

    df_words_triggers_description = df_words_triggers_description.rename(columns={0:"word", 1:"count"})\
                                    .groupby("word").apply(sum).sort_values("count", ascending=False)\
                                    ["count"].reset_index()
    
    df_words_triggers_description = df_words_triggers_description[df_words_triggers_description["word"]\
                                    .isin(words_triggers_description)]
    
    df["desc_lem_proba_fraud"] = df["description_lem"].str.lower().apply(lambda x: df_words_triggers_description\
                            [df_words_triggers_description["word"].isin(x.split(" "))]["count"].sum())
    
    
    
    print("step_2 completed")
    
    
    
    # уберем явных аутлаеров в категории цены (в train), чтобы они не испортили замену на средние значения
    if train_data is not None and val_data is None:    
        df = df[np.abs(df.price-df.price.mean()) <= (3*df.price.std())]
    
    # обработка пропущенных значени в price (есть пропущенные в обоих train и val)
    df["price"] = df.loc[:,"price"].fillna(df.groupby("subcategory")["price"]\
                               .transform("mean"))

    # добавим метрику отличия от средней цены по субкатегории
    df["average_price_in_subcategory"] = df.groupby('subcategory')['price'].transform('mean')
    
    #поскольку метрика цены давала нам прирост точности 6%, попробуем использовать еще и прозв-ую хар-ку
    df["price_diff_from_subcategory"] = df["price"] - df["average_price_in_subcategory"]

    # добавим вспомогательные, простые параметры текста title, description для модели
    df["description_len"] = df["description"].apply(len)
    df["title_len"] = df["title"].apply(len)
    df["number_of_words_description"] = df["description"].apply(lambda x: len(x.split(" ")))
    df["number_of_words_title"] = df["title"].apply(lambda x: len(x.split(" ")))
    df["number_of_sentences_description"] = df["description"].apply(lambda x: x.count(".")+\
                                                                    x.count("!")+x.count("?"))
    
    # те же самые параметры для лематизированной версии
    df["lem_description_len"] = df["description_lem"].apply(len)
    df["lem_title_len"] = df["title_lem"].apply(len)
    df["lem_number_of_words_description"] = df["description_lem"].apply(lambda x: len(x.split(" ")))
    df["lem_number_of_words_title"] = df["title_lem"].apply(lambda x: len(x.split(" ")))
    df["lem_number_of_sentences_description"] = df["description_lem"].apply(lambda x: x.count(".")+\
                                                                    x.count("!")+x.count("?"))
    
    print("step_3 completed")
    
    
    df["description_title"] = df["description"] + " " + df["title"]
    
    df["number_of_digits"] = df["description_title"].apply(count_digits)
    
    # посчитаем кол-во капса в описании + тайтле, тексте (предположение, что фродстеры более склонны к привлечению внимания)
    df['desc_title_number_of_uppercase'] = df['description_title'].str.findall(r'[A-Z]').str.len()
    
    # посчитаем отношение длины символов (не буквы и цифры) ко всему тексту (значки опять же оружие фродстеров)
    df["description_title_ratio_of_non_words_number"] = 1 - df['description_title'].apply(lambda x: \
                                                      sum(c.isalpha() for c in "".join(str(x).split())) \
                                                    + sum(c.isdigit() for c in "".join(str(x).split()))) \
                                        / df["description_title"].apply(lambda x: len("".join(str(x).split())))
    
    # подсчитаем среднюю длину слова в title, description в исходной и лематизированной версии
    df["avearge_word_len_title"] = df["title"].apply(lambda x: sum(len(word) for word in str(x).split(" ")) \
                                                     / len(str(x).split(" ")))
    df["avearge_word_len_title_lem"] = df["title_lem"].apply(lambda x: sum(len(word) for word in str(x).split(" ")) \
                                                             / len(str(x).split(" ")))
    df["avearge_word_len_desc"] = df["description"].apply(lambda x: sum(len(word) for word in str(x).split(" ")) \
                                                          / len(str(x).split(" ")))
    df["avearge_word_len_desc_lem"] = df["description_lem"].apply(lambda x: sum(len(word) \
                                                            for word in str(x).split(" ")) / len(str(x).split(" ")))
    
    #label encoding для 'subcategory', 'region', 'city' (эти показатели остояли свою важность)
    object_cols = ['subcategory', 'region', 'city']
    
    # важно, чтобы label encoding из train полностью соответствовал validate
    if val_data is not None and train_data is not None:
        label_encoder_subcategory = LabelEncoder()
        label_encoder_region = LabelEncoder()
        label_encoder_city = LabelEncoder()
        
        label_encoder_subcategory.fit(train_data["subcategory"])
        label_encoder_region.fit(train_data["region"])
        label_encoder_city.fit(train_data["city"])
        
        df["subcategory"] = label_encoder_subcategory.transform(df["subcategory"])
        df["region"] = label_encoder_region.transform(df["region"])
        df["city"] = label_encoder_city.transform(df["city"])
    
    if train_data is not None and val_data is None:
        label_encoder = LabelEncoder()
        for col in object_cols:
            df[col] = label_encoder.fit_transform(df[col])
    
    
    
    print("Final step_4 completed, data is prepared, enjoy!")
    
    
    
    return df

# Разбиение данных

In [272]:
train_data = prepare_data(train_avito_data_res)

step_1 completed
step_2 completed
Final step_4 completed, data is prepared, enjoy!


In [273]:
X_train = train_data.drop("is_bad", axis=1).copy()
y_train = train_data.is_bad.copy()

In [274]:
val_data = prepare_data(train_avito_data, val_avito_data)

step_1 completed
step_2 completed
Final step_4 completed, data is prepared, enjoy!


In [275]:
X_val = val_data.drop("is_bad", axis=1).copy()
y_val = val_data.is_bad.copy()

In [276]:
train_data.to_pickle("data/train_data_with_metrics_resampled.p", compression="gzip")
val_data.to_pickle("data/val_data_with_metrics_resampled.p", compression="gzip")

In [278]:
X_train.columns

Index(['title', 'description', 'subcategory', 'category', 'price', 'region',
       'city', 'datetime_submitted', 'description_lem', 'title_lem',
       'intersection_title_desc', 'intersection_title_desc_lem',
       'title_lem_proba_fraud', 'desc_lem_proba_fraud',
       'average_price_in_subcategory', 'price_diff_from_subcategory',
       'description_len', 'title_len', 'number_of_words_description',
       'number_of_words_title', 'number_of_sentences_description',
       'lem_description_len', 'lem_title_len',
       'lem_number_of_words_description', 'lem_number_of_words_title',
       'lem_number_of_sentences_description',
       'lem_to_original_description_len', 'lem_to_original_title_len',
       'lem_to_original_number_of_words_description',
       'lem_to_original_number_of_words_title',
       'lem_to_original_sentences_description', 'description_title',
       'number_of_digits', 'desc_title_number_of_uppercase',
       'description_title_ratio_of_non_words_number', 'avea

# выбор данных для обучения и выбор модели классификации


- т.к. данные сильно несбалансированы, то стоит сделать undersampling
- за базовую модель возьем catboost
- найдем наилучшие параметры с помощью gridsearch

## Feature selection

In [3]:
train_data = pd.read_pickle("data/train_data_with_metrics_resampled.p", compression="gzip")
train_data = train_data.drop_duplicates()

val_data = pd.read_pickle("data/val_data_with_metrics_resampled.p", compression="gzip")

In [4]:
train_data = train_data.drop(['title', 'description', 'category', 'datetime_submitted',
                             "description_lem", "title_lem", "description_title"], axis = 1).copy()

val_data = val_data.drop(['title', 'description', 'category', 'datetime_submitted',
                             "description_lem", "title_lem", "description_title"], axis = 1).copy()

In [5]:
train_data = train_data.drop(["lem_to_original_sentences_description",
                "lem_to_original_description_len", "lem_to_original_title_len",
                "lem_to_original_number_of_words_description", "lem_to_original_number_of_words_title"], axis=1)

train_data["price_diff_from_subcategory"] = abs(train_data["price_diff_from_subcategory"])

In [6]:
val_data = val_data.drop(["lem_to_original_sentences_description",
                "lem_to_original_description_len", "lem_to_original_title_len",
                "lem_to_original_number_of_words_description", "lem_to_original_number_of_words_title"], axis=1)

val_data["price_diff_from_subcategory"] = abs(val_data["price_diff_from_subcategory"])

In [7]:
X_train = train_data.drop("is_bad", axis=1).copy()
y_train = train_data.is_bad.copy()

X_val = val_data.drop("is_bad", axis=1).copy()
y_val = val_data.is_bad.copy()

In [54]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

bestfeatures = SelectKBest(score_func=chi2, k=27)

fit = bestfeatures.fit(X_train, y_train)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)

featureScores.columns = ['Specs','Score']  #naming the dataframe columns
featureScores.sort_values("Score", ascending=False)

Unnamed: 0,Specs,Score
9,price_diff_from_subcategory,30212750000.0
1,price,24924650000.0
8,average_price_in_subcategory,10564040000.0
10,description_len,1854779.0
15,lem_description_len,1595202.0
20,number_of_digits,609286.6
21,desc_title_number_of_uppercase,587163.5
17,lem_number_of_words_description,337326.5
6,title_lem_proba_fraud,257534.3
3,city,231080.0


In [9]:
X_train = X_train[sorted(X_train.columns)]
X_val = X_val[sorted(X_val.columns)]

In [141]:
# для
model_1 = XGBClassifier(random_state = 1, 
                       learning_rate=0.01)

model_1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [142]:
y_pred_1 = model_1.predict_proba(X_val).T[1]

In [82]:
accuracy_score(y_pred, y_val)

0.7729876208659235

In [83]:
confusion_matrix(y_pred, y_val)

array([[12211,  3641],
       [   45,   340]])

In [84]:
roc_auc_score(y_pred, y_val)

0.8267148886944495

In [143]:
model_2 = CatBoostClassifier(random_state=16, 
                            iterations=110,
                            learning_rate=0.1,
                            depth=6,
                            )

model_2.fit(X_train, y_train)

0:	learn: 0.6590800	total: 220ms	remaining: 24s
1:	learn: 0.6326330	total: 389ms	remaining: 21s
2:	learn: 0.6097775	total: 575ms	remaining: 20.5s
3:	learn: 0.5934699	total: 741ms	remaining: 19.6s
4:	learn: 0.5791101	total: 936ms	remaining: 19.7s
5:	learn: 0.5686880	total: 1.09s	remaining: 18.8s
6:	learn: 0.5574059	total: 1.25s	remaining: 18.5s
7:	learn: 0.5480197	total: 1.41s	remaining: 18s
8:	learn: 0.5400411	total: 1.57s	remaining: 17.6s
9:	learn: 0.5336025	total: 1.72s	remaining: 17.2s
10:	learn: 0.5283404	total: 1.88s	remaining: 16.9s
11:	learn: 0.5243932	total: 2.14s	remaining: 17.5s
12:	learn: 0.5200681	total: 2.29s	remaining: 17.1s
13:	learn: 0.5152016	total: 2.48s	remaining: 17s
14:	learn: 0.5115656	total: 2.65s	remaining: 16.8s
15:	learn: 0.5088257	total: 2.81s	remaining: 16.5s
16:	learn: 0.5065214	total: 2.96s	remaining: 16.2s
17:	learn: 0.5040072	total: 3.14s	remaining: 16s
18:	learn: 0.5018871	total: 3.3s	remaining: 15.8s
19:	learn: 0.4996705	total: 3.46s	remaining: 15.6s
2

<catboost.core.CatBoostClassifier at 0x1c4c5373c8>

In [144]:
y_pred_2 = model_2.predict_proba(X_val).T[1]

In [93]:
confusion_matrix(y_pred_2, y_val)

array([[12254,  3860],
       [    2,   121]])

In [94]:
roc_auc_score(y_pred_2, y_val)

0.8720982915426773

In [105]:
roc_auc_score(y_comb, y_val)

0.8292712477110563

In [134]:
model_3 = RandomForestClassifier(random_state = 1, n_estimators=200, 
                                max_depth = 6, warm_start=True)

model_3.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, n_estimators=200, random_state=1,
                       warm_start=True)

In [145]:
y_pred_3 = model_3.predict_proba(X_val).T[1]

In [109]:
roc_auc_score(y_pred_3, y_val)

0.8474948995783651

In [111]:
confusion_matrix(y_pred_3, y_val)

array([[12247,  3853],
       [    9,   128]])

In [208]:
model_4 = KNeighborsClassifier(n_neighbors=7, weights="uniform",leaf_size=20)
model_4.fit(X_train, y_train)

param_grid = {
    'n_neighbors': [3,7,13,18],
    "weights": ["uniform", "distance"],
    "metric": ["eucledian","manhattan"],
    "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
    "leaf_size": [20,30,40],
    
}

In [212]:
y_pred_4 = model_4.predict_proba(X_val).T[1]

In [211]:
roc_auc_score(y_pred_4, y_val)

0.5816806795819155

# комбинация результатов

In [271]:
y_comb = 0.3*y_pred_1 + 0.3*y_pred_2 + 0.3* y_pred_3 + 0.1* y_pred_4
y_comb = np.where(y_comb<0.77, 0, 1)

In [272]:
roc_auc_score(y_comb, y_val)

0.87752587481518

In [268]:
confusion_matrix(y_comb, y_val)

array([[12256,  3941],
       [    0,    40]])

## K-fold на трэйн выборке

In [None]:
kf = KFold(n_splits=5,shuffle=False)
kf.split(X_train)    

accuracy_model = []
errors = []



r_s = range(1,20)

for r in r_s:
    clf = XGBClassifier(random_state=r, base_score=0.9,  eval_metric='auc')
    for train_index, test_index in kf.split(X_train_main):
        # Split train-test
        X_train_fold, y_train_fold = X_train.iloc[train_index], y_train.iloc[train_index]
        # Train the model
        model = clf.fit(X_train_fold, y_train_fold)
        # Append to accuracy_model the accuracy of the model
        try:
            score = roc_auc_score(y_test, model.predict(X_val))
            print(f"rand st {r}: {score}")
        except: 
            errors.append("Oops")