In [99]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [126]:
val_data = pd.read_csv("data/val.csv")

In [4]:
train_avito_data = pd.read_csv("data/train.csv")

In [7]:
def count_digits(string):
    return sum(item.isdigit() for item in string)

## train

In [5]:
fraud = train_avito_data[train_avito_data["is_bad"]==1].copy()

fraud["#_of_digits"] = fraud["description"].apply(count_digits)

# цифрами точно не передан номер, с учетом домашнего телефона
fraud["phone_number_NOT_given_with_digits"] = fraud["#_of_digits"] < 6


fraud["contains_link"] = fraud["description"].str.contains("https://") |\
                            fraud["description"].str.contains(".ru")

fraud["contains_vk"] = fraud["description"].str.contains("vk") | \
                        fraud["description"].str.contains("вконтакте") | \
                        fraud["description"].str.contains("в контакте")

fraud["contains_inst"] = fraud["description"].str.contains("instagram") | \
                        fraud["description"].str.contains("инстагр")

fraud["contains_fb"] = fraud["description"].str.contains("facebook") | \
                        fraud["description"].str.contains("фейсбук")


words_numbers = ["ноль","один","два", "три", "четыре", "пять", "шесть", "семь","девять", "десять",
                    "одиннадцать", "двеннадцать", "триннадцать", "четырнадцать", "пятнадцать", "шесть"
                    "шестнадцать","семнадцать","восемнадцать","девятнадцать","двадцать","тридцать",
                    "сорок","пятьдесят","шестьдесят","семьдесят","восемьдесят","девяносто",
                    "сто","двести","триста","четыреста","пятьсот","шестьсот","семьсот","восемьсот","девятьсот"]

fraud["contains_word_number"] = fraud["description"].apply(lambda x: \
                                    any(word_number in x for word_number in words_numbers))


# точные кейсы, где содержится номер телефона
fraud["contains_phone_number"] = fraud["description"]\
                                .str.contains("^((8|\+7)[\- ]?)?(\(?\d{3}\)?[\- ]?)?[\d\- ]{7,10}$") 

developed_fraud = fraud.query("phone_number_NOT_given_with_digits == False or contains_word_number == True or \
            contains_link == True or contains_vk == True or contains_inst == True or \
            contains_fb == True or contains_phone_number==True").copy()

train_avito_data.loc[train_avito_data.index.isin(developed_fraud.index), "fraud_developed"] = 1
train_avito_data.loc[~(train_avito_data.index.isin(developed_fraud.index)), "fraud_developed"] = 0

avito_data = pd.merge(train_avito_data, fraud)

avito_data_prepared = avito_data[["subcategory", "category", "price", "region", \
                                  "city", "fraud_developed", "#_of_digits", "phone_number_NOT_given_with_digits", 
                                  "contains_link", "contains_vk", "contains_inst", "contains_fb", 
                                  "contains_phone_number", "contains_word_number"]].copy()

category = pd.get_dummies(avito_data_prepared["category"], prefix='category')

avito_data_prepared = pd.concat([avito_data_prepared.drop("category", axis=1), category], axis=1)

avito_data_prepared = avito_data_prepared.rename(columns={'category_Бытовая электроника': "category_consumer_electronics",
                                     'category_Для бизнеса': "category_for_business",
                                     'category_Для дома и дачи': "category_for_house", 
                                     'category_Животные': "category_animals", 
                                     'category_Личные вещи': "category_personal_belongings",
                                     'category_Недвижимость': "category_real_estate", 
                                     'category_Работа': "category_job", 
                                     'category_Транспорт': "category_transport",
                                     'category_Услуги': "category_services", 
                                     'category_Хобби и отдых': "category_hobby_rest"})

label_encoder = LabelEncoder()

object_cols = ['subcategory', 'region', 'city']

for col in object_cols:
    avito_data_prepared[col] = label_encoder.fit_transform(avito_data_prepared[col])
    
bool_cols = ["phone_number_NOT_given_with_digits", 
                                  "contains_link", "contains_vk", "contains_inst", "contains_fb", 
                                  "contains_phone_number", "contains_word_number"]

avito_data_prepared[bool_cols] = avito_data_prepared[bool_cols].astype(int)

avito_data_prepared["price"] = avito_data_prepared.loc[:,"price"].\
                               fillna(avito_data_prepared.groupby("subcategory")["price"]\
                               .transform("mean")).copy()

avito_data_prepared = avito_data_prepared.rename(columns={"fraud_developed":"is_bad"})

## val

In [127]:
val_fraud = val_data[val_data["is_bad"]==1].copy()

val_fraud["#_of_digits"] = val_fraud["description"].apply(count_digits)

# цифрами точно не передан номер, с учетом домашнего телефона
val_fraud["phone_number_NOT_given_with_digits"] = val_fraud["#_of_digits"] < 6


val_fraud["contains_link"] = val_fraud["description"].str.contains("https://") |\
                            val_fraud["description"].str.contains(".ru")

val_fraud["contains_vk"] = val_fraud["description"].str.contains("vk") | \
                        val_fraud["description"].str.contains("вконтакте") | \
                        val_fraud["description"].str.contains("в контакте")

val_fraud["contains_inst"] = val_fraud["description"].str.contains("instagram") | \
                        val_fraud["description"].str.contains("инстагр")

val_fraud["contains_fb"] = val_fraud["description"].str.contains("facebook") | \
                        val_fraud["description"].str.contains("фейсбук")


words_numbers = ["ноль","один","два", "три", "четыре", "пять", "шесть", "семь","девять", "десять",
                    "одиннадцать", "двеннадцать", "триннадцать", "четырнадцать", "пятнадцать", "шесть"
                    "шестнадцать","семнадцать","восемнадцать","девятнадцать","двадцать","тридцать",
                    "сорок","пятьдесят","шестьдесят","семьдесят","восемьдесят","девяносто",
                    "сто","двести","триста","четыреста","пятьсот","шестьсот","семьсот","восемьсот","девятьсот"]

val_fraud["contains_word_number"] = val_fraud["description"].apply(lambda x: \
                                    any(word_number in x for word_number in words_numbers))


# точные кейсы, где содержится номер телефона
val_fraud["contains_phone_number"] = val_fraud["description"]\
                                .str.contains("^((8|\+7)[\- ]?)?(\(?\d{3}\)?[\- ]?)?[\d\- ]{7,10}$") 

# developed_val_fraud = val_fraud.query("phone_number_NOT_given_with_digits == False or contains_word_number == True or \
#             contains_link == True or contains_vk == True or contains_inst == True or \
#             contains_fb == True or contains_phone_number==True").copy()

# val_data.loc[val_data.index.isin(developed_val_fraud.index), "val_fraud_developed"] = 1
# val_data.loc[~(val_data.index.isin(developed_val_fraud.index)), "val_fraud_developed"] = 0

# avito_data = pd.merge(val_data, val_fraud)

val_avito_data_prepared = val_fraud[["subcategory", "category", "price", "region", 
                                  "city", "is_bad","#_of_digits", "phone_number_NOT_given_with_digits", 
                                  "contains_link", "contains_vk", "contains_inst", "contains_fb", 
                                  "contains_phone_number", "contains_word_number"]].copy()

category = pd.get_dummies(val_avito_data_prepared["category"], prefix='category')

val_avito_data_prepared = val_avito_data_prepared.rename(columns={'category_Бытовая электроника': "category_consumer_electronics",
                                     'category_Для бизнеса': "category_for_business",
                                     'category_Для дома и дачи': "category_for_house", 
                                     'category_Животные': "category_animals", 
                                     'category_Личные вещи': "category_personal_belongings",
                                     'category_Недвижимость': "category_real_estate", 
                                     'category_Работа': "category_job", 
                                     'category_Транспорт': "category_transport",
                                     'category_Услуги': "category_services", 
                                     'category_Хобби и отдых': "category_hobby_rest"})

label_encoder_subcategory = LabelEncoder()
label_encoder_region = LabelEncoder()
label_encoder_city = LabelEncoder()
        
label_encoder_subcategory.fit(train_avito_data["subcategory"])
label_encoder_region.fit(train_avito_data["region"])
label_encoder_city.fit(train_avito_data["city"])
        
val_avito_data_prepared["subcategory"] = label_encoder_subcategory.transform(val_avito_data_prepared["subcategory"])
val_avito_data_prepared["region"] = label_encoder_region.transform(val_avito_data_prepared["region"])
val_avito_data_prepared["city"] = label_encoder_city.transform(val_avito_data_prepared["city"])
    
bool_cols = ["phone_number_NOT_given_with_digits", 
                                  "contains_link", "contains_vk", "contains_inst", "contains_fb", 
                                  "contains_phone_number", "contains_word_number"]

val_avito_data_prepared[bool_cols] = val_avito_data_prepared[bool_cols].astype(int)

val_avito_data_prepared["price"] = val_avito_data_prepared.loc[:,"price"].\
                               fillna(val_avito_data_prepared.groupby("subcategory")["price"]\
                               .transform("mean")).copy()


  return func(self, *args, **kwargs)


## model

In [53]:
avito_data_prepared = avito_data_prepared.rename(columns={"fraud_developed":"is_bad"})

In [93]:
X_train = avito_data_prepared.drop("is_bad", axis=1).copy()
y_train = avito_data_prepared.is_bad.copy()

In [108]:
X_val = val_avito_data_prepared.drop("is_bad", axis=1).copy()
y_val = val_avito_data_prepared.is_bad.copy()

In [109]:
model = CatBoostClassifier(random_state=42, 
                           iterations=100,
                           learning_rate=0.1,
                           depth=6)

In [110]:
model.fit(X_train, y_train)

0:	learn: 0.3298808	total: 124ms	remaining: 12.3s
1:	learn: 0.1513747	total: 211ms	remaining: 10.3s
2:	learn: 0.0681032	total: 315ms	remaining: 10.2s
3:	learn: 0.0334122	total: 428ms	remaining: 10.3s
4:	learn: 0.0176302	total: 521ms	remaining: 9.9s
5:	learn: 0.0098084	total: 605ms	remaining: 9.47s
6:	learn: 0.0059007	total: 696ms	remaining: 9.25s
7:	learn: 0.0037977	total: 778ms	remaining: 8.94s
8:	learn: 0.0025514	total: 859ms	remaining: 8.69s
9:	learn: 0.0018189	total: 941ms	remaining: 8.47s
10:	learn: 0.0013426	total: 1.03s	remaining: 8.34s
11:	learn: 0.0009993	total: 1.11s	remaining: 8.14s
12:	learn: 0.0007975	total: 1.19s	remaining: 7.95s
13:	learn: 0.0006269	total: 1.26s	remaining: 7.77s
14:	learn: 0.0005335	total: 1.34s	remaining: 7.62s
15:	learn: 0.0004601	total: 1.42s	remaining: 7.48s
16:	learn: 0.0004016	total: 1.52s	remaining: 7.4s
17:	learn: 0.0003481	total: 1.59s	remaining: 7.23s
18:	learn: 0.0003167	total: 1.67s	remaining: 7.11s
19:	learn: 0.0002804	total: 1.73s	remaining

<catboost.core.CatBoostClassifier at 0x126126fd0>

In [111]:
y_pred = model.predict(X_val)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=1]="Транспорт": Cannot convert 'b'\xd0\xa2\xd1\x80\xd0\xb0\xd0\xbd\xd1\x81\xd0\xbf\xd0\xbe\xd1\x80\xd1\x82'' to float

In [None]:
roc_auc_score(y_pred, y_val)