# Задача

![задание](logo.png)

Весь процесс доставки — от приема в отделении до вручения получателю — состоит из большого числа операций. Отправление запаковывают, перевозят на склад и транспортируют между сортировочными пунктами. Если доставка едет из-за границы, то дополнительно появляются операции на зарубежной и российской таможне.

Ускорить и удешевить доставку помогают крупные логистические хабы. Там мелкие грузы сортируются и отправляются в соседние регионы или собираются в новые контейнеры для отправки в другие макрорегионы. Таким образом, почтовые отправления путешествуют по сети сортировочных центров, как кровь по капиллярам, и в конце концов добираются в любые точки нашей страны.

Несмотря на высокий уровень системы безопасности, по-прежнему остается риск пропаж или порчи отправлений: перемещений и операций с посылками очень много, кроме того, в процессе может сыграть человеческий фактор.

Точное предсказание пропаж и их локализация позволит повысить надежность системы — гарантировать доставку отправлений в срок и снизить расходы на транспортировку. Предлагаем участникам чемпионата решить эту задачу — разработать модель предсказания потери почтовых отправлений.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import uniform
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE 
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegressionCV
import tensorflow as tf 
import numpy as np
from sklearn.metrics import roc_auc_score
%matplotlib inline
from sklearn.metrics import recall_score
import warnings
from sklearn import preprocessing

2022-11-19 19:42:47.084251: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-19 19:42:47.219223: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-19 19:42:47.219243: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-19 19:42:47.254709: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-19 19:42:47.888858: W tensorflow/stream_executor/platform/de

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv("train_dataset_train.csv",low_memory=False)
df

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,mailrank,directctg,transport_pay,postmark,name_mfi,weight_mfi,price_mfi,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,6818780,1043_-1,628629,Участок,7503.0,N,0.0,Y,N,87.0,5.0,1.0,0.0,2.0,0.00,0.0,"Fishhook,USB",41.0,150.0,42.0,720176.0,58950.0,779126.0,8290896.0,0,0,0,0,0
1,9907176,1023_-1,102976,ММПО,7503.0,N,0.0,N,N,107.0,5.0,1.0,0.0,2.0,0.00,0.0,"screen protector,Case(Q613B),case(208B40-DB)",68.0,400.0,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0
2,3304275,1018_-1,620962,Цех,7503.0,N,0.0,Y,N,50.0,5.0,1.0,0.0,2.0,0.00,0.0,"Pendant Necklaces,Rings for Women,Necklaces",56.0,218.0,62.0,3246292.0,3233068.0,6479360.0,52708071.0,0,1,0,0,0
3,9020937,1019_-1,344964,Цех,7503.0,N,0.0,Y,N,416.0,5.0,1.0,0.0,2.0,35.34,0.0,Motorcycle Signal Lamp,33.0,100.0,55.0,2060928.0,653280.0,2714208.0,19562334.0,0,0,0,0,0
4,3082311,1020_-1,629819,Участок,7503.0,N,0.0,Y,N,795.0,5.0,1.0,0.0,2.0,52.52,0.0,backpack,716.0,1000.0,16.0,316919.0,27911.0,344830.0,4719186.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5999995,9958614,1022_-1,102976,ММПО,7503.0,N,0.0,N,N,25.0,5.0,0.0,0.0,2.0,0.00,0.0,KEY CHAIN,24.0,100.0,1089.0,64270133.0,116432632.0,180702765.0,188407812.0,0,0,1,0,0
5999996,2234489,1022_-1,241963,Цех,7503.0,N,0.0,Y,N,83.0,5.0,1.0,0.0,2.0,0.00,0.0,Dolls,100.0,1832.0,31.0,1767370.0,144063.0,1911433.0,15582018.0,0,0,0,0,0
5999997,4304572,1041_-1,102971,Цех,7506.0,N,0.0,N,N,1700.0,5.0,1.0,0.0,2.0,94.09,0.0,Down jacket,952.0,800.0,186.0,60613352.0,10648.0,60624000.0,75592387.0,0,0,0,0,0
5999998,6550634,1018_-1,102152,Цех,7506.0,N,0.0,N,N,269.0,5.0,1.0,0.0,2.0,0.00,0.0,0,0.0,0.0,105.0,15091338.0,4972424.0,20063762.0,39988530.0,0,1,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000000 entries, 0 to 5999999
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   id                             int64  
 1   oper_type + oper_attr          object 
 2   index_oper                     object 
 3   type                           object 
 4   priority                       float64
 5   is_privatecategory             object 
 6   class                          float64
 7   is_in_yandex                   object 
 8   is_return                      object 
 9   weight                         float64
 10  mailtype                       float64
 11  mailctg                        float64
 12  mailrank                       float64
 13  directctg                      float64
 14  transport_pay                  float64
 15  postmark                       float64
 16  name_mfi                       object 
 17  weight_mfi                     float64
 18  pr

## Добавим посылкам отдельный признак регион.

In [5]:
df['area']=df['index_oper'].str[0:3]
le_area  = preprocessing.LabelEncoder()
le_area.fit(df["area"])
df["area"]=le_area.transform(df["area"])
df["area"]

0          182
1            3
2          175
3           93
4          183
          ... 
5999995      3
5999996     74
5999997      3
5999998      3
5999999     22
Name: area, Length: 6000000, dtype: int64

## Добавим регионам общий обьем посылок который через них проходит.

In [6]:
data_total_area=pd.DataFrame(df.groupby('area')['total_qty_over_index'].sum()).rename(columns = {'total_qty_over_index':'total_qty_over_index_area'})
data_total_area.to_csv('data_total_area.csv')
df = pd.merge(df,data_total_area, on="area", how='left')


## Считаем потерянные в каждом регионе посылки.

In [7]:
data_failed=pd.DataFrame(df.groupby('area')['label'].sum()).rename(columns = {'label':'sum_label'})
data_failed.to_csv('data_failed.csv')
df = pd.merge(df,data_failed, on="area", how='left')

## Процент потерянных

In [8]:
df['percent_loss'] =df['sum_label']/df['total_qty_over_index_area'] 

## Переведем остальные переменные в котигориальные.

In [9]:
le_priority  = preprocessing.LabelEncoder()
le_priority.fit(df["priority"])
df["priority"]=le_priority.transform(df["priority"])
df["priority"]

0          1
1          1
2          1
3          1
4          1
          ..
5999995    1
5999996    1
5999997    3
5999998    3
5999999    1
Name: priority, Length: 6000000, dtype: int64

In [10]:
le_is_in_yandex   = preprocessing.LabelEncoder()
le_is_in_yandex.fit(df["is_in_yandex"])
df["is_in_yandex"]=le_is_in_yandex.transform(df["is_in_yandex"])
df["is_in_yandex"]

0          2
1          1
2          2
3          2
4          2
          ..
5999995    1
5999996    2
5999997    1
5999998    1
5999999    2
Name: is_in_yandex, Length: 6000000, dtype: int64

In [11]:
le_is_return    = preprocessing.LabelEncoder()
le_is_return.fit(df["is_return"])
df["is_return"]=le_is_in_yandex.transform(df["is_return"])
df["is_return"]

0          1
1          1
2          1
3          1
4          1
          ..
5999995    1
5999996    1
5999997    1
5999998    1
5999999    1
Name: is_return, Length: 6000000, dtype: int64

In [12]:
le_oper_type= preprocessing.LabelEncoder()
le_oper_type.fit(df["oper_type + oper_attr"])
df["oper_type + oper_attr"]=le_oper_type.transform(df["oper_type + oper_attr"])
df["oper_type + oper_attr"]

0          26
1          20
2          15
3          16
4          17
           ..
5999995    19
5999996    19
5999997    24
5999998    15
5999999    99
Name: oper_type + oper_attr, Length: 6000000, dtype: int64

In [13]:
le_index_oper= preprocessing.LabelEncoder()
le_index_oper.fit(df["index_oper"])
df["index_oper"]=le_index_oper.transform(df["index_oper"])
df["index_oper"]

0          17235
1             23
2          16331
3           6325
4          17315
           ...  
5999995       23
5999996     4697
5999997       18
5999998        6
5999999      751
Name: index_oper, Length: 6000000, dtype: int64

In [14]:
le_type2= preprocessing.LabelEncoder()
le_type2.fit(df["type"])
df["type"]=le_type2.transform(df["type"])
df["type"]

0          18
1           4
2          19
3          19
4          18
           ..
5999995     4
5999996    19
5999997    19
5999998    19
5999999    13
Name: type, Length: 6000000, dtype: int64

In [15]:
le_is_privatecategory= preprocessing.LabelEncoder()
le_is_privatecategory.fit(df["is_privatecategory"])
df["is_privatecategory"]=le_is_privatecategory.transform(df["is_privatecategory"])
df["is_privatecategory"]

0          1
1          1
2          1
3          1
4          1
          ..
5999995    1
5999996    1
5999997    1
5999998    1
5999999    1
Name: is_privatecategory, Length: 6000000, dtype: int64

In [16]:
le_name_mfi= preprocessing.LabelEncoder()
le_name_mfi.fit(df["name_mfi"])
df["name_mfi"]=le_name_mfi.transform(df["name_mfi"])
df["name_mfi"]

0           46654
1          194484
2           92224
3           79671
4          141303
            ...  
5999995     63431
5999996     36730
5999997     37295
5999998       101
5999999     62590
Name: name_mfi, Length: 6000000, dtype: int64

In [17]:
df["weightkg"] = round(df["weight"]/1000,2)
df['is_wrong']=df['is_wrong_sndr_name']+df['is_wrong_rcpn_name']+df['is_wrong_phone_number']+df['is_wrong_address']
df['total_mean']=df['total_qty_over_index_and_type']/df['total_qty_over_index']

# Добавим характеристики с наибольшей вероятностью потери посылки см файл analytics.

In [18]:
name_mfi_fail=[101,     
                ]         
df['name_mfi_fail']=0
for i in name_mfi_fail:
    df[df['name_mfi'] == i]['name_mfi_fail']=1
df

mailtype_fail=[4     
                ,1     
                ,8     
                ,6       
                ]         
df['mailtype_fail']=0
for i in mailtype_fail:
    df[df['mailtype'] == i]['mailtype_fail']=1
df

oper_type_fail=[4     
                ,69     
                ,73     
                ,27       
                ,17,      
                98,      
                81,      
                26,        
                74,        
                72,         
                82,         
                71,         
                76,]         
df['oper_type_fail']=0
for i in oper_type_fail:
    df[df['oper_type + oper_attr'] == i]['oper_type_fail']=1
df

type_fail=[
4,     
3,      
13,      
19,      
18,       
1,        
11,       
10,       
12,        
0,         
7,         
14,         
6,         
16,         
8,          
]         


df['type_fail']=0
for i in type_fail:
    df[df['type'] == i]['type_fail']=1
df

priority_fail=[
1,
2,
3,
]         


df['priority_fail']=0
for i in priority_fail:
    df[df['priority'] == i]['priority_fail']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['name_mfi'] == i]['name_mfi_fail']=1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['mailtype'] == i]['mailtype_fail']=1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['oper_type + oper_attr'] == i]['oper_type_fail']=1
A value is trying to be set on a copy of a slice from a DataFr

## Выделим выборки из финального датасета!

In [19]:
df

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,mailrank,directctg,transport_pay,postmark,name_mfi,weight_mfi,price_mfi,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label,area,total_qty_over_index_area,sum_label,percent_loss,weightkg,is_wrong,total_mean,name_mfi_fail,mailtype_fail,oper_type_fail,type_fail,priority_fail
0,6818780,26,17235,18,1,1,0.0,2,1,87.0,5.0,1.0,0.0,2.0,0.00,0.0,46654,41.0,150.0,42.0,720176.0,58950.0,779126.0,8290896.0,0,0,0,0,0,182,2.748856e+11,826,3.004886e-09,0.09,0,0.093974,0,0,0,0,0
1,9907176,20,23,4,1,1,0.0,1,1,107.0,5.0,1.0,0.0,2.0,0.00,0.0,194484,68.0,400.0,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0,3,3.484228e+14,109350,3.138428e-10,0.11,0,0.966056,0,0,0,0,0
2,3304275,15,16331,19,1,1,0.0,2,1,50.0,5.0,1.0,0.0,2.0,0.00,0.0,92224,56.0,218.0,62.0,3246292.0,3233068.0,6479360.0,52708071.0,0,1,0,0,0,175,3.939877e+12,741,1.880769e-10,0.05,1,0.122929,0,0,0,0,0
3,9020937,16,6325,19,1,1,0.0,2,1,416.0,5.0,1.0,0.0,2.0,35.34,0.0,79671,33.0,100.0,55.0,2060928.0,653280.0,2714208.0,19562334.0,0,0,0,0,0,93,6.073324e+11,345,5.680579e-10,0.42,0,0.138747,0,0,0,0,0
4,3082311,17,17315,18,1,1,0.0,2,1,795.0,5.0,1.0,0.0,2.0,52.52,0.0,141303,716.0,1000.0,16.0,316919.0,27911.0,344830.0,4719186.0,0,0,0,0,0,183,4.110716e+10,253,6.154645e-09,0.80,0,0.073070,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5999995,9958614,19,23,4,1,1,0.0,1,1,25.0,5.0,0.0,0.0,2.0,0.00,0.0,63431,24.0,100.0,1089.0,64270133.0,116432632.0,180702765.0,188407812.0,0,0,1,0,0,3,3.484228e+14,109350,3.138428e-10,0.02,1,0.959104,0,0,0,0,0
5999996,2234489,19,4697,19,1,1,0.0,2,1,83.0,5.0,1.0,0.0,2.0,0.00,0.0,36730,100.0,1832.0,31.0,1767370.0,144063.0,1911433.0,15582018.0,0,0,0,0,0,74,2.701693e+11,144,5.329992e-10,0.08,0,0.122669,0,0,0,0,0
5999997,4304572,24,18,19,3,1,0.0,1,1,1700.0,5.0,1.0,0.0,2.0,94.09,0.0,37295,952.0,800.0,186.0,60613352.0,10648.0,60624000.0,75592387.0,0,0,0,0,0,3,3.484228e+14,109350,3.138428e-10,1.70,0,0.801986,0,0,0,0,0
5999998,6550634,15,6,19,3,1,0.0,1,1,269.0,5.0,1.0,0.0,2.0,0.00,0.0,101,0.0,0.0,105.0,15091338.0,4972424.0,20063762.0,39988530.0,0,1,0,0,0,3,3.484228e+14,109350,3.138428e-10,0.27,1,0.501738,0,0,0,0,0


In [20]:
df.isna().sum()

id                                  0
oper_type + oper_attr               0
index_oper                          0
type                                0
priority                            0
is_privatecategory                  0
class                               0
is_in_yandex                        0
is_return                           0
weight                              0
mailtype                            0
mailctg                             0
mailrank                            0
directctg                           0
transport_pay                       0
postmark                            0
name_mfi                            0
weight_mfi                          0
price_mfi                           0
dist_qty_oper_login_1               0
total_qty_oper_login_1              0
total_qty_oper_login_0              0
total_qty_over_index_and_type       0
total_qty_over_index                0
is_wrong_sndr_name                  0
is_wrong_rcpn_name                  0
is_wrong_pho

In [21]:
df = df.fillna(0)

In [22]:
#df.to_csv('train_frame.csv', index=False)

Обьединим список не нужных строк с списком строк типа object

In [23]:
col_obj = df.select_dtypes(include=['object']).columns.values
col_obj = list(set(col_obj) ^ set(["id", "label",'mailrank']))

In [24]:
X = df.drop(col_obj, axis = 1)
y = df["label"]

X=X[['type', 'priority', 'class', 'dist_qty_oper_login_1',
       'total_qty_oper_login_0', 'total_qty_over_index_and_type',
       'is_wrong_phone_number', 'is_wrong','area','total_qty_over_index_area','name_mfi_fail']]

Xst = df.drop(col_obj, axis = 1)
yst = df["label"]

In [25]:
X

Unnamed: 0,type,priority,class,dist_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,is_wrong_phone_number,is_wrong,area,total_qty_over_index_area,name_mfi_fail
0,18,1,0.0,42.0,58950.0,779126.0,0,0,182,2.748856e+11,0
1,4,1,0.0,914.0,83318932.0,132175590.0,0,0,3,3.484228e+14,0
2,19,1,0.0,62.0,3233068.0,6479360.0,0,1,175,3.939877e+12,0
3,19,1,0.0,55.0,653280.0,2714208.0,0,0,93,6.073324e+11,0
4,18,1,0.0,16.0,27911.0,344830.0,0,0,183,4.110716e+10,0
...,...,...,...,...,...,...,...,...,...,...,...
5999995,4,1,0.0,1089.0,116432632.0,180702765.0,1,1,3,3.484228e+14,0
5999996,19,1,0.0,31.0,144063.0,1911433.0,0,0,74,2.701693e+11,0
5999997,19,3,0.0,186.0,10648.0,60624000.0,0,0,3,3.484228e+14,0
5999998,19,3,0.0,105.0,4972424.0,20063762.0,0,1,3,3.484228e+14,0


# Отбор признаков для обучения!

In [None]:
scaler=MinMaxScaler()
k=11 # Количество признаков для обучения! 11
trans=SelectKBest(chi2, k=k)
X = scaler.fit_transform(X)
X =trans.fit_transform(X, y)
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)

In [None]:
X_train.shape

## Обучение модели

In [None]:
pipe = LogisticRegression(random_state=1)

In [None]:
pipe.fit(X_train, y_train)

## Оценка точности

In [None]:
pred =pipe.predict(X_test)

In [None]:
from sklearn.metrics import recall_score

score = recall_score(y_test, pred, average = "macro" )
print("Recall", score)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  pipe.predict_proba(X_test)[:, 1])

In [None]:
print(classification_report(y_test, pred))

In [None]:
print('Требуемая величина', 0.1*score+0.9*roc_auc_score(y_test,  pipe.predict_proba(X_test)[:, 1]))

In [None]:
from keras import models
from keras import layers
import tensorflow as tf 
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(25,activation='sigmoid',input_shape=(k,)))
    model.add(layers.Dense(10,activation='relu'))
    model.add(layers.Dense(1,activation='sigmoid'))
    model.compile(optimizer='rmsprop',# Вы также можете указать параметры для оптимизатора через optimizer = optimizer.RMSprop (lr = 0.001)
                  loss='binary_crossentropy', # Эквивалент потерь = loss.binary_crossentropy
                  metrics=[tf.keras.metrics.Recall()])
    return model
model = build_model()


history = model.fit(X_train, y_train,
                    epochs=2, # Итерировать 3 раз по полному набору данных
                    batch_size=64, # Размер каждой партии 64
                    validation_data=(X_test,y_test))

In [None]:
predictions = model.predict(X_test)
threshold=0.5
pre=np.where(predictions > threshold, 1,0)
print(classification_report(y_test, pre))

# Предсказание 

In [None]:
pred_data=pd.read_csv('test_dataset_test.csv',low_memory=False)


In [None]:
df=pred_data

In [None]:
df['area']=df['index_oper'].str[0:3]
df["area"]=le_area.transform(df["area"])
df = pd.merge(df,data_total_area, on="area", how='left')
df = pd.merge(df,data_failed, on="area", how='left')
df['percent_loss']=df['sum_label']/df['total_qty_over_index_area'] 
df["priority"]=le_priority.fit_transform(df["priority"])
df["is_in_yandex"]=le_is_in_yandex.fit_transform(df["is_in_yandex"])
df["is_return"]=le_is_in_yandex.fit_transform(df["is_return"])
df["oper_type + oper_attr"]=le_oper_type.fit_transform(df["oper_type + oper_attr"])
df["index_oper"]=le_index_oper.fit_transform(df["index_oper"])
df["type"]=le_type2.fit_transform(df["type"])
df["is_privatecategory"]=le_is_privatecategory.fit_transform(df["is_privatecategory"])
df["name_mfi"]=le_name_mfi.fit_transform(df["name_mfi"])
df["weightkg"] = round(df["weight"]/1000,2)
df['is_wrong']=df['is_wrong_sndr_name']+df['is_wrong_rcpn_name']+df['is_wrong_phone_number']+df['is_wrong_address']
df['total_mean']=df['total_qty_over_index_and_type']/df['total_qty_over_index']

In [None]:
df['name_mfi_fail']=0
for i in name_mfi_fail:
    df[df['name_mfi'] == i]['name_mfi_fail']=1
    
df['mailtype_fail']=0
for i in mailtype_fail:
    df[df['mailtype'] == i]['mailtype_fail']=1
    
df['oper_type_fail']=0
for i in oper_type_fail:
    df[df['oper_type + oper_attr'] == i]['oper_type_fail']=1
    
df['type_fail']=0
for i in type_fail:
    df[df['type'] == i]['type_fail']=1

df['priority_fail']=0
for i in priority_fail:
    df[df['priority'] == i]['priority_fail']=1

In [None]:
df = df.fillna(0)


In [None]:
df


In [None]:
df.info()

In [None]:
df=df.drop(['id','mailrank'], axis = 1)
df=df[['type', 'priority', 'class', 'dist_qty_oper_login_1',
       'total_qty_oper_login_0', 'total_qty_over_index_and_type',
       'is_wrong_phone_number', 'is_wrong','area','total_qty_over_index_area','name_mfi_fail']]

In [None]:
df = scaler.transform(df)
df =trans.transform(df)

# Предсказание logreg

In [None]:
pred_data['label']=pipe.predict(df)

In [None]:
pred_data[['id','label']].to_csv('bestmodel.csv', index=False)

### 

# Предсказание nero

In [None]:
predictions = model.predict(df)
threshold=0.5
pre=np.where(predictions > threshold, 1,0)
pred_data['label']=pre
pred_data[['id','label']].to_csv('bestmodelnero.csv', index=False)

In [None]:
pred_data[['id','label']]