In [1]:
from pathlib import Path
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder
import pickle
import bz2

In [2]:
data_path = Path('./data/').resolve()
!ls {data_path}

09_prepared.pkl.bz2 11_prepared.pkl.bz2 doc_lbe.pkl         product_groups.txt
10_prepared.pkl.bz2 card_lbe.pkl        product_groups.docx


In [3]:
def _load_prepared(file_name):
    """Unpickle one bz2-compressed object stored under ``data_path``."""
    with bz2.open(data_path / file_name, 'rb') as f:
        return pickle.load(f)


# One prepared object per file; the 09/10/11 file names are used exactly as they appear on disk.
df1 = _load_prepared('09_prepared.pkl.bz2')
df2 = _load_prepared('10_prepared.pkl.bz2')
df3 = _load_prepared('11_prepared.pkl.bz2')

In [4]:
# Stack the three loaded frames row-wise into one working frame.
# ignore_index is not passed, so every frame keeps its own index labels.
monthly_frames = [df1, df2, df3]
df = pd.concat(monthly_frames)

In [8]:
# Preview the first five rows (5 is the default row count of head()).
df.head(n=5)

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271,1.0,108.0,False,1538855,1283228
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271,1.0,108.0,False,267307,8873113
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271,1.0,107.95,False,610220,12712899
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271,1.0,108.0,False,441497,21535283
4,2020-09-01 18:42:31,0706023F-6BEC-EA11-B444-005056A7539A,0,5732396,61,97,2.0,88.0,False,1065358,642341


In [9]:
# Drop the raw identifier columns (their integer encodings id_card_int / id_doc_int stay)
# and the is_green flag.
# Rebinding `df` instead of using inplace=True, together with errors='ignore', makes the
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(columns=['id_card', 'id_doc', 'is_green'], errors='ignore')

In [11]:
# Preview the first five rows again to confirm which columns remain (5 is head()'s default).
df.head(n=5)

Unnamed: 0,date,id_order,id_tov,id_kontr,quantity,sum,id_card_int,id_doc_int
0,2020-09-01 17:01:25,0,52,271,1.0,108.0,1538855,1283228
1,2020-09-01 17:35:36,0,52,271,1.0,108.0,267307,8873113
2,2020-09-01 19:19:21,0,52,271,1.0,107.95,610220,12712899
3,2020-09-01 21:53:34,0,52,271,1.0,108.0,441497,21535283
4,2020-09-01 18:42:31,0,61,97,2.0,88.0,1065358,642341


In [12]:
# Product catalogue: a semicolon-separated text file read into a frame.
products_file = f'{data_path}/product_groups.txt'
products = pd.read_csv(products_file, sep=';')
# Show the first rows to check the parsed columns.
products.head()

Unnamed: 0,id_tov,id_group,name_group
0,52,10004,Кисломолочные продукты
1,75,10004,Кисломолочные продукты
2,77,10004,Кисломолочные продукты
3,143,10004,Кисломолочные продукты
4,151,10004,Кисломолочные продукты


In [4]:
# Load the card and document label tables.
# The files are opened through context managers so both handles are closed right after
# unpickling; previously they were opened with a bare open() and never closed.
# NOTE: pickle.load executes arbitrary code from the file, so only load trusted files.
with open(f'{data_path}/card_lbe.pkl', 'rb') as pkl_file:
    card_lbe = pickle.load(pkl_file)
with open(f'{data_path}/doc_lbe.pkl', 'rb') as pkl_file:
    doc_lbe = pickle.load(pkl_file)

In [5]:
def dict_to_df(df, key_name, value_name):
    """Build a plain dict from two columns of ``df``.

    Despite its name, this function converts a frame *into* a dict, not the reverse.

    Parameters
    ----------
    df : table-like object indexable by column name
    key_name : column whose values become the dict keys
    value_name : column whose values become the dict values

    Returns
    -------
    dict
        Values of ``key_name`` mapped to the values of ``value_name`` in the same row.
        When a key occurs more than once, the last row wins.
    """
    paired_columns = zip(df[key_name], df[value_name])
    return dict(paired_columns)

In [7]:
# Lookup dictionaries: raw identifier -> integer label, one for cards and one for documents.
card_lbe_dict = dict_to_df(card_lbe, key_name='id_card', value_name='id_card_int')
doc_lbe_dict = dict_to_df(doc_lbe, key_name='id_doc', value_name='id_doc_int')

In [None]:
# Create placeholder label columns only when they are missing.
# The unconditional `df[...] = 0` used before would wipe any id_card_int / id_doc_int
# values the frame already carries when the notebook is run top to bottom.
for label_column in ('id_card_int', 'id_doc_int'):
    if label_column not in df.columns:
        df[label_column] = 0

In [23]:
def get_label(row, mapping=None, key='id_doc'):
    """Return the integer label for one row, or -1 when the identifier is unknown.

    Parameters
    ----------
    row : object exposing the identifier as an attribute (for example a DataFrame row
        passed by ``df.apply(..., axis=1)``)
    mapping : dict, optional
        Lookup dictionary from raw identifier to integer label. When omitted, the
        module-level ``doc_lbe_dict`` is used, which is the original behaviour.
    key : str, default 'id_doc'
        Name of the row attribute that holds the raw identifier.

    Returns
    -------
    The label stored in ``mapping`` for ``row.<key>``, or -1 if it is absent.
    """
    if mapping is None:
        # Resolved at call time so the default follows the current doc_lbe_dict.
        mapping = doc_lbe_dict
    return mapping.get(getattr(row, key), -1)

In [16]:
# Encode the raw card identifiers with the card dictionary; unknown cards get -1.
# The previous version reused get_label, which looks up row.id_doc in doc_lbe_dict,
# so the card column was filled with document labels.
# Requires the raw 'id_card' column to still be present in df.
df['id_card_int'] = df['id_card'].map(lambda card: card_lbe_dict.get(card, -1))

In [24]:
# Encode the raw document identifiers with the document dictionary; unknown ids get -1.
# A column-wise map yields the same labels as the row-wise df.apply(get_label, axis=1)
# without building a Series object for every row.
# Requires the raw 'id_doc' column to still be present in df.
df['id_doc_int'] = df['id_doc'].map(lambda doc: doc_lbe_dict.get(doc, -1))

### Remove all transactions between shops
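#### In our case, it is not trivial to identify all the transactions between shops. We know for sure that their sum is 0, but a zero sum could also be a discount or a promotional item. So I decided to additionally apply a threshold of 5 on the quantity, chosen as roughly mean + std (note: on the full data, `describe()` below reports mean ≈ 1.4 and std ≈ 30, so 5 is well below mean + std there).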

In [13]:
# Summary statistics of the quantity column over all rows.
df.loc[:, 'quantity'].describe()

count    1.347206e+08
mean     1.400151e+00
std      2.997064e+01
min     -2.755000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      5.000000e+03
Name: quantity, dtype: float64

In [14]:
# Candidate between-shop rows: zero sum, quantity above 5, and id_order equal to 0.
is_between_shops = df['sum'].eq(0) & df['quantity'].gt(5) & df['id_order'].eq(0)
between_shop_transactions = df[is_between_shops]

In [15]:
# Display the candidate between-shop rows selected in the previous cell.
between_shop_transactions

Unnamed: 0,date,id_order,id_tov,id_kontr,quantity,sum,id_card_int,id_doc_int
27520,2020-09-01 13:32:18,0,1113,-9999,5000.0,0.0,806096,21763622
37567,2020-09-01 15:53:48,0,1113,-9999,2000.0,0.0,1438102,4834001
43897,2020-09-01 12:51:22,0,1113,-9999,5000.0,0.0,72344,21320546
57565,2020-09-01 21:20:20,0,1113,-9999,5000.0,0.0,427981,14419056
69522,2020-09-01 11:14:28,0,1113,-9999,4000.0,0.0,1528414,8674916
...,...,...,...,...,...,...,...,...
46462107,2020-11-30 23:06:09,0,1113,-9999,5000.0,0.0,516678,22372786
46465112,2020-11-30 13:28:49,0,1113,-9999,1000.0,0.0,1147227,3186500
46470920,2020-11-30 08:54:36,0,1113,-9999,1500.0,0.0,1176200,2799846
46470921,2020-11-30 20:08:09,0,1113,-9999,5000.0,0.0,745459,3133830


In [16]:
# Distinct card labels that occur in at least one candidate between-shop row.
between_shop_transactions_id_card_int = between_shop_transactions['id_card_int'].unique()
# Number of such cards.
len(between_shop_transactions_id_card_int)

6541

### Remove all transactions with sum < 0 or quantity < 0
#### I think that transactions with a negative sum or quantity may also count as outliers

In [17]:
# Rows where the sum or the quantity is negative (either condition is enough).
has_negative_value = df['sum'].lt(0) | df['quantity'].lt(0)
negative_sum_quantity_transactions = df[has_negative_value]
negative_sum_quantity_transactions

Unnamed: 0,date,id_order,id_tov,id_kontr,quantity,sum,id_card_int,id_doc_int
4206092,2020-09-04 09:10:21,0,23074,19269,1.268,-470.43,1301664,11939194
4206093,2020-09-04 09:10:21,0,23074,19269,-1.268,470.43,1301664,11939194
4206102,2020-09-04 09:09:24,0,23074,19269,1.268,-470.43,1419650,22549976
4206103,2020-09-04 09:09:24,0,23074,19269,-1.268,470.43,1419650,22549976
4210123,2020-09-04 08:37:00,0,23074,19269,-1.084,402.16,1035577,6745380
...,...,...,...,...,...,...,...,...
5450157,2020-09-04 09:21:17,0,23074,19269,0.915,-339.47,574393,18763274
5450158,2020-09-04 09:21:17,0,23074,19269,-0.915,339.47,574393,18763274
5450159,2020-09-04 09:21:17,0,23074,19269,0.915,-339.47,574393,18763274
5450160,2020-09-04 09:21:17,0,23074,19269,-0.915,339.47,574393,18763274


In [19]:
# Distinct card labels that have at least one row with a negative sum or quantity.
negative_sum_quantity_transactions_id_card_int = negative_sum_quantity_transactions['id_card_int'].unique()
# Number of such cards.
len(negative_sum_quantity_transactions_id_card_int)

58

In [49]:
# Look up the product-group entry for product id 647.
products[products['id_tov'].eq(647)]

Unnamed: 0,id_tov,id_group,name_group
3450,647,10155,Упаковка


### Check Упаковка (packaging)
#### Nothing suspicious, I think

In [21]:
# Quantity statistics restricted to product id 647; rows and column are selected in one .loc call.
df.loc[df['id_tov'] == 647, 'quantity'].describe()

count    7.768718e+06
mean     1.176063e+00
std      5.916345e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      5.030000e+02
Name: quantity, dtype: float64

### Check the number of purchases per customer. For that we calculate the number of checks (documents) for each customer over the 3 months

In [24]:
# Keep only the card and document labels, then group the rows by card.
# Note: this name is rebound further down to the per-card sum table.
card_doc_pairs = df[['id_card_int', 'id_doc_int']]
grouped_by_id_card_int = card_doc_pairs.groupby(by=['id_card_int'])

In [35]:
# Count the distinct document labels within each card group.
docs_by_card = grouped_by_id_card_int['id_doc_int']
number_of_docs_per_customer = docs_by_card.nunique()

In [38]:
# Summary statistics of the per-card distinct-document counts.
number_of_docs_per_customer.describe()

count    1.944075e+06
mean     1.204576e+01
std      2.275525e+02
min      1.000000e+00
25%      2.000000e+00
50%      5.000000e+00
75%      1.400000e+01
max      3.162770e+05
Name: id_doc_int, dtype: float64

In [45]:
# Show the per-card document counts from largest to smallest to spot extreme cards.
number_of_docs_per_customer.sort_values(ascending=False)

id_card_int
883386     316277
1387578       454
914348        439
317600        414
531034        411
            ...  
1072206         1
1605742         1
1605741         1
101808          1
1681231         1
Name: id_doc_int, Length: 1944075, dtype: int64

In [47]:
# Card label at the top of the sorted per-card document counts shown above,
# far ahead of every other card; it is flagged by hand.
OUTLIER_CARD_ID = 883386
too_much_buys = {OUTLIER_CARD_ID}

### Check total sum per customer

In [50]:
# Total of the sum column per card. This rebinds grouped_by_id_card_int, which held a
# GroupBy object earlier, to the aggregated frame.
card_sum_pairs = df[['id_card_int', 'sum']]
grouped_by_id_card_int = card_sum_pairs.groupby(by=['id_card_int']).sum()

In [53]:
# Per-card totals from largest to smallest.
grouped_by_id_card_int.sort_values('sum', ascending=False)

Unnamed: 0_level_0,sum
id_card_int,Unnamed: 1_level_1
883386,60016231.51
1577248,1279932.55
568930,379927.61
1028081,292730.00
497966,284392.33
...,...
849914,0.00
1801523,0.00
1380493,0.00
1850061,0.00


In [54]:
# Summary statistics of the per-card totals.
# describe() does not depend on row order, so the sort_values() call that used to
# precede it was redundant work and has been removed.
grouped_by_id_card_int.describe()

Unnamed: 0,sum
count,1944075.0
mean,8075.36
std,45281.23
min,0.0
25%,728.94
50%,2495.56
75%,8917.0
max,60016230.0


### Dump


In [58]:
# Union of every suspected card label: between-shop candidates, the hand-picked
# heavy buyer, and cards with a negative sum or quantity.
between_shop_ids = set(between_shop_transactions_id_card_int)
negative_value_ids = set(negative_sum_quantity_transactions_id_card_int)
result = between_shop_ids | too_much_buys | negative_value_ids
# Number of distinct suspected cards.
len(result)

6599

In [60]:
# Materialise the suspected card labels as a single-column frame (one row per label).
suspected_ids = list(result)
result_df = pd.DataFrame(suspected_ids)

In [66]:
# Persist the suspected-id frame as a bz2-compressed pickle (protocol 4) in the data directory.
suspected_ids_path = data_path / 'suspected_ids.pkl.bz2'
with bz2.open(suspected_ids_path, 'wb') as out_file:
    pickle.dump(result_df, out_file, protocol=4)