# Homework 1

I decided to use already preprocessed pickle data because of hardware limitations (mainly RAM).

In [1]:
import pickle
import pandas as pd
import bz2
import os

In [2]:
folder_path = './data/'
data_path = os.path.join(folder_path, '09_prepared.pkl.bz2')
card_path = os.path.join(folder_path, 'card_lbe.pkl.bz2')
doc_path = os.path.join(folder_path, 'doc_lbe.pkl.bz2')
processed_file_path = os.path.join(folder_path, 'processed.pkl.bz2')

In [3]:
def load_pickle(file_path):
    data = None
    with bz2.open(file_path, 'rb') as fd:
        data = pickle.load(fd)
    return data

def save_pickle(file_path, pickle_data):
    with bz2.open(file_path, 'wb') as fd:
        pickle.dump(pickle_data, fd, protocol=4)

In [4]:
data = load_pickle(data_path)

In [5]:
data.head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271,1.0,108.0,False,1538855,1283228
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271,1.0,108.0,False,267307,8873113
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271,1.0,107.95,False,610220,12712899
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271,1.0,108.0,False,441497,21535283
4,2020-09-01 18:42:31,0706023F-6BEC-EA11-B444-005056A7539A,0,5732396,61,97,2.0,88.0,False,1065358,642341


## Analysis

First thing that comes to mind is to check the total cost of the orders generated. Maybe there are some orders which are very small or very big. If we find such cases we can further explore them.

In [9]:
orders_stat = data.groupby(['id_order'])['sum'].sum()

In [13]:
orders_stat.head()

id_order
0         4.158384e+09
98066     3.594000e+02
98071     6.700000e+02
171075    1.042460e+03
191931    3.480000e+02
Name: sum, dtype: float64

In [14]:
orders_stat.describe()

count    5.035050e+05
mean     9.608367e+03
std      5.860336e+06
min      0.000000e+00
25%      5.914000e+02
50%      1.040980e+03
75%      1.753680e+03
max      4.158384e+09
Name: sum, dtype: float64

So, if we look at the statistics, the mean is somewhere around 9608. Now, look at the examples from previous cell. It shows that orders with id 0 have total sum of more than billion. So, for further analysis of the proposed hypothesis, I decided to remove them.

In [6]:
data = data[data['id_order'] != 0]

In [None]:
save_pickle(processed_file_path, data)

### Further anylsis
Now after removing the orders with id 0, the dataset became very light for processing, which is very good in my case especially. Now, let's do the same thing and check the total costs of the orders

In [4]:
data = load_pickle(processed_file_path)

In [5]:
print(data.shape)

(5385728, 11)


In [6]:
orders_sum = data.groupby(['id_order'])['sum'].sum()

In [7]:
orders_sum.head()

id_order
98066      359.40
98071      670.00
171075    1042.46
191931     348.00
197959    2847.70
Name: sum, dtype: float64

In [9]:
orders_sum.describe()

count    503504.000000
mean       1349.496831
std        1128.310615
min           0.000000
25%         591.400000
50%        1040.980000
75%        1753.672500
max       48005.000000
Name: sum, dtype: float64

This time we can see that the mean became around 1349. If we look at the statistics we can notice two things: 
    1. The minimum total cost is 0. How? The guy just didn't buy anything, but somehow order is generated?
    2. The maximum total cost is way bigger than the mean.
So, I decided to explore these two phenomenon.

#### Orders with total cost equal to 0
First let's focus on orders with total cost 0. For that we can find all of the orders of this kind. Let's see how many of them

In [36]:
orders_sum = pd.DataFrame(orders_sum)
zero_sum_orders = orders_sum[orders_sum['sum'] == 0]
zero_sum_orders

Unnamed: 0_level_0,sum
id_order,Unnamed: 1_level_1
3938925,0.0
4666710,0.0
5839509,0.0
5858774,0.0
5859138,0.0
6047958,0.0
6049447,0.0
6049846,0.0
6050177,0.0
6135448,0.0


So, there are not many orders with total cost 0. This means we can safely assume that this is an anomaly. Something that shouldn't have happened. Let's see what they bought

In [16]:
data[data['id_order'] == 3938925].head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
1043004,2020-09-01 10:45:45,40B0BD5D-27EC-EA11-B444-005056A7539A,3938925,A361864,27128,-9999,1.0,0.0,False,1732482,5915507


In [69]:
data[data['id_order'] == 4666710] .head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
189551,2020-09-01 07:24:04,3C42E819-0BEC-EA11-B444-005056A7539A,4666710,1679825,27126,-9999,1.0,0.0,False,307668,5510172
2980649,2020-09-03 08:01:03,26944FC3-A2ED-EA11-B444-005056A7539A,4666710,1679825,27126,-9999,1.0,0.0,False,307668,3528740
5753171,2020-09-05 08:52:35,D1BC3526-3CEF-EA11-B444-005056A7539A,4666710,1679825,27126,-9999,1.0,0.0,False,307668,19187156
8244458,2020-09-07 07:28:27,D94543CB-C2F0-EA11-B444-005056A7539A,4666710,1679825,27126,-9999,1.0,0.0,False,307668,19875656
11073203,2020-09-09 08:09:33,BBFC20F2-5AF2-EA11-B444-005056A7539A,4666710,1679825,27126,-9999,1.0,0.0,False,307668,17196053


I couldn't even find the above products on the list. So, I decided to mark these customers as frauds

In [37]:
fraudulent_order_ids = zero_sum_orders.index.values.tolist()
print(fraudulent_order_ids)

[3938925, 4666710, 5839509, 5858774, 5859138, 6047958, 6049447, 6049846, 6050177, 6135448, 6240034, 6277825, 6280102, 6487278, 6539109, 6539725]


In [47]:
frauds = []

In [48]:
for x in fraudulent_order_ids:
    frauds.extend(list(set(data[data['id_order'] == x]['id_card_int'].tolist())))
print(frauds)

[1732482, 307668, 1927451, 684987, 1297595, 1929250, 703172, 1404427, 1404431, 1866912, 1104628, 146102, 1404428, 1011306, 1404428, 1404431]


### Orders with big total cost
Let's now focus on orders with big total cost. The mean of the total costs was around 1349. So, I decided to treat orders with total cost more than 10000 as big orders. Let's find all those orders.

In [51]:
big_sum_orders = orders_sum[orders_sum['sum'] > 10000]
print(f'Number of orders with sum over 10000: {big_sum_orders.shape[0]}')
big_sum_orders.head()

Number of orders with sum over 10000: 283


Unnamed: 0_level_0,sum
id_order,Unnamed: 1_level_1
1081817,14369.98
1552779,10337.0
2061685,10985.14
3412255,13346.27
4213775,10570.22


In [52]:
big_sum_orders.describe()

Unnamed: 0,sum
count,283.0
mean,12403.111731
std,3245.051325
min,10000.24
25%,10637.74
50%,11473.65
75%,13105.725
max,48005.0


So, as expected there are not many big orders. This itself suggests an anomaly, but I decided that it's not enough to mark them as frauds. So, instead decided to check order with total cost more than 15000.

In [57]:
big_sum_orders_15000 = orders_sum[orders_sum['sum'] > 15000]
print(f'Number of orders with sum over 15000: {big_sum_orders.shape[0]}')
big_sum_orders_15000

Number of orders with sum over 15000: 34


Unnamed: 0_level_0,sum
id_order,Unnamed: 1_level_1
4455150,16577.39
5363551,15359.03
5414142,22440.0
5542850,16100.0
5571374,16943.43
5583042,17293.17
5610480,15194.01
5656434,21437.04
5687691,16770.78
5699038,16100.0


In [56]:
big_sum_orders_15000.describe()

Unnamed: 0,sum
count,34.0
mean,18523.898235
std,5867.675729
min,15169.5
25%,15770.7325
50%,16764.28
75%,18834.2475
max,48005.0


Only 34 orders have more than 15000 total cost. But the mean from statistics about orders with total cost more than 10000 and these statistics suggests that it's difficult to just find outliers. Let's see what these customers have ordered

In [58]:
data[data['id_order'] == 5414142].head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
597217,2020-09-01 13:13:01,D560F504-3DEC-EA11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,19519892
597218,2020-09-01 13:13:30,DA60F504-3DEC-EA11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,19977141
3391758,2020-09-03 15:54:51,9449F272-E7ED-EA11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,13566207
3391759,2020-09-03 15:55:11,A449F272-E7ED-EA11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,15030184
7310327,2020-09-06 14:43:35,C2D45FEE-36F0-EA11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,17821499


In [70]:
data[data['id_order'] == 6518049].head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
38058645,2020-09-28 08:36:51,9B6A36BF-4C01-EB11-B444-005056A7539A,6518049,A634010,647,13567,1.0,5.0,False,1755035,14218897
38060133,2020-09-28 08:36:51,9B6A36BF-4C01-EB11-B444-005056A7539A,6518049,A634010,21379,15189,4.0,9600.0,False,1755035,14218897
38060134,2020-09-28 08:36:51,9B6A36BF-4C01-EB11-B444-005056A7539A,6518049,A634010,21379,18018,16.0,38400.0,False,1755035,14218897


Nothing interesting, except once again there's a problem with product with id 27128. If we recall we saw the same product when we were investigating the order with total cost 0. Let's see the history of this product.

In [71]:
faulty_tovar = data[data['id_tov'] == 27128]
print(f'number of orders for tovar with id 27128: {faulty_tovar.shape[0]}')
faulty_tovar

number of orders for tovar with id 27128: 124


Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
415974,2020-09-01 15:40:57,22807D94-51EC-EA11-B444-005056A7539A,5458715,4367576,27128,-9999,1.0,1200.0,False,790313,3156064
415975,2020-09-01 15:39:35,F5CAF1FE-50EC-EA11-B444-005056A7539A,5458715,4367576,27128,-9999,1.0,1200.0,False,790313,22485197
463223,2020-09-01 19:34:40,FECAC23A-71EC-EA11-B444-005056A7539A,4870963,6953797,27128,-9999,1.0,1000.0,False,1291614,23307423
597217,2020-09-01 13:13:01,D560F504-3DEC-EA11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,19519892
597218,2020-09-01 13:13:30,DA60F504-3DEC-EA11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,19977141
...,...,...,...,...,...,...,...,...,...,...,...
38770472,2020-09-28 11:01:01,BFCC7EE9-6001-EB11-B444-005056A7539A,6357143,A806273,27128,-9999,1.0,1150.0,False,1771721,17544947
39679704,2020-09-29 18:28:23,B3F58E6A-6902-EB11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,16461919
39679705,2020-09-29 18:28:45,B5F58E6A-6902-EB11-B444-005056A7539A,5414142,4879050,27128,-9999,1.0,935.0,False,897626,16644788
41072662,2020-09-30 21:30:30,7F1B92B7-4B03-EB11-B444-005056A7539A,5950641,6953797,27128,-9999,1.0,1100.0,False,1291614,11630323


So it looks like an expensive product. But somehow before it generated 0 total cost. Let's see what happend if we just remove this product altogether from the dataset

In [61]:
orders_without_faulty_tovar = data[data['id_tov'] != 27128]

In [62]:
orders_without_faulty_tovar.shape[0]
new_order_sum = orders_without_faulty_tovar.groupby(['id_order'])['sum'].sum()

In [63]:
new_order_sum.shape[0]
new_order_sum.head()

id_order
98066      359.40
98071      670.00
171075    1042.46
191931     348.00
197959    2847.70
Name: sum, dtype: float64

In [64]:
new_order_sum.describe()

count    503482.000000
mean       1349.283812
std        1127.269415
min           0.000000
25%         591.382500
50%        1040.880000
75%        1753.510000
max       48005.000000
Name: sum, dtype: float64

In [68]:
orders_without_faulty_tovar[orders_without_faulty_tovar['sum'] == 0].shape[0]

12719

Yeah, nothing changed in particular. So, it was a bad idea.

## Quantity analysis
The next idea is to try to find anomalies in terms of quantities of the products ordered. Maybe some products are ordered way too much than normally ordered.

In [72]:
qdata = data.groupby(['id_order'])['quantity'].sum()

In [73]:
print(f'Number of orders: {qdata.shape[0]}')
qdata

Number of orders: 503504


id_order
98066       3.425
98071       8.000
171075      9.234
191931      6.000
197959     28.939
            ...  
6672789     5.000
6672802     6.000
6672828     5.000
6672847    12.000
6673056    11.000
Name: quantity, Length: 503504, dtype: float64

In [74]:
qdata.describe()

count    503504.000000
mean         13.804329
std          10.669261
min           0.100000
25%           6.562000
50%          11.000000
75%          17.997000
max         260.872000
Name: quantity, dtype: float64

So, the mean is 13, but the max is more than 260. Let's see what is going on

In [79]:
data[data['quantity'] > 50]

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
21440462,2020-09-16 10:20:02,6E91C47F-EDF7-EA11-B444-005056A7539A,6054725,1182272,647,13567,55.0,275.0,False,219195,10115145
33114325,2020-09-24 20:26:59,F1BC3D7B-8BFE-EA11-B444-005056A7539A,6424640,5665874,647,-9999,55.0,275.0,False,1054598,22113347
34809950,2020-09-25 17:47:50,ECF2EB1B-3FFF-EA11-B444-005056A7539A,6454232,3242846,15914,15677,55.0,495.0,False,566194,21674862


So if we pull up the product ordered in more than 50 quantities we see some products but the total cost is not much. Also, judging by the max value of the quantities, the column does not represent 'quantity'. It may also represent the weight (weight of vegetables for example). So, this means it's really difficult to judge the order based on the quantitiy. But let's see what are some orders with very small quantity

In [82]:
data[data['quantity'] < 0.1]

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
44965,2020-09-01 18:17:46,219E6982-66EC-EA11-B444-005056A7539A,5585636,a871075,17456,16860,0.096,22.56,False,1919552,3075089
76223,2020-09-01 19:48:39,FB34A539-74EC-EA11-B444-005056A7539A,5588920,0586045,15738,19687,0.000,0.79,False,116224,22979294
78161,2020-09-01 19:48:39,FB34A539-74EC-EA11-B444-005056A7539A,5588920,0586045,28662,19687,0.010,2.22,False,116224,22979294
82593,2020-09-01 14:00:27,C38B741E-43EC-EA11-B444-005056A7539A,5574719,0371676,17456,16860,0.074,17.39,False,75066,17886732
192104,2020-09-01 11:42:54,BC7CA823-2E24-4A1A-991A-A3A759EECCD0,5483467,7196141,17456,16860,0.090,21.15,False,1334055,17241883
...,...,...,...,...,...,...,...,...,...,...,...
42010175,2020-09-30 18:05:07,A6ECE5A9-2E03-EB11-B444-005056A7539A,6660009,1813497,31296,18463,0.096,50.40,False,329561,15270701
42019225,2020-09-30 11:48:20,9ED2AB04-FA02-EB11-B444-005056A7539A,6642101,4218114,605,19230,0.092,2.67,False,758061,14530437
42027047,2020-09-30 20:38:50,4E195EE6-4403-EB11-B444-005056A7539A,6670940,4796446,17456,16860,0.090,21.15,False,880997,7142650
42039467,2020-09-30 14:04:41,56DBBEE6-0C03-EB11-B444-005056A7539A,6648930,9010719,17456,16860,0.090,20.68,False,1588788,7943806


If we look at the quantities, they are not integers. So, that means these are just weights. So, it's find to buy something that weighs less than 100 gramms. Nothing here too

# Final list of potential frauds

In [83]:
print('Found potential frauds:')
print(frauds)

Found potential frauds:
[1732482, 307668, 1927451, 684987, 1297595, 1929250, 703172, 1404427, 1404431, 1866912, 1104628, 146102, 1404428, 1011306, 1404428, 1404431]
