# **Shopee Code League - Order Brushing**

This notebook is for Shopee Code League Competition, Order Brushing, to detecting abnormal user behaviour. The competition details can be obtained from [here](https://www.kaggle.com/c/order-brushing-shopee-code-league/overview).

Team name: CY Team (Malaysia)

Team member:
1. Chong Li Chuin
2. Ee Yeo Keat

# **Import libraries and read dataset**

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime
from datetime import timedelta
from google.colab import files

In [0]:
data = pd.read_csv('order_brush_order.csv').sort_values(by=['event_time','shopid','userid'])
data

Unnamed: 0,orderid,shopid,userid,event_time
150060,31075200506751,6042309,97707522,2019-12-27 00:00:00
81020,31075200506753,8715449,97707522,2019-12-27 00:00:00
25985,31075200506752,104804492,97707522,2019-12-27 00:00:00
146598,31075201870570,190969466,170182475,2019-12-27 00:00:02
80355,31075205798264,2859407,12532131,2019-12-27 00:00:05
...,...,...,...,...
114113,31507183252446,149254894,193333760,2019-12-31 23:59:43
206585,31507187390691,147941492,40258063,2019-12-31 23:59:47
160657,31507191066627,154074176,2338306,2019-12-31 23:59:51
126212,31507191066628,187123853,2338306,2019-12-31 23:59:51


In [0]:
## Set event_time as index 
df_time = data.set_index(pd.DatetimeIndex(data['event_time'])).drop('event_time', axis=1)
df_time = df_time.sort_index()

In [0]:
df_time

Unnamed: 0_level_0,orderid,shopid,userid
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-12-27 00:00:00,31075200506751,6042309,97707522
2019-12-27 00:00:00,31075200506753,8715449,97707522
2019-12-27 00:00:00,31075200506752,104804492,97707522
2019-12-27 00:00:02,31075201870570,190969466,170182475
2019-12-27 00:00:05,31075205798264,2859407,12532131
...,...,...,...
2019-12-31 23:59:43,31507183252446,149254894,193333760
2019-12-31 23:59:47,31507187390691,147941492,40258063
2019-12-31 23:59:51,31507191066627,154074176,2338306
2019-12-31 23:59:51,31507191066628,187123853,2338306


In [0]:
grouped_orders = df_time.groupby(['shopid', 'userid', pd.Grouper(freq='60min', label='left', base=0)]).sum()
grouped_orders

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,orderid
shopid,userid,event_time,Unnamed: 3_level_1
10009,196962305,2019-12-27 03:00:00,31086409141107
10051,2854032,2019-12-27 19:00:00,31144571933461
10051,48600461,2019-12-29 01:00:00,31254979546679
10061,62464559,2019-12-31 02:00:00,31431527100615
10061,130633421,2019-12-30 21:00:00,31412115824794
...,...,...,...
214949521,46269178,2019-12-31 20:00:00,31493203163305
214964814,200983383,2019-12-29 22:00:00,31328775676314
215175775,13688804,2019-12-31 09:00:00,31453591917585
215175775,129266028,2019-12-31 14:00:00,31472077221692


In [0]:
possible_brush = grouped_orders[grouped_orders.orderid > 2]
possible_brush

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,orderid
shopid,userid,event_time,Unnamed: 3_level_1
10009,196962305,2019-12-27 03:00:00,31086409141107
10051,2854032,2019-12-27 19:00:00,31144571933461
10051,48600461,2019-12-29 01:00:00,31254979546679
10061,62464559,2019-12-31 02:00:00,31431527100615
10061,130633421,2019-12-30 21:00:00,31412115824794
...,...,...,...
214949521,46269178,2019-12-31 20:00:00,31493203163305
214964814,200983383,2019-12-29 22:00:00,31328775676314
215175775,13688804,2019-12-31 09:00:00,31453591917585
215175775,129266028,2019-12-31 14:00:00,31472077221692


In [0]:
userids = []
possible_brush.reset_index().groupby('shopid')['userid'].apply(lambda x: userids.append(x.values))

shopid
10009        None
10051        None
10061        None
10084        None
10100        None
             ... 
214662358    None
214949521    None
214964814    None
215175775    None
215435223    None
Name: userid, Length: 18770, dtype: object

In [0]:
userids

[array([196962305]),
 array([ 2854032, 48600461]),
 array([ 62464559, 130633421, 168750452, 194819216]),
 array([   996734,   1668819,   4401933,   6488128,   8457753,   8838142,
         11753447,  12840239,  13837190,  13999404,  14062776,  16021925,
         18193868,  22105044,  23352983,  27099291,  33285757,  39828049,
         51586322,  57738503,  57956057,  68444473,  73993513,  77184045,
         77391117,  80643747,  81373632,  86491592,  88321715,  90239949,
         91048722,  93206344,  96570515,  99322339, 102616150, 115025463,
        115450459, 121615825, 123715078, 129074211, 132259411, 136264815,
        143378048, 151362536, 151363170, 156270912, 162847440, 167932181,
        177954889, 180772892, 181195859, 184840040, 185601856, 185802312]),
 array([   495431,    546854,   1700969,   3827594,   8149179,  10251073,
         12201862,  12201862,  23867438,  27265170,  30534780,  34507155,
         34632620,  35220553,  37830887,  40405968,  55224420,  75514903,
     

In [0]:
date = pd.to_datetime(data['event_time'])
date

150060   2019-12-27 00:00:00
81020    2019-12-27 00:00:00
25985    2019-12-27 00:00:00
146598   2019-12-27 00:00:02
80355    2019-12-27 00:00:05
                 ...        
114113   2019-12-31 23:59:43
206585   2019-12-31 23:59:47
160657   2019-12-31 23:59:51
126212   2019-12-31 23:59:51
179881   2019-12-31 23:59:56
Name: event_time, Length: 222750, dtype: datetime64[ns]

In [0]:
grouped_orders = df_time.groupby(['shopid', 'userid', pd.Grouper(freq='60min', label='left', base=0)])

In [0]:
data['Time'],data['Date']= date.apply(lambda x:x.time()), date.apply(lambda x:x.date())
data

Unnamed: 0,orderid,shopid,userid,event_time,Time,Date
150060,31075200506751,6042309,97707522,2019-12-27 00:00:00,00:00:00,2019-12-27
81020,31075200506753,8715449,97707522,2019-12-27 00:00:00,00:00:00,2019-12-27
25985,31075200506752,104804492,97707522,2019-12-27 00:00:00,00:00:00,2019-12-27
146598,31075201870570,190969466,170182475,2019-12-27 00:00:02,00:00:02,2019-12-27
80355,31075205798264,2859407,12532131,2019-12-27 00:00:05,00:00:05,2019-12-27
...,...,...,...,...,...,...
114113,31507183252446,149254894,193333760,2019-12-31 23:59:43,23:59:43,2019-12-31
206585,31507187390691,147941492,40258063,2019-12-31 23:59:47,23:59:47,2019-12-31
160657,31507191066627,154074176,2338306,2019-12-31 23:59:51,23:59:51,2019-12-31
126212,31507191066628,187123853,2338306,2019-12-31 23:59:51,23:59:51,2019-12-31


In [0]:
time = data['Time']
time

150060    00:00:00
81020     00:00:00
25985     00:00:00
146598    00:00:02
80355     00:00:05
            ...   
114113    23:59:43
206585    23:59:47
160657    23:59:51
126212    23:59:51
179881    23:59:56
Name: Time, Length: 222750, dtype: object

In [0]:
datetimeFormat = '%Y-%m-%d %H:%M:%S'

In [0]:
for i in data['event_time']:
  diff = datetime.datetime.strptime(str(i), datetimeFormat)- datetime.datetime.strptime(str(i), datetimeFormat)
  print(diff.seconds)

TypeError: ignored

In [0]:
def check_concentrate_rate():
  concentrate_rate = #Number of Orders within 1 hour / Number of Unique Buyers within 1 hour
  if concentrate_rate >= 3:
    shop_list.append(shopid)
  
  return shop_list

NameError: ignored

In [0]:
data

Unnamed: 0,orderid,shopid,userid,event_time,Time,Date
150060,31075200506751,6042309,97707522,2019-12-27 00:00:00,00:00:00,2019-12-27
81020,31075200506753,8715449,97707522,2019-12-27 00:00:00,00:00:00,2019-12-27
25985,31075200506752,104804492,97707522,2019-12-27 00:00:00,00:00:00,2019-12-27
146598,31075201870570,190969466,170182475,2019-12-27 00:00:02,00:00:02,2019-12-27
80355,31075205798264,2859407,12532131,2019-12-27 00:00:05,00:00:05,2019-12-27
...,...,...,...,...,...,...
114113,31507183252446,149254894,193333760,2019-12-31 23:59:43,23:59:43,2019-12-31
206585,31507187390691,147941492,40258063,2019-12-31 23:59:47,23:59:47,2019-12-31
160657,31507191066627,154074176,2338306,2019-12-31 23:59:51,23:59:51,2019-12-31
126212,31507191066628,187123853,2338306,2019-12-31 23:59:51,23:59:51,2019-12-31


In [0]:
save_data = data.shopid.unique
save_data

<bound method Series.unique of 150060      6042309
81020       8715449
25985     104804492
146598    190969466
80355       2859407
            ...    
114113    149254894
206585    147941492
160657    154074176
126212    187123853
179881     31844790
Name: shopid, Length: 222750, dtype: int64>

In [0]:
user = data.userid[:18770]

In [0]:
column_names = ["shopid", "userid"]

df = pd.DataFrame(columns = column_names)
df['shopid'] = save_data
df['userid'] = user
df

Unnamed: 0,shopid,userid
150060,6042309,97707522
81020,8715449,97707522
25985,104804492,97707522
146598,190969466,170182475
80355,2859407,12532131
...,...,...
175242,199316386,89533887
49566,84821602,115405128
64225,48685076,62353080
125215,110871170,2558553


In [0]:
df.to_csv('CY_Team_submission4.csv') 
files.download('CY_Team_submission4.csv')