In [1]:
# -*- coding: utf-8 -*- Line 2
#----------------------------------------------------------------------------
# Project     : Price Alarm System Enhancement
# Created By  : Eungi Cho
# Created Date: 26/05/22
# Updated Date: 31/05/22
# version ='1.0'
# ---------------------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")
plt.style.use('default')

In [2]:
import pathlib
print(pathlib.Path().absolute())
df_raw = pd.read_csv('/Users/cho-eungi/Practice/CSV/market_entry_price.csv')
print(df_raw.shape)
# print(df_raw.isnull().sum())
df_raw = df_raw.drop_duplicates()
df_raw.head()

/Users/cho-eungi/Practice/Tridge
(10619563, 11)


Unnamed: 0,source_id,country,market_id,product_id,entry_id,currency,final_unit,date,price_min,price_max,price_avg
0,201,South Africa,1487,131,92926374,ZAR,kg,2020-07-20,19.64,21.2,19.956
1,39,India,810,490,41039702,INR,kg,2020-07-06,11.8,12.5,12.2
2,41,India,2188,133,50157058,INR,kg,2020-07-06,50.0,52.7,51.4
3,556,Bangladesh,6581,545,84458922,BDT,kg,2020-07-13,4400.0,4800.0,4600.0
4,150,Turkey,2482,126,58387432,TRY,,2020-07-13,10.0,15.0,11.288


In [3]:
# Create Test df
entry_lst = np.sort(df_raw['entry_id'].unique())
np.random.seed(0)
sample_entry = np.random.choice(entry_lst, 1000)
test_df = df_raw.loc[df_raw['entry_id'].isin(sample_entry)].sort_values(
    by = ['source_id', 'market_id', 'entry_id', 'date']).copy()
test_df['date'] = pd.to_datetime(test_df['date'])
test_df

Unnamed: 0,source_id,country,market_id,product_id,entry_id,currency,final_unit,date,price_min,price_max,price_avg
9270545,1,Netherlands,2775,99,41581966,USD,kg,2020-04-20,15.96,15.96,15.960
9045476,1,Netherlands,2775,99,41581966,USD,kg,2020-06-15,8.96,8.96,8.960
9235582,1,Netherlands,2775,99,41581966,USD,kg,2020-07-20,8.16,8.16,8.160
3920876,1,Netherlands,2782,113,41000912,USD,kg,2020-04-27,21.96,21.96,21.960
3821445,1,Netherlands,2782,113,41000912,USD,kg,2020-05-04,18.75,22.22,20.485
...,...,...,...,...,...,...,...,...,...,...,...
8855661,730,Vietnam,8784,15269,119373567,VND,kg,2022-03-28,22000.00,22000.00,22000.000
8855655,730,Vietnam,8784,15269,119373567,VND,kg,2022-04-04,20500.00,20500.00,20500.000
8855647,730,Vietnam,8784,15269,119373567,VND,kg,2022-04-11,20500.00,20500.00,20500.000
8855659,730,Vietnam,8784,15269,119373567,VND,kg,2022-04-18,19750.00,19750.00,19750.000


In [4]:
# W-MON date range from 2020 to 2022
date_range = pd.date_range('2020-01-01', '2022-06-30', freq = 'W-MON')
time_df = pd.DataFrame({'date': date_range})

# Left Join Test DF and Time DF
empty_df = pd.DataFrame()
for entry in sample_entry:
    time_df['entry_id_'] = entry
    entry_df = test_df.loc[test_df['entry_id'] == entry]
    joined_df = pd.merge(time_df, entry_df, left_on = ['date'], right_on = ['date'], how = 'left')
    empty_df = empty_df.append(joined_df)

In [5]:
# After 2022-02-01: Latest Data - df
df = empty_df.loc[
    (empty_df['date'] > '2022-02-01') & (empty_df['date'] < '2022-05-27')].sort_values(
    by = ['entry_id_', 'date']).copy()
df.set_index(np.arange(len(df)), inplace=True)
df

Unnamed: 0,date,entry_id_,source_id,country,market_id,product_id,entry_id,currency,final_unit,price_min,price_max,price_avg
0,2022-02-07,40857878,264.0,Brazil,901.0,121.0,40857878.0,BRL,kg,1.35,1.96,1.553333
1,2022-02-14,40857878,264.0,Brazil,901.0,121.0,40857878.0,BRL,kg,1.96,1.96,1.960000
2,2022-02-21,40857878,,,,,,,,,,
3,2022-02-28,40857878,,,,,,,,,,
4,2022-03-07,40857878,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
15995,2022-04-25,134278888,,,,,,,,,,
15996,2022-05-02,134278888,,,,,,,,,,
15997,2022-05-09,134278888,,,,,,,,,,
15998,2022-05-16,134278888,,,,,,,,,,


In [6]:
# Abnormal Price Range Detection
def exceed_3sigma(array):
    threshold_min = np.min(array) - 3 * np.std(array)
    threshold_max = np.min(array) + 3 * np.std(array)
    if threshold_min < 0:
        threshold_min = 0

    check_col = []
    for i in array:
        if i > threshold_max or i < threshold_min:
            check_col.append(1)
        else:
            check_col.append(0)
    return np.asarray(check_col)

# Count the occurances of consecutive null value
def count_consec_nan(array):
    consec_cnt = array.isnull().astype(int).groupby(array.notnull().astype(int).cumsum()).cumsum()
    return np.asarray(consec_cnt)

# -- Alarm -- #
# price change rate
def alarm_1(df):
    df['price_avg_chg'] = np.where(
        (df['price_avg'].notnull()) & (df['price_avg'].shift(1).notnull())
        , df['price_avg'] / df['price_avg'].shift(1)
        , 0)
    df['rank'] = df.groupby('entry_id')['date'].rank("dense", ascending = True)
    df['price_avg_chg_'] = np.where(df['rank'] == 1, np.nan, df['price_avg_chg'])
    df.drop(['price_avg_chg', 'rank'], axis = 1, inplace = True)
    df['alarm1'] = np.where(df['price_avg_chg_'] > 2, 1, 0)

# consecutive null count
def alarm_2(df):
    df['consec_null3'] = df.groupby('entry_id_')['price_avg'].transform(count_consec_nan)
    df['alarm2'] = np.where(df['consec_null3'] >=3, 1, 0)

# abnormal price
def alarm_3(df):
    group = df.groupby('entry_id')["price_avg"]
    df['alarm3'] = group.transform(exceed_3sigma)

# constant price
def alarm_4(df):
    df['rank_'] = df.groupby(by = 'entry_id')['date'].rank("dense", ascending = True)
    df['price_avg_shift'] = np.where(df['rank_'] == 1, np.nan, df['price_avg'].shift(1))
    df.drop(['rank_'], axis = 1, inplace = True)
    df['consec_count_same'] = (df['price_avg'] == df['price_avg_shift']).groupby(
        (df['price_avg'] != df['price_avg_shift']).cumsum()
    ).cumsum()
    threshold = 10
    df['alarm4'] = np.where(df['consec_count_same'] >= threshold, 1, 0)
    
# change in number of entries in source 
def alarm_5(df):
    df_source = df.sort_values(by = ['date', 'source_id', 'entry_id']).copy()
    entry_countBysource = pd.DataFrame(empty_df.groupby(
        by = ['date', 'source_id'])['entry_id'].count()
                                      ).reset_index(level = (0,1))
    entry_countBysource = entry_countBysource.sort_values(by = ['source_id', 'date'])
    entry_countBysource['diff'] = entry_countBysource['entry_id'] - entry_countBysource['entry_id'].shift(1)
    entry_countBysource['Rank'] = entry_countBysource.groupby(by = 'source_id')['date'].rank("dense", ascending = True)
    entry_countBysource['diff_'] = np.where(entry_countBysource['Rank'] == 1, np.nan, entry_countBysource['diff'])
    entry_countBysource.drop(['Rank', 'diff'], axis = 1, inplace = True)

    threshold_alarm5 = 10
    result = entry_countBysource.loc[entry_countBysource['diff_'].abs() > threshold_alarm5]
    result_dict = result.to_dict('records')
    source_list = []
    date_list = []
    for result in result_dict:
        source_list.append(result['source_id'])
        date_list.append(result['date'])

    df['alarm5'] = np.where((df['source_id'].isin(source_list)) & (df['date'].isin(date_list)),
                           1, 0)

In [7]:
alarm_5(df)
alarm_4(df)
alarm_3(df)
alarm_2(df)
alarm_1(df)

In [24]:
print('Process of analysis: ', 'df_raw -> 1000 entries random sampling -> analysis on "22/02/01 ~ 22/05/27" -> df')
print('total number of rows: ', len(df))
print('total number of entries: ', len(df['entry_id_'].unique())) # Why -2...
print('alarm_5: change in # of entries in each source', len(df[df['alarm5'] == 1]))
print('alarm_4: constant price', len(df[df['alarm4'] == 1]))
print('alarm_3: abnormal price range', len(df[df['alarm3'] == 1]))
print('alarm_2: consecutive null count ', len(df[df['alarm2'] == 1]))
print('alarm_1: price change rate (200%) ', len(df[df['alarm1'] == 1]))

## Point 1
# alarm 5 threshold = 10: source 별로 묶여 있는 entry 개수가 너무 다르기 때문에, threshold를 entry마다 달리 설정할 필요성이 있어 보임. 기준을 별도로 정해야 할 필요성이 있음.
# alarm 4 threshold = 10: 이는 기준을 비교적 정성적으로 정해도 될 것으로 판단 됨.
# alarm 3 threshold = confidence level (99): 기준을 별도로 정해야 할 필요성이 있음.
# alarm 2 threshold = 3: 이는 기준을 비교적 정성적으로 정해도 될 것으로 판단 됨.
# alarm 1 threshold = 200%: 이는 기준을 비교적 정성적으로 정해도 될 것으로 판단 됨.

## Point 2
# alarm 2와 같은 경우 그 수가 많은 이유는, 이전부터 "어떠한 문제" 가 발생하여 crawl 되고 있지 않았지만
# detect되지 않아 방치되고 있던 entry로 보임. 따라서 consecutive null count를 했을 때, 그 수가
# 현재 임의로 설정한 threshold인 3을 초과하여 alarm detection에 포함된 것으로 보임.

Process of analysis:  df_raw -> 1000 entries random sampling -> analysis on "22/02/01 ~ 22/05/27" -> df
total number of rows:  16000
total number of entries:  998
alarm_5: change in # of entries in each source 194
alarm_4: constant price 12
alarm_3: abnormal price range 237
alarm_2: consecutive null count  10517
alarm_1: price change rate (200%)  14


# Alarm 1: Abnormal Price Change (rate > 2)

In [9]:
# alarm 1. 1) 특정 entry에서 report된 price_avg 값이 2) 그 전 시점의 price_avg_chg와의 비가 2배 이상

# df['price_avg_shift'] = df['price_avg'].shift(1)
# # price_avg_shift 와 price_avg 컬럼 모두 null 이 아닌 경우에만 비율 계산. 아니면 0
# df['price_avg_chg'] = np.where((df['price_avg'].notnull()) & (df['price_avg_shift'].notnull())
#                                , df['price_avg'] / df['price_avg_shift']
#                                , 0)
# df

In [10]:
# df['rank'] = df.groupby(by = 'entry_id')['date'].rank("dense", ascending = True)
# df['price_avg_chg_'] = np.where(df['rank'] == 1, np.nan, df['price_avg_chg'])
# df.drop(['price_avg_shift', 'price_avg_chg', 'rank'], axis = 1, inplace = True)
# df

In [11]:
# df['alarm1'] = np.where(df['price_avg_chg_'] > 2, 1, 0)
# df.loc[df['alarm1'] == 1].shape
# # df.loc[df['alarm1'] == 1]

# Alarm 2: Discontinued price data (consecutive null cnt)

In [12]:
# # alarm 2. 1) 특정 entry에서 report된 price_avg 값이 2) 연속적으로 null을 3번 기록한 경우에 해당

# # count consecitive nan function
# def count_consec_nan(array):
#     consec_cnt = array.isnull().astype(int).groupby(array.notnull().astype(int).cumsum()).cumsum()
#     return consec_cnt

# # entry 별로 price_avg 컬럼에서 consecutive nan을 count하는 새로운 컬럼을 추가
# df['consec_null3'] = df.groupby(by = 'entry_id_')['price_avg'].transform(count_consec_nan)
# # 3이상을 기록할 경우 alarm2 col = 1
# df['alarm2'] = np.where(df['consec_null3'] >=3, 1, 0)
# df[df['alarm2']==1]

# Alarm 3: Abnormal entry

In [13]:
# # alarm 3. 1) 특정 entry에서 report된 price_ave 값이 2) 임계치 이상을 벗어난 경우
# # 임계치를 whisker = .75 percentile +- 1.5*IQR
# def exceed_3sigma(array):
#     threshold_min = np.min(array) - 3 * np.std(array)
#     threshold_max = np.min(array) + 3 * np.std(array)
#     if threshold_min < 0:
#         threshold_min = 0
    
#     check_col = []
#     for i in array:
#         if i > threshold_max or i < threshold_min:
#             check_col.append(1)
#         else:
#             check_col.append(0)
#     return np.asarray(check_col)

# df['alarm3'] = df.groupby(by = 'entry_id')['price_avg'].transform(exceed_3sigma)
# df.loc[df['alarm3'] == 1].shape

# Alarm 4: Constant price

In [14]:
# # alarm 4. 1) 특정 entry에서 report된 price_ave 값이 2) 일정 임계기간동안 같은 값을 반복하는 경우

# # shift(1)을 통해 비교 column 생성. But entry_id로 groupby되어 나타나는 첫 번째 shift값은 np.nan으로 변경
# df['price_avg_shift'] = df['price_avg'].shift(1)
# df['Rank'] = df.groupby('entry_id')['date'].rank("dense", ascending = True)
# df['price_avg_shift_'] = np.where(df['Rank'] == 1, np.nan, df['price_avg_shift'])
# df.drop(['price_avg_shift', 'Rank'], axis = 1, inplace = True)

# # price_avg 와 price_avg_shift_ 간 비교한 후, groupby하여 cumsum을 적용함으로써 
# # 같은(same) 값의 consecutive count 실행
# df['consec_count_same'] = (df['price_avg'] == df['price_avg_shift_']).groupby(
#     (df['price_avg'] != df['price_avg_shift_']).cumsum()
# ).cumsum()

# # comsec count threshoold 8로 설정 (2개월)): entry별, 혹은 market별로 다르게 설정해야할지?
# # 1) np.where multiple conditions: https://stackoverflow.com/questions/39109045/numpy-where-with-multiple-conditions
# # OR
# # 2) for group, subset in df.groupby['market_id'] -> if group == 'certain_market_id' & subset.consec_count_same == 'certain value':
# # df.loc[(df['market_id'] == group) & (df['consec_count_same'] == 'certain_value')]['alarm4'] == 1
# threshold = 8
# df['alarm4'] = np.where(df['consec_count_same'] >= threshold, 1, 0)
# df.loc[df['alarm4'] == 1].shape

# Alarm 5: Abnormal entry count change

In [15]:
# # alarm 5. 1) 특정 소스에서 감지된 엔트리의 총 개수합이 2) 임계값 이상 변화한 경우
# df_source = empty_df.sort_values(by = ['date', 'source_id', 'entry_id']).copy()
# df_source

In [16]:
# # 소스에 해당되는 entry를 count한 후, 크롤링되는 시점마다 그 전 시점과의 개수 차이를 계산
# # 일정 threshold를 정한 후, 개수의 차이의 절대값이 해당 threshold보다 크면 alarm 5 출력
# entry_countBysource = pd.DataFrame(empty_df.groupby(by = ['date', 'source_id'])['entry_id'].count()).reset_index(level = (0,1))
# entry_countBysource = entry_countBysource.sort_values(by = ['source_id', 'date'])
# entry_countBysource

In [17]:
# entry_countBysource['diff'] = entry_countBysource['entry_id'] - entry_countBysource['entry_id'].shift(1)
# entry_countBysource['Rank'] = entry_countBysource.groupby(by = 'source_id')['date'].rank("dense", ascending = True)
# entry_countBysource['diff_'] = np.where(entry_countBysource['Rank'] == 1, np.nan, entry_countBysource['diff'])
# entry_countBysource.drop(['Rank', 'diff'], axis = 1, inplace = True)
# entry_countBysource

In [18]:
# threshold_alarm5 = 2
# result = entry_countBysource.loc[entry_countBysource['diff_'].abs() > threshold_alarm5]
# result_dict = result.to_dict('records')
# source_list = []
# date_list = []
# for result in result_dict:
#     source_list.append(result['source_id'])
#     date_list.append(result['date'])

In [19]:
# df['alarm5'] = np.where((df['source_id'].isin(source_list)) & (df['date'].isin(date_list)),
#                        1, 0)
# df.loc[df['alarm5'] == 1]