In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(r"C:\ThacSi\HoachDinhCNTT\HDCNTT")) 
import utils, plot_help
import matplotlib.pyplot as plt

#avoid warning popping up
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

%matplotlib inline

In [3]:
event_df = utils.chunk_loader('../data/cleaned/checkin_eventlog.csv')
event_df.head()

Unnamed: 0,business_id
2009-12-30 02:53:27,jm_6bIhR_TuciFIk5rhR7g
2009-12-31 00:18:24,yEKmlxvhA2EWrMLbwWi7Hw
2010-01-16 00:36:59,TRxR32_T_7Ly_RW4Ke97fw
2010-01-16 01:15:27,OzoE9NH0xjhfsVMc7ygaaQ
2010-01-16 02:12:05,7lwe7n-Yc-V9E_HfLAeylg


### get first and last checkin of business and span over which checkins are being recorded.

In [4]:
#create a groupby object
event_df_groupby = event_df.reset_index(level=0).groupby(by='business_id', as_index=False)

#get first
first_checkin = event_df_groupby.min()
first_checkin = first_checkin.rename(columns={'index': 'first'})
first_checkin['first'] = pd.to_datetime(first_checkin['first']) 

first_checkin.head()

Unnamed: 0,business_id,first
0,---kPU91CF4Lq2-WlRu9Lw,2020-03-13 21:10:56
1,--0iUa4sNDFiZFrAdIWhZQ,2010-09-13 21:43:09
2,--30_8IhuyMHbSOcNWd6DQ,2013-06-14 23:29:17
3,--7PUidqRWpRSpXebiyxTg,2011-02-15 17:12:00
4,--7jw19RH9JKXgFohspgQw,2014-04-21 20:42:11


In [5]:
#get last
last_checkin = event_df_groupby.max()
last_checkin = last_checkin.rename(columns={'index': 'last'})
last_checkin['last'] = pd.to_datetime(last_checkin['last']) 

last_checkin.head()

Unnamed: 0,business_id,last
0,---kPU91CF4Lq2-WlRu9Lw,2021-11-11 16:23:50
1,--0iUa4sNDFiZFrAdIWhZQ,2014-04-12 23:04:47
2,--30_8IhuyMHbSOcNWd6DQ,2014-08-13 23:20:22
3,--7PUidqRWpRSpXebiyxTg,2015-09-27 13:18:32
4,--7jw19RH9JKXgFohspgQw,2021-06-21 19:59:50


In [6]:
first_last_df = pd.merge(left=first_checkin, 
                         right=last_checkin, 
                         how='inner',
                         left_on='business_id', 
                         right_on='business_id')

#get difference between first and last as seconds
first_last_df['span_checkin'] = (first_last_df['last'] - first_last_df['first']).apply(lambda x: x.seconds)

first_last_df.head()

Unnamed: 0,business_id,first,last,span_checkin
0,---kPU91CF4Lq2-WlRu9Lw,2020-03-13 21:10:56,2021-11-11 16:23:50,69174
1,--0iUa4sNDFiZFrAdIWhZQ,2010-09-13 21:43:09,2014-04-12 23:04:47,4898
2,--30_8IhuyMHbSOcNWd6DQ,2013-06-14 23:29:17,2014-08-13 23:20:22,85865
3,--7PUidqRWpRSpXebiyxTg,2011-02-15 17:12:00,2015-09-27 13:18:32,72392
4,--7jw19RH9JKXgFohspgQw,2014-04-21 20:42:11,2021-06-21 19:59:50,83859


### Evaluate what is the average monthly checkin for each business

In [7]:
event_df_month = event_df.copy()
event_df_month['month'] = pd.DatetimeIndex(event_df.index).month
event_df_month['ones'] = 1

event_df_month.head()

Unnamed: 0,business_id,month,ones
2009-12-30 02:53:27,jm_6bIhR_TuciFIk5rhR7g,12,1
2009-12-31 00:18:24,yEKmlxvhA2EWrMLbwWi7Hw,12,1
2010-01-16 00:36:59,TRxR32_T_7Ly_RW4Ke97fw,1,1
2010-01-16 01:15:27,OzoE9NH0xjhfsVMc7ygaaQ,1,1
2010-01-16 02:12:05,7lwe7n-Yc-V9E_HfLAeylg,1,1


In [8]:
#get total checkin by month
checkin_month_count = event_df_month.groupby(by=['business_id', 'month'], as_index=False).sum()
checkin_month_count.head()

Unnamed: 0,business_id,month,ones
0,---kPU91CF4Lq2-WlRu9Lw,1,1
1,---kPU91CF4Lq2-WlRu9Lw,3,1
2,---kPU91CF4Lq2-WlRu9Lw,4,1
3,---kPU91CF4Lq2-WlRu9Lw,5,1
4,---kPU91CF4Lq2-WlRu9Lw,6,1


In [9]:
#group by business ID to get average monthly checkin
checkin_month_avg = checkin_month_count.groupby(by='business_id', as_index=False).mean()
checkin_month_avg.head()

Unnamed: 0,business_id,month,ones
0,---kPU91CF4Lq2-WlRu9Lw,6.7,1.1
1,--0iUa4sNDFiZFrAdIWhZQ,6.714286,1.285714
2,--30_8IhuyMHbSOcNWd6DQ,7.0,1.0
3,--7PUidqRWpRSpXebiyxTg,5.857143,1.428571
4,--7jw19RH9JKXgFohspgQw,6.5,2.0


In [10]:
#combine in single dataframe
df_checkin = pd.merge(left=checkin_month_avg, 
                      right = first_last_df, 
                      how='inner', 
                      on='business_id')

df_checkin = df_checkin.drop(columns=['month'])

df_checkin = df_checkin.rename(columns={'ones': 'avg_month_checkin'})

df_checkin.head()

Unnamed: 0,business_id,avg_month_checkin,first,last,span_checkin
0,---kPU91CF4Lq2-WlRu9Lw,1.1,2020-03-13 21:10:56,2021-11-11 16:23:50,69174
1,--0iUa4sNDFiZFrAdIWhZQ,1.285714,2010-09-13 21:43:09,2014-04-12 23:04:47,4898
2,--30_8IhuyMHbSOcNWd6DQ,1.0,2013-06-14 23:29:17,2014-08-13 23:20:22,85865
3,--7PUidqRWpRSpXebiyxTg,1.428571,2011-02-15 17:12:00,2015-09-27 13:18:32,72392
4,--7jw19RH9JKXgFohspgQw,2.0,2014-04-21 20:42:11,2021-06-21 19:59:50,83859


In [11]:
df_checkin.to_csv('../data/cleaned/checkin_feat.csv')