In [301]:
import pandas as pd

## create a dataframe:

In [311]:
data = pd.read_csv(
    "../data/feed-views.log",
    names=['datetime', 'user'],
    sep='\t'
)
data['datetime'] = pd.to_datetime(data['datetime'], infer_datetime_format=True).astype("datetime64[ns]")
data['year', 'month'] = data['datetime'].dt.year
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['hour'] = data['datetime'].dt.hour
data['minute'] = data['datetime'].dt.minute
data['second'] = data['datetime'].dt.second
data

## create the new column daytime

In [312]:
evaluation_bins = [0, 3, 6, 10, 16, 19, 23]
group_names = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']
data['daytime'] = pd.cut(data['hour'], bins = evaluation_bins, labels = group_names, include_lowest = True, ordered=False)
data.set_index('user')

Unnamed: 0_level_0,datetime,"(year, month)",month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon
...,...,...,...,...,...,...,...,...
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening
pavel,2020-05-21 23:23:49.995349,2020,5,21,23,23,49,evening
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening


##  number of elements

In [323]:
number_of_elements = data.count()
value_counts = data.value_counts('daytime')
print(f"number of elements: {number_of_elements.daytime}")
value_counts

number of elements: 1076


daytime
evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
dtype: int64

## sort values

In [314]:
data = data.sort_values(['hour', 'minute', 'second'])
data

Unnamed: 0,datetime,user,"(year, month)",month,day,hour,minute,second,daytime
944,2020-05-15 00:00:13.222265,valentina,2020,5,15,0,0,13,night
945,2020-05-15 00:01:05.153738,valentina,2020,5,15,0,1,5,night
563,2020-05-12 00:01:27.764025,pavel,2020,5,12,0,1,27,night
564,2020-05-12 00:01:38.444917,pavel,2020,5,12,0,1,38,night
565,2020-05-12 00:01:55.395042,pavel,2020,5,12,0,1,55,night
...,...,...,...,...,...,...,...,...,...
1074,2020-05-21 23:49:22.386789,artem,2020,5,21,23,49,22,evening
246,2020-05-09 23:53:55.599821,anatoliy,2020,5,9,23,53,55,evening
247,2020-05-09 23:54:54.260791,pavel,2020,5,9,23,54,54,evening
942,2020-05-14 23:58:56.754866,valentina,2020,5,14,23,58,56,evening


## min, max and mode

In [346]:
groups = data.groupby('daytime').max('hour')
print(f"maximum hour where time of day is night: {groups.loc['night']['hour']}")
print("User who visited at those hours:")
data.loc[data['hour'] == groups.loc['night']['hour']].head(1)

maximum hour where time of day is night: 3
User who visited at those hours:


Unnamed: 0,datetime,user,"(year, month)",month,day,hour,minute,second,daytime
46,2020-04-19 03:23:35.471598,konstantin,2020,4,19,3,23,35,night


In [347]:
groups = data.groupby('daytime').min('hour')
print(f"minimum hour where time of day is morning: {groups.loc['morning']['hour']}")
print("User who visited at those hours:")
data.loc[data['hour'] == groups.loc['morning']['hour']].head(1)


minimum hour where time of day is morning: 8
User who visited at those hours:


Unnamed: 0,datetime,user,"(year, month)",month,day,hour,minute,second,daytime
963,2020-05-15 08:16:03.918402,alexander,2020,5,15,8,16,3,morning


In [348]:
print(f"mode for hour is {data['hour'].mode().iloc[0]}")
print(f"mode for daytime is {data['daytime'].mode().iloc[0]}")

mode for hour is 22
mode for daytime is evening


## 3 earliest hours

In [352]:
print('Earliest in the morning:')
morning = data.loc[data['daytime'] == 'morning']
morning.nsmallest(3, "hour")

Earliest in the morning


Unnamed: 0,datetime,user,"(year, month)",month,day,hour,minute,second,daytime
963,2020-05-15 08:16:03.918402,alexander,2020,5,15,8,16,3,morning
964,2020-05-15 08:35:01.471463,alexander,2020,5,15,8,35,1,morning
965,2020-05-15 09:02:24.999438,alexander,2020,5,15,9,2,24,morning


In [353]:
print('Latest in the morning:')
morning = data.loc[data['daytime'] == 'morning']
morning.nlargest(3, "hour")

Latest in the morning


Unnamed: 0,datetime,user,"(year, month)",month,day,hour,minute,second,daytime
976,2020-05-16 10:02:08.925840,konstantin,2020,5,16,10,2,8,morning
756,2020-05-14 10:08:04.832361,maxim,2020,5,14,10,8,4,morning
757,2020-05-14 10:08:05.726956,maxim,2020,5,14,10,8,5,morning


## describe statistics

In [361]:
q1 = data.describe().loc['25%', 'hour']
q3 = data.describe().loc['75%', 'hour']
iqr = q3 - q1
print(f"iqr is: {iqr}")

iqr is: 9.0
