# ３章 時系列データの加工・可視化１０本ノック

## ノック４１：時系列データを読み込んでみよう

In [None]:
from glob import glob

# Gather every CSV under the out_0001 directory; sorting the path strings
# gives a stable, name-based order for the later cells.
files = sorted(
    glob('../../downloads/100knock/chapter-3/data/person_count_1sec/out_0001/*.csv')
)
# Show the first five paths as the cell output.
files[:5]

In [None]:
import pandas as pd

# Load only the first CSV to inspect its layout before reading all files.
first_csv = files[0]
data = pd.read_csv(first_csv)
display(data.head(n=5))  # first five rows
print(data.dtypes)  # dtypes as inferred by read_csv (no date parsing yet)

In [None]:
# Convert the receive_time column to datetime after the fact, then
# re-check the preview and dtypes to confirm the conversion.
parsed_receive_time = pd.to_datetime(data['receive_time'])
data['receive_time'] = parsed_receive_time
display(data.head())
print(data.dtypes)

In [None]:
# Alternative to converting afterwards: have read_csv parse receive_time
# as a datetime column while loading the first file.
data = pd.read_csv(
    files[0],
    parse_dates=["receive_time"],
)
display(data.head())
print(data.dtypes)

In [None]:
# Read every CSV in `files` with receive_time parsed as datetime and stack
# them into one DataFrame. ignore_index=True renumbers the rows so the
# per-file indexes do not repeat in the combined frame.
# A list comprehension replaces the append loop, so `data` is only ever
# bound to a DataFrame and no loop temporaries are left in the namespace.
data = pd.concat(
    [pd.read_csv(path, parse_dates=["receive_time"]) for path in files],
    ignore_index=True,
)
display(data.head())
# Total number of rows across all files (shown as the cell output).
len(data)

## ノック４２：日付の範囲を確認しよう

In [None]:
# Earliest and latest timestamps present in the combined data.
receive_time = data['receive_time']
min_receive_time, max_receive_time = receive_time.min(), receive_time.max()
print(min_receive_time, max_receive_time, sep='\n')

In [None]:
# Length of the covered period: latest timestamp minus earliest.
covered_span = data['receive_time'].max() - data['receive_time'].min()
print(covered_span)

## ノック４３：日毎のデータ件数を確認しよう

In [None]:
# Derive the calendar date (time of day dropped) of each record so rows
# can be grouped per day.
calendar_date = data['receive_time'].dt.date
data['receive_date'] = calendar_date
data.head()

In [None]:
# Count the non-null `id` values for each date; as_index=False keeps
# receive_date as an ordinary column rather than the index.
date_and_id = data[['receive_date', 'id']]
grouped_by_date = date_and_id.groupby('receive_date', as_index=False)
daily_count = grouped_by_date.count()
daily_count.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Wide figure with vertical tick labels so the date labels stay readable.
plt.figure(figsize=(15, 5))
plt.xticks(rotation=90)
# One bar per date; bar height is the number of records for that date.
sns.barplot(data=daily_count, x='receive_date', y='id')

## ノック４４：日付から曜日を算出しよう

In [None]:
# Add the weekday both as a number (Monday=0 ... Sunday=6) and as its
# English name.
receive_time_accessor = data['receive_time'].dt
data['dayofweek'] = receive_time_accessor.dayofweek
data['day_name'] = receive_time_accessor.day_name()
data.head()

In [None]:
# One row per date (first occurrence kept) to check the weekday columns
# against the dates; show the first ten dates.
(
    data[['receive_date', 'dayofweek', 'day_name']]
    .drop_duplicates(subset='receive_date')
    .head(10)
)

## ノック４５：特定範囲のデータに絞り込もう

In [None]:
import datetime as dt

# Keep the half-open window [2021-01-20 00:00, 2021-01-23 00:00).
# .copy() makes data_extract an independent frame, so the columns added
# to it later are not written through a slice of `data`.
window_start = dt.datetime(2021, 1, 20)
window_end = dt.datetime(2021, 1, 23)
in_window = (data['receive_time'] >= window_start) & (data['receive_time'] < window_end)
data_extract = data.loc[in_window].copy()
display(data_extract.head())
display(data_extract.tail())

## ノック４６：秒単位のデータを作成しよう

In [None]:
# Round each timestamp to the nearest whole second.
# The lowercase 's' alias is used because the uppercase 'S' frequency
# alias is deprecated in recent pandas (2.2+) and emits a FutureWarning.
data_extract['receive_time_sec'] = data_extract['receive_time'].dt.round('s')
data_extract.head()

In [None]:
# Compare the row count with the number of distinct second values; a
# smaller second number means several rows share the same second.
row_count = len(data_extract)
distinct_second_count = len(data_extract['receive_time_sec'].unique())
print(row_count)
print(distinct_second_count)

In [None]:
# keep=False flags every row whose second value occurs more than once,
# so all members of each duplicate group appear in the preview.
shares_second = data_extract['receive_time_sec'].duplicated(keep=False)
data_extract[shares_second].head()

In [None]:
# Truncate (floor) each timestamp to its whole second instead of rounding,
# so a timestamp is never moved into the following second.
# The lowercase 's' alias is used because the uppercase 'S' frequency
# alias is deprecated in recent pandas (2.2+) and emits a FutureWarning.
data_extract['receive_time_sec'] = data_extract['receive_time'].dt.floor('s')
display(data_extract.head())
# Row count vs. number of distinct seconds after flooring.
print(len(data_extract))
print(len(data_extract['receive_time_sec'].unique()))

In [None]:
# Inspect the rows that still share a second value; keep=False marks
# every member of a duplicate group, not only the later occurrences.
still_shares_second = data_extract['receive_time_sec'].duplicated(keep=False)
data_extract[still_shares_second].head()

In [None]:
# Keep a single row per second (drop_duplicates keeps the first
# occurrence by default), then record the covered second range, which
# rebinds min_receive_time / max_receive_time for the cells below.
data_extract = data_extract.drop_duplicates(subset=['receive_time_sec'])
seconds = data_extract['receive_time_sec']
min_receive_time, max_receive_time = seconds.min(), seconds.max()
print(len(data_extract))
print(f'{min_receive_time}から{max_receive_time}')

## ノック４７：秒単位のデータを整形しよう

In [None]:
# Demonstrate date_range at one-second frequency; both endpoints are
# included. Lowercase 's' replaces the uppercase 'S' alias, which is
# deprecated in recent pandas (2.2+) and emits a FutureWarning.
print(pd.date_range('2021-01-15', '2021-01-16', freq='s'))

In [None]:
# Build a gap-free one-row-per-second skeleton between the currently
# bound min_receive_time and max_receive_time (both endpoints included).
# Lowercase 's' replaces the uppercase 'S' alias, which is deprecated in
# recent pandas (2.2+) and emits a FutureWarning.
base_data = pd.DataFrame(
    {'receive_time_sec': pd.date_range(min_receive_time, max_receive_time, freq='s')}
)
display(base_data.head())
display(base_data.tail())
print(len(base_data))

In [None]:
# Left-join the observations onto the per-second skeleton: every second of
# base_data is kept, and seconds without a matching record get NaN in the
# observation columns.
data_base_extract = base_data.merge(
    data_extract,
    on='receive_time_sec',
    how='left',
)
display(data_base_extract.head())
# Number of missing values per column after the join.
display(data_base_extract.isna().sum())

## ノック４８：秒間の欠損データを処理しよう

In [None]:
# Order rows chronologically, then forward-fill: each missing value takes
# the most recent earlier non-missing value in the same column.
# .ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas (2.1+); the sort is done by assignment
# rather than with inplace=True.
data_base_extract = data_base_extract.sort_values('receive_time_sec').ffill()
data_base_extract.head()

## ノック４９：通った人数を可視化しよう

In [None]:
# Work on an independent copy holding only the timestamp and the in1/out1
# columns used by the analysis below.
analysis_columns = ['receive_time_sec', 'in1', 'out1']
data_analytics = data_base_extract[analysis_columns].copy()
data_analytics.head()

In [None]:
# Shift all columns down by one row so each row carries the previous
# row's values; the first row becomes missing.
data_before_1sec = data_analytics.shift(periods=1)
data_before_1sec.head()

In [None]:
# Rename the shifted columns with a _b1sec suffix, then place them next to
# the current-row columns (column-wise concat aligned on the row index).
shifted_names = ['receive_time_sec_b1sec', 'in1_b1sec', 'out1_b1sec']
data_before_1sec.columns = shifted_names
data_analytics = pd.concat(
    [data_analytics, data_before_1sec],
    axis='columns',
)
data_analytics.head()

In [None]:
# Change of in1 / out1 relative to the previous row.
# NOTE(review): presumably in1/out1 are running totals, making the
# difference the number of people counted in that second -- confirm.
for counter in ('in1', 'out1'):
    data_analytics[f'{counter}_calc'] = (
        data_analytics[counter] - data_analytics[f'{counter}_b1sec']
    )
data_analytics.head()

In [None]:
# Hour bucket key formatted as YYYYMMDDHH (a string), used to aggregate
# the per-second values by hour.
hour_key = data_analytics['receive_time_sec'].dt.strftime('%Y%m%d%H')
data_analytics['date_hour'] = hour_key
data_analytics.head()

In [None]:
# Sum the per-second differences within each hour, then reshape to long
# form (date_hour, variable, value) so both series can share one plot.
hourly_totals = (
    data_analytics[['date_hour', 'in1_calc', 'out1_calc']]
    .groupby('date_hour', as_index=False)
    .sum()
)
viz_data = hourly_totals.melt(
    id_vars='date_hour',
    value_vars=['in1_calc', 'out1_calc'],
)
viz_data.head()

In [None]:
# Hourly totals as lines, one line per variable (in1_calc / out1_calc).
plt.figure(figsize=(15, 5))
plt.xticks(rotation=90)
sns.lineplot(data=viz_data, x='date_hour', y='value', hue='variable')

## ノック５０：移動平均を計算して可視化しよう

In [None]:
# Rebuild the hourly totals in wide form (one column per variable), which
# is the shape the rolling-window calculation below operates on.
hourly_columns = data_analytics[['date_hour', 'in1_calc', 'out1_calc']]
viz_data = hourly_columns.groupby('date_hour', as_index=False).sum()
viz_data.head(10)

In [None]:
# Moving average over a window of 3 consecutive rows (hours). The first
# two rows are NaN because a full window is not yet available.
hourly_values = viz_data[['in1_calc', 'out1_calc']]
viz_data_rolling = hourly_values.rolling(window=3).mean()
viz_data_rolling.head(10)

In [None]:
# Re-attach the hour key (aligned on the row index), then reshape to long
# form for plotting.
viz_data_rolling['date_hour'] = viz_data['date_hour']
viz_data_rolling = viz_data_rolling.melt(
    id_vars='date_hour',
    value_vars=['in1_calc', 'out1_calc'],
)

# Moving-average lines, one per variable.
plt.figure(figsize=(15, 5))
plt.xticks(rotation=90)
sns.lineplot(data=viz_data_rolling, x='date_hour', y='value', hue='variable')