In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, tzinfo
import pytz
from io import StringIO
import os
from IPython.display import Markdown
import boto3

In [2]:
# Wrapper function for convenience
def dm(text):
    return display(Markdown(text))

# Introduction
https://en.youbike.com.tw/region/main/stations/

In [3]:
class ConnectionToS3:
    """Factory method object to create an active boto3 S3 resource. Ensure the correct env variables are set before calling this object.

    Available class method:
        from_env(): create connection from environment variables

    """

    def __init__(
        self,
        bucket_name: str,
        aws_access_key_id: str,
        aws_secret_access_key: str,
        endpoint_url: str = None,
        region_name: str = "ap-northeast-1",
    ):
        self._resource = boto3.resource(
            "s3",
            endpoint_url=endpoint_url,
            region_name=region_name,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
        )
        self._bucket_name = bucket_name

    @classmethod
    def from_env(cls):
        app_env = os.getenv("APP_ENV", "local")
        print("Loading from env: ", app_env)
        if app_env == "local":
            return cls(
                "local-youbike",
                os.environ["MINIO_ACCESS_KEY_ID"],
                os.environ["MINIO_SECRET_ACCESS_KEY"],
                f'http://{os.environ["MINIO_HOST"]}:9000'
            )
        elif app_env == "stage":
            return cls(
                "stage-youbike",
                os.environ["AWS_ACCESS_KEY_ID"],
                os.environ["AWS_SECRET_ACCESS_KEY"],
            )
        else:
            raise Exception(f"The argument env={app_env} is not valid.")

    @property
    def resource(self):
        return self._resource

    @property
    def bucket_name(self):
        return self._bucket_name



In [None]:
os.environ['APP_ENV'] = 'stage'

In [None]:
connection = ConnectionToS3.from_env()
connection.bucket_name

In [None]:
def download_from_bucket(bucket_name: str, remote_path: str, filter: str, dest_dir: str):
    bucket = connection.resource.Bucket(bucket_name)

    for obj in bucket.objects.all():
        if f'{remote_path}{filter}' in obj.key:
            local_file_path = os.path.join(dest_dir, obj.key)
            if '/' in obj.key:
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
                connection.resource.meta.client.download_file(connection.bucket_name, obj.key, local_file_path)
            print(f"Downloaded {obj.key} at {dest_dir}")

In [None]:
def create_or_append_df(parquet_path: str, df: pd.DataFrame) -> pd.DataFrame:
    if df is None:
        concat_df = pd.read_parquet(parquet_path).loc[[]]
    else:
        df_to_append = pd.read_parquet(parquet_path)
        concat_df = pd.concat([df, df_to_append], join='outer')
    return concat_df

In [None]:
download_from_bucket(connection.bucket_name, 'raw_data/', 'youbike_dock_info_2024-03-18', '../tmp_data/parquet_raw_data')

In [None]:
#Create historical data df
hist_df = None
cnt = 0
for i in os.listdir('../tmp_data/parquet_raw_data/raw_data/'):
    if "youbike_dock_info_2024-03-18" in i:
        print(i)
        try: 
            hist_df = create_or_append_df(f"../tmp_data/parquet_raw_data/raw_data/{i}", hist_df)
        except:
            print("failed with ", i)
        cnt += 1
        print(f"Added: {cnt} / {len(os.listdir('../tmp_data/parquet_raw_data/raw_data/'))}")

In [None]:
#Enforce Schema for historical df
hist_df = hist_df[['id', 'name', 'type', 'space', 'full', 'empty', 'bike_yb2', 'bike_eyb', 'city', 'area',
         'lat', 'lng', 'address', 'is_open', 'place_id', 'last_update_ts', 'extraction_ts']].reset_index(drop=True)
display(hist_df.head(5), hist_df.shape)

# Validate & clean the dataset

In [50]:
# Analyze Raw da
main_df = pd.read_parquet("./youbike_dock_info_history_2024-03-01_2024-03-18_raw.parquet")



**Columns are understood as follows:**
- id: unique identifier per bike station
- type: youbike type (1.0, 2.0)
- space: total available bike slots per station (= full + empty)
- full: nbr of occupied slots (= bike_yb2 + bike_eyb)
- empty: nbr of available slots for parking
- bike_yb2: count of YouBike 2.0 in occupied slots (see 'full')
- bike_eyb: count of E-Youbike 2.0 in occupied slots (see 'full')
- city: city where bike station is located
- area: city's district where bike station is located
- lat: latitude coordinate of bike station
- lng: longitude coordinate of bike station
- place_id: ??
- address: postal address of bike station
- is_open: ??
- last_updated_ts: timestamp of data in unix epoch
- extraction_ts: ts data was pulled from API

## Check types & missing values

In [51]:
display(main_df.info(show_counts=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12650288 entries, 0 to 12650287
Data columns (total 17 columns):
 #   Column          Non-Null Count     Dtype                      
---  ------          --------------     -----                      
 0   id              12650288 non-null  int64                      
 1   name            12650288 non-null  object                     
 2   type            12650288 non-null  int64                      
 3   space           12650288 non-null  int64                      
 4   full            12650288 non-null  int64                      
 5   empty           12650288 non-null  int64                      
 6   bike_yb2        12650288 non-null  int64                      
 7   bike_eyb        12650288 non-null  int64                      
 8   city            12452384 non-null  object                     
 9   area            12650288 non-null  object                     
 10  lat             12650288 non-null  float64                    
 

None

**Observation**
- place_id contains no value
- city has empty values
- updated_at is an int64

In [52]:
# Drop empty cities
main_df.drop(main_df[main_df["city"].isna()].index, inplace=True) # Discard vals with null cities

#Drop youbikes of type 1 - because soon discontinued
main_df.drop(main_df[main_df["type"] != 2].index, inplace=True)

#

## Discard irrelevant features

In [53]:
main_df.drop(labels=['place_id', 'address'], axis=1, inplace=True)

In [54]:
dm("#### Any missing values left?")
display(main_df.isna().sum())

#### Any missing values left?

id                0
name              0
type              0
space             0
full              0
empty             0
bike_yb2          0
bike_eyb          0
city              0
area              0
lat               0
lng               0
is_open           0
last_update_ts    0
extraction_ts     0
dtype: int64

## Check assumptions

In [55]:
# Space = full + empty

space_uneq = main_df[main_df["space"] != (main_df["full"] + main_df["empty"])]


display(Markdown('#### Checking if space = full + empty'))
display(Markdown(f'**Rows where unequal:** {space_uneq.shape[0]}'))
display(Markdown(f'**As proportion of total dataset:** {space_uneq.shape[0] / main_df.shape[0]}')) #displays ratio of total bikes 
display(Markdown('**Excerpt output**'))
display(space_uneq.head(5)) # Displays where it is not true
dm(f'**conclusion:** Understanding the is_open code mapping would be useful to assess how to handle these records')

#### Checking if space = full + empty

**Rows where unequal:** 1196884

**As proportion of total dataset:** 0.10951727845970659

**Excerpt output**

Unnamed: 0,id,name,type,space,full,empty,bike_yb2,bike_eyb,city,area,lat,lng,is_open,last_update_ts,extraction_ts
1,500501081,北興停車場,2,25,0,0,0,0,新竹縣,竹北市,24.83,121.03,0,2024-03-07 15:09:17+08:00,2024-03-07 18:25:10+08:00
4,500501077,竹北國民運動中心(縣泳館),2,25,0,0,0,0,新竹縣,竹北市,24.82,121.02,0,2024-03-07 15:09:17+08:00,2024-03-07 18:25:10+08:00
7,500501075,綠41公園,2,20,0,0,0,0,新竹縣,竹北市,24.82,121.02,0,2024-03-07 15:28:19+08:00,2024-03-07 18:25:10+08:00
8,500501073,廣五公園,2,18,0,0,0,0,新竹縣,竹北市,24.83,121.0,0,2024-03-07 15:09:18+08:00,2024-03-07 18:25:10+08:00
10,501310001,左鎮化石園區,2,20,0,0,0,0,臺南市,左鎮區,23.06,120.39,0,2024-03-06 21:06:15+08:00,2024-03-07 18:25:10+08:00


**conclusion:** Understanding the is_open code mapping would be useful to assess how to handle these records

In [56]:
dm(f'Inequality appears across all is_open, and mostly on 0 and 2. \n Proportion rows unequal vs total per is_open')
display(space_uneq["is_open"].value_counts() / main_df["is_open"].value_counts())

Inequality appears across all is_open, and mostly on 0 and 2. 
 Proportion rows unequal vs total per is_open

is_open
0   0.25
1   0.10
2   0.65
3    NaN
4   0.05
5   0.12
Name: count, dtype: float64

In [57]:
dm('#### Check if full = bike_yb2 + bike_eyb')
full_uneq = main_df[main_df["full"] != (main_df["bike_yb2"] + main_df["bike_eyb"])]
dm(f'**Rows where unequal:** {full_uneq.shape[0]}')
dm(f'**Excerpt output where unequal**')
display(full_uneq.head(5))
dm(f'**Conclusion**: Assumption is valid')

#### Check if full = bike_yb2 + bike_eyb

**Rows where unequal:** 0

**Excerpt output where unequal**

Unnamed: 0,id,name,type,space,full,empty,bike_yb2,bike_eyb,city,area,lat,lng,is_open,last_update_ts,extraction_ts


**Conclusion**: Assumption is valid

## Check range and distribution of values

### Categorical

In [58]:
main_df_cat = main_df.select_dtypes(include=["category", "object"]) 
main_df_cat["type"] = main_df["type"]
main_df_cat["is_open"] = main_df["is_open"]

In [59]:
for col in main_df_cat.columns:
    unique_values = main_df[col].unique()
    print(f"Unique values in '{col}': {unique_values}")

Unique values in 'name': ['勝利國中' '北興停車場' '兒10公園' ... '為恭醫院' '立功街55巷口' '關渡國中']
Unique values in 'city': ['新竹縣' '苗栗縣' '臺南市' '台中市' '台北市' '新北市' '桃園市' '高雄市' '新竹科學工業園區' '屏東縣' '嘉義市'
 '新竹市']
Unique values in 'area': ['竹北市' '後龍鎮' '左鎮區' '楠西區' '后里區' '松山區' '中山區' '文山區' '大安區' '信義區' '中正區' '南區'
 '安南區' '善化區' '通霄鎮' '汐止區' '安定區' '東區' '三重區' '仁德區' '北投區' '蘆竹區' '八德區' '茄萣區'
 '林園區' '三民區' '湖內區' '中和區' '板橋區' '歸仁區' '楠梓區' '烏日區' '永康區' '佳里區' '左營區' '旗山區'
 '新化區' '西港區' '鹽水區' '新營區' '鶯歌區' '土城區' '八里區' '永安區' '鼓山區' '中西區' '橋頭區' '台北維調區'
 '前鎮區' '大樹區' '岡山區' '苓雅區' '路竹區' '霧峰區' '士林區' '新竹科學園區' '潮州鎮' '竹田鄉' '車城鄉'
 '林邊鄉' '麟洛鄉' '維調區' '恆春鎮' '東港鎮' '屏東市' '關廟區' '學甲區' '新市區' '麻豆區' '後壁區' '柳營區'
 '東山區' '官田區' '安平區' '玉井區' '白河區' '北區' '六甲區' '北門區' '七股區' '下營區' '大內區' '山上區'
 '美濃區' '甲仙區' '維護調度中心' '燕巢區' '阿蓮區' '彌陀區' '大社區' '大寮區' '鳥松區' '仁武區' '梓官區'
 '小港區' '旗津區' '鳳山區' '前金區' '鹽埕區' '新興區' '市區' '臨時站' '維修調度中心' '測試站' '竹南鎮' '苑裡鎮'
 '新社區' '外埔區' '和平區' '石岡區' '龍井區' '潭子區' '大雅區' '大肚區' '神岡區' '梧棲區' '東勢區' '大甲區'
 '沙鹿區' '清水區' '太平區' '大里區' '豐原區' '北屯區' '南屯區' '西屯區' '西區' '新豐鄉' '中區' '湖

In [60]:
class TransliterationMapper(dict):
    def __missing__(self, key):
        return 'N/A'
#Map city to ascii chars

city_name_glossary = TransliterationMapper({"新北市": "XinBeiShi",
                      "台北市": "TaiBeiShi",
                      "台中市":"TaiZhongShi",
                      "高雄市":"GaoXiongShi",
                      "桃園市":"TaoYuanShi",
                      "臺南市": "TaiNanShi",
                      "嘉義市": "JiaYiShi",
                      "屏東縣": "PingDongXian",
                      "新竹市": "XinZhuShi",
                      "新竹縣": "XinZhuXian",
                      "苗栗縣" : "MiaoLiXian",
                      "新竹科學工業園區": "XinZhuKeXueGong"})

In [61]:
area_name_glossary = TransliterationMapper(
     {'竹北市': 'ZhuBeiShi',
    '後龍鎮': 'HouLongZhen',
    '左鎮區': 'ZuoZhenQu',
    '楠西區': 'NanXiQu',
    '后里區': 'HouLiQu',
    '松山區': 'SongShanQu',
    '中山區': 'ZhongShanQu',
    '文山區': 'WenShanQu',
    '大安區': 'DaAnQu',
    '信義區': 'XinYiQu',
    '中正區': 'ZhongZhengQu',
    '南區': 'NanQu',
    '安南區': 'AnNanQu',
    '善化區': 'ShanHuaQu',
    '通霄鎮': 'TongXiaoZhen',
    '汐止區': 'XiZhiQu',
    '安定區': 'AnDingQu',
    '東區': 'DongQu',
    '三重區': 'SanChongQu',
    '仁德區': 'RenDeQu',
    '北投區': 'BeiTouQu',
    '蘆竹區': 'LuZhuQu',
    '八德區': 'BaDeQu',
    '茄萣區': 'QieDingQu',
    '林園區': 'LinYuanQu',
    '三民區': 'SanMinQu',
    '湖內區': 'HuNeiQu',
    '中和區': 'ZhongHeQu',
    '板橋區': 'BanQiaoQu',
    '歸仁區': 'GuiRenQu',
    '楠梓區': 'NanZiQu',
    '烏日區': 'WuRiQu',
    '永康區': 'YongKangQu',
    '佳里區': 'JiaLiQu',
    '左營區': 'ZuoYingQu',
    '旗山區': 'QiShanQu',
    '新化區': 'XinHuaQu',
    '西港區': 'XiGangQu',
    '鹽水區': 'YanShuiQu',
    '新營區': 'XinYingQu',
    '鶯歌區': 'YingGeQu',
    '土城區': 'TuChengQu',
    '八里區': 'BaLiQu',
    '永安區': 'YongAnQu',
    '鼓山區': 'GuShanQu',
    '中西區': 'ZhongXiQu',
    '橋頭區': 'QiaoTouQu',
    '台北維調區': 'TaiBeiWeiDiaoQu',
    '前鎮區': 'QianZhenQu',
    '大樹區': 'DaShuQu',
    '岡山區': 'GangShanQu',
    '苓雅區': 'LingYaQu',
    '路竹區': 'LuZhuQu',
    '霧峰區': 'WuFengQu',
    '士林區': 'ShiLinQu',
    '新竹科學園區': 'XinZhuKeXueYuanQu',
    '潮州鎮': 'ChaoZhouZhen',
    '竹田鄉': 'ZhuTianXiang',
    '車城鄉': 'CheChengXiang',
    '林邊鄉': 'LinBianXiang',
    '麟洛鄉': 'LinLuoXiang',
    '維調區': 'WeiDiaoQu',
    '恆春鎮': 'HengChunZhen',
    '東港鎮': 'DongGangZhen',
    '屏東市': 'PingDongShi',
    '關廟區': 'GuanMiaoQu',
    '學甲區': 'XueJiaQu',
    '新市區': 'XinShiQu',
    '麻豆區': 'MaDouQu',
    '後壁區': 'HouBiQu',
    '柳營區': 'LiuYingQu',
    '東山區': 'DongShanQu',
    '官田區': 'GuanTianQu',
    '安平區': 'AnPingQu',
    '玉井區': 'YuJingQu',
    '白河區': 'BaiHeQu',
    '北區': 'BeiQu',
    '六甲區': 'LiuJiaQu',
    '北門區': 'BeiMenQu',
    '七股區': 'QiGuQu',
    '下營區': 'XiaYingQu',
    '大內區': 'DaNeiQu',
    '山上區': 'ShanShangQu',
    '美濃區': 'MeiNongQu',
    '甲仙區': 'JiaXianQu',
    '維護調度中心': 'WeiHuTiaoDuZhongXin',
    '燕巢區': 'YanChaoQu',
    '阿蓮區': 'ALianQu',
    '彌陀區': 'MiTuoQu',
    '大社區': 'DaSheQu',
    '大寮區': 'DaLiaoQu',
    '鳥松區': 'NiaoSongQu',
    '仁武區': 'RenWuQu',
    '梓官區': 'ZiGuanQu',
    '小港區': 'XiaoGangQu',
    '旗津區': 'QiJinQu',
    '鳳山區': 'FengShanQu',
    '前金區': 'QianJinQu',
      '鹽埕區': 'YanChengQu',
    '新興區': 'XinXingQu',
    '市區': 'ShiQu',
    '臨時站': 'LinShiZhan',
    '維修調度中心': 'WeiXiuTiaoDuZhongXin',
    '測試站': 'CeShiZhan',
    '竹南鎮': 'ZhuNanZhen',
    '苑裡鎮': 'YuanLiZhen',
    '新社區': 'XinSheQu',
    '外埔區': 'WaiPuQu',
    '和平區': 'HePingQu',
    '石岡區': 'ShiGangQu',
    '龍井區': 'LongJingQu',
    '潭子區': 'TanZiQu',
    '大雅區': 'DaYaQu',
    '大肚區': 'DaDuQu',
    '神岡區': 'ShenGangQu',
    '梧棲區': 'WuQiQu',
    '東勢區': 'DongShiQu',
    '大甲區': 'DaJiaQu',
    '沙鹿區': 'ShaLuQu',
    '清水區': 'QingShuiQu',
    '太平區': 'TaiPingQu',
    '大里區': 'DaLiQu',
    '豐原區': 'FengYuanQu',
    '北屯區': 'BeiTunQu',
    '南屯區': 'NanTunQu',
    '西屯區': 'XiTunQu',
    '西區': 'XiQu',
    '新豐鄉': 'XinFengXiang',
    '中區': 'ZhongQu',
    '湖口鄉': 'HuKouXiang',
    '竹東鎮': 'ZhuDongZhen',
    '香山區': 'XiangShanQu',
    '觀音區': 'GuanYinQu',
    '龜山區': 'GuiShanQu',
    '龍潭區': 'LongTanQu',
    '楊梅區': 'YangMeiQu',
    '桃園區': 'TaoYuanQu',
    '新屋區': 'XinWuQu',
    '平鎮區': 'PingZhenQu',
    '中壢區': 'ZhongLiQu',
    '大溪區': 'DaXiQu',
    '大園區': 'DaYuanQu',
    '猴雙公共自行車專區': 'HouShuangGongGongZiXingCheZhuanQu',
    '新北維調區': 'XinBeiWeiDiaoQu',
    '蘆洲區': 'LuZhouQu',
    '樹林區': 'ShuLinQu',
    '雙溪區': 'ShuangXiQu',
    '新莊區': 'XinZhuangQu',
    '新店區': 'XinDianQu',
    '萬里區': 'WanLiQu',
    '瑞芳區': 'RuiFangQu',
    '淡水區': 'DanShuiQu',
    '深坑區': 'ShenKengQu',
    '泰山區': 'TaiShanQu',
    '林口區': 'LinKouQu',
    '坪林區': 'PingLinQu',
    '金山區': 'JinShanQu',
    '永和區': 'YongHeQu',
    '石門區': 'ShiMenQu',
    '石碇區': 'ShiDingQu',
    '五股區': 'WuGuQu',
    '三峽區': 'SanXiaQu',
    '三芝區': 'SanZhiQu',
    '臺大公館校區': 'TaiDaGongGuanXiaoQu',
    '萬華區': 'WanHuaQu',
    '南港區': 'NanGangQu',
    '內湖區': 'NeiHuQu',
    '大同區': 'DaTongQu',
    '苗栗市': 'MiaoLiShi',
    '頭份市': 'TouFenShi'}
)

In [62]:
dm('**Outcome**: Transliterating Chinese characters for standardization')
for col in main_df[['city', 'area']].columns:
    print(f"Unique values in '{col}' {main_df[col].unique()}")
# main_df['city'] = main_df['city'].map(city_name_glossary)
# main_df['area'] = main_df['area'].map(area_name_glossary)

**Outcome**: Transliterating Chinese characters for standardization

Unique values in 'city' ['新竹縣' '苗栗縣' '臺南市' '台中市' '台北市' '新北市' '桃園市' '高雄市' '新竹科學工業園區' '屏東縣' '嘉義市'
 '新竹市']
Unique values in 'area' ['竹北市' '後龍鎮' '左鎮區' '楠西區' '后里區' '松山區' '中山區' '文山區' '大安區' '信義區' '中正區' '南區'
 '安南區' '善化區' '通霄鎮' '汐止區' '安定區' '東區' '三重區' '仁德區' '北投區' '蘆竹區' '八德區' '茄萣區'
 '林園區' '三民區' '湖內區' '中和區' '板橋區' '歸仁區' '楠梓區' '烏日區' '永康區' '佳里區' '左營區' '旗山區'
 '新化區' '西港區' '鹽水區' '新營區' '鶯歌區' '土城區' '八里區' '永安區' '鼓山區' '中西區' '橋頭區' '台北維調區'
 '前鎮區' '大樹區' '岡山區' '苓雅區' '路竹區' '霧峰區' '士林區' '新竹科學園區' '潮州鎮' '竹田鄉' '車城鄉'
 '林邊鄉' '麟洛鄉' '維調區' '恆春鎮' '東港鎮' '屏東市' '關廟區' '學甲區' '新市區' '麻豆區' '後壁區' '柳營區'
 '東山區' '官田區' '安平區' '玉井區' '白河區' '北區' '六甲區' '北門區' '七股區' '下營區' '大內區' '山上區'
 '美濃區' '甲仙區' '維護調度中心' '燕巢區' '阿蓮區' '彌陀區' '大社區' '大寮區' '鳥松區' '仁武區' '梓官區'
 '小港區' '旗津區' '鳳山區' '前金區' '鹽埕區' '新興區' '市區' '臨時站' '維修調度中心' '測試站' '竹南鎮' '苑裡鎮'
 '新社區' '外埔區' '和平區' '石岡區' '龍井區' '潭子區' '大雅區' '大肚區' '神岡區' '梧棲區' '東勢區' '大甲區'
 '沙鹿區' '清水區' '太平區' '大里區' '豐原區' '北屯區' '南屯區' '西屯區' '西區' '新豐鄉' '中區' '湖口鄉'
 '竹東鎮' '香山區' '觀音區' '龜山區' '龍潭區' '楊梅區' '桃園區' '新屋區' '平鎮區' '中壢區' '大溪區' '大園區'
 '猴

### Numerical

In [63]:
main_df_num = main_df.drop(labels=main_df_cat.columns, axis=1)
main_df_num['last_update_ts'] = main_df['last_update_ts'].copy(deep=True)
main_df_num['extraction_ts'] = main_df['extraction_ts'].copy(deep=True)

In [64]:
pd.set_option('display.float_format', '{:.2f}'.format)


In [65]:
main_df_num.describe(include='all')

Unnamed: 0,id,space,full,empty,bike_yb2,bike_eyb,lat,lng,last_update_ts,extraction_ts
count,10928723.0,10928723.0,10928723.0,10928723.0,10928723.0,10928723.0,10928723.0,10928723.0,10928723,10928723
mean,500638998.41,20.95,7.25,13.25,6.91,0.34,24.11,120.56,2024-03-10 21:23:24.960862208+08:00,2024-03-12 09:04:26.412556032+08:00
min,500101001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-14 01:18:15+08:00,2024-03-01 04:18:31+08:00
25%,500207013.0,14.0,3.0,7.0,2.0,0.0,23.01,120.38,2024-03-08 12:32:19+08:00,2024-03-10 14:56:07+08:00
50%,500602030.0,17.0,6.0,11.0,5.0,0.0,24.44,120.83,2024-03-13 02:03:14+08:00,2024-03-13 11:14:44+08:00
75%,501203106.0,24.0,10.0,16.0,9.0,0.0,25.03,121.49,2024-03-15 23:53:18+08:00,2024-03-16 05:20:52+08:00
max,508201041.0,99.0,99.0,99.0,99.0,62.0,25.29,122.0,2024-03-18 23:49:18+08:00,2024-03-18 23:50:54+08:00
std,741534.43,11.97,7.05,10.38,6.96,1.23,1.65,6.83,,


**Observations about range and distribution**
- Empty has negative values
- Space min value is zero
- Some long/lat coordinates are outside of Taiwan
- Some ts shows stale data (many days older than latest ts)
- Rest is according to expectations

In [66]:
dm("""
**Dropping:**
- Stations with negative empty values. (= Available spaces cannot be negative.)
- Stations without space are irrelevant
- Stations located outside of Taiwan (these are test locations. Confirmed by not being displayed on YouBikes official map)
- Stations with last_update_ts older than the first day of this historical dataset. (Stale data is useless)
""")
dm(f"Row counts before: {main_df.shape[0]}")
discard_mask = (
    (main_df["empty"] < 0)
    | (main_df["space"] < 1) # Stations without space are irrelevant
    | (main_df["lat"] < 21.89) #southermost lat of Taiwan's main island
    | (main_df["lng"] < 120) #westernmost lng of Taiwan's main island
    | (main_df["last_update_ts"] < pd.to_datetime("2024-02-16").tz_localize(tz='Asia/Taipei'))
)
main_df.drop(main_df[discard_mask == True].index, inplace=True)

dm(f"Row counts after: {main_df.shape[0]}")
#TODO: bound to the easter and northernmost points too? 


**Dropping:**
- Stations with negative empty values. (= Available spaces cannot be negative.)
- Stations without space are irrelevant
- Stations located outside of Taiwan (these are test locations. Confirmed by not being displayed on YouBikes official map)
- Stations with last_update_ts older than the first day of this historical dataset. (Stale data is useless)


Row counts before: 10928723

Row counts after: 10633396

## Check uniqueness of observations

In [67]:
dm("#### Unique key is (extraction_ts, id)") 

main_df['identical_key_cnt'] = main_df.groupby(['extraction_ts', 'id']).transform('size')
display(main_df['identical_key_cnt'].value_counts())
dm("key_cnt = 1 means no duplicates")

#### Unique key is (extraction_ts, id)

identical_key_cnt
1    10633396
Name: count, dtype: int64

key_cnt = 1 means no duplicates

In [68]:
dm("### Does each station have a unique record per extraction? (uniqueness on position (= Lat / Lng)")

main_df["pos"] = list(zip(main_df['lat'], main_df['lng']))
main_df["duplic_pos"] = main_df.groupby(['pos', 'extraction_ts']).transform('size')

display(main_df[main_df["duplic_pos"] > 1].head(4))
dm("**Observation:** Some Lat / Lng are recorded twice within the same extraction")
display(main_df["duplic_pos"].value_counts())
dm("<br>Filtering for those positions which are recorded twice returns ")
display(main_df[main_df["duplic_pos"] > 1]["pos"].value_counts())
dm("**Conclusion**: For the moment, any duplicated station (identified by using pos) per extraction will be dismissed")

### Does each station have a unique record per extraction? (uniqueness on position (= Lat / Lng)

Unnamed: 0,id,name,type,space,full,empty,bike_yb2,bike_eyb,city,area,lat,lng,is_open,last_update_ts,extraction_ts,identical_key_cnt,pos,duplic_pos
5504,500199002,蘆洲維修中心,2,17,0,17,0,0,台北市,台北維調區,25.0,121.54,5,2024-03-07 18:19:20+08:00,2024-03-07 18:25:10+08:00,1,"(24.99609, 121.54284)",2
6446,500105068,景興國中,2,10,1,9,1,0,台北市,文山區,25.0,121.54,1,2024-03-07 18:23:14+08:00,2024-03-07 18:25:10+08:00,1,"(24.99609, 121.54284)",2
13387,500199002,蘆洲維修中心,2,17,0,17,0,0,台北市,台北維調區,25.0,121.54,5,2024-03-01 11:33:20+08:00,2024-03-01 11:57:38+08:00,1,"(24.99609, 121.54284)",2
14329,500105068,景興國中,2,10,0,10,0,0,台北市,文山區,25.0,121.54,1,2024-03-01 11:48:14+08:00,2024-03-01 11:57:38+08:00,1,"(24.99609, 121.54284)",2


**Observation:** Some Lat / Lng are recorded twice within the same extraction

duplic_pos
1    10628352
2        5044
Name: count, dtype: int64

<br>Filtering for those positions which are recorded twice returns 

pos
(24.99609, 121.54284)    3192
(25.07965, 121.54148)     984
(25.03062, 121.49021)     868
Name: count, dtype: int64

**Conclusion**: For the moment, any duplicated station (identified by using pos) per extraction will be dismissed

In [69]:
dm("Dropping stations with duplicated pos, extraction_ts")
dm(f"Row count before: {main_df.shape[0]}")

main_df.drop(main_df[main_df["duplic_pos"] > 1].index, inplace=True)

dm(f"Row count after: {main_df.shape[0]}")

Dropping stations with duplicated pos, extraction_ts

Row count before: 10633396

Row count after: 10628352

In [70]:
#Last visual check before dl
#Checkpoint
base_df = (
    main_df[['id', 'name', 'lat', 'lng', 'space', 'full', 'empty', 'bike_yb2', 'bike_eyb', 
             'city', 'area', 'last_update_ts','extraction_ts']]
    .sort_values(by=['id', 'extraction_ts'])
    .reset_index(drop=True)
    .copy(deep=True)
          )


In [71]:
dm("Last visual check before saving")
base_df.head(5)

Last visual check before saving

Unnamed: 0,id,name,lat,lng,space,full,empty,bike_yb2,bike_eyb,city,area,last_update_ts,extraction_ts
0,500101001,捷運科技大樓站,25.03,121.54,28,1,27,1,0,台北市,大安區,2024-03-01 03:35:18+08:00,2024-03-01 04:18:31+08:00
1,500101001,捷運科技大樓站,25.03,121.54,28,3,25,3,0,台北市,大安區,2024-03-01 10:57:52+08:00,2024-03-01 11:00:48+08:00
2,500101001,捷運科技大樓站,25.03,121.54,28,4,24,4,0,台北市,大安區,2024-03-01 11:03:19+08:00,2024-03-01 11:05:52+08:00
3,500101001,捷運科技大樓站,25.03,121.54,28,2,26,2,0,台北市,大安區,2024-03-01 11:09:19+08:00,2024-03-01 11:10:57+08:00
4,500101001,捷運科技大樓站,25.03,121.54,28,4,24,4,0,台北市,大安區,2024-03-01 11:14:19+08:00,2024-03-01 11:16:02+08:00


In [None]:
base_df.to_parquet("../tmp_data/clean_data/youbike_dock_info_history_2024-03-01_2024-03-18.parquet", index=False)