# Data Preprocessing

In [6]:
import pandas as pd
import json
import numpy as np

In [7]:
def handle_preprocess(filename, isWeekday):
    # Parse the JSON data into a Python dictionary
    with open(f'./data/{filename}', 'r', encoding='utf-8') as file:
        data = json.load(file)["features"]

    df = pd.json_normalize(data)

    # Drop unnecessary columns
    df = df.drop(columns=["type"])
    df = df.drop(columns=["geometry.type"])

    # Rename columns
    df.rename(
        columns={
            "properties.on_stop_id": "on_stop_id",
            "properties.off_stop_id": "off_stop_id",
            "properties.on_stop": "on_stop",
            "properties.off_stop": "off_stop",
            "properties.sum_of_txn_times": "sum_of_txn_times",
            "properties.district_origin": "district_origin",
            "properties.district_destination": "district_destination",
            "properties.width": "width",
            "geometry.coordinates": "coordinates",
            },
        inplace=True,
    )

    # Function to calculate the Haversine distance
    def haversine(coord1, coord2):
        # Radius of Earth in kilometers
        R = 6371.0
        
        lat1, lon1 = np.radians(coord1)
        lat2, lon2 = np.radians(coord2)
        
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        
        a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        
        distance = R * c  # Distance in kilometers
        return distance

    # Calculate the Haversine distance
    df["distance"] = df["coordinates"].apply(lambda x: haversine(x[0], x[1]))

    # Split the coordinates into two columns
    df["on_stop_coordinates"] = df["coordinates"].apply(lambda x: x[0])
    df["off_stop_coordinates"] = df["coordinates"].apply(lambda x: x[1])

    # Save the data to a CSV file
    if isWeekday:
        df["txn_times_per_day"] = df["sum_of_txn_times"] / 5
        df.to_csv('./data/weekday_data.csv', index=False)
    else:
        df["txn_times_per_day"] = df["sum_of_txn_times"] / 2
        df.to_csv('./data/weekend_data.csv', index=False)
    return df

In [8]:
df_weekend = handle_preprocess("週末起訖站點統計_202307.geojson", False)
df_weekend.head()

Unnamed: 0,on_stop_id,off_stop_id,on_stop,off_stop,sum_of_txn_times,district_origin,district_destination,width,coordinates,distance,on_stop_coordinates,off_stop_coordinates,txn_times_per_day
0,U101001,U101001,捷運科技大樓站,捷運科技大樓站,139,大安區,大安區,2.415385,"[[121.5436, 25.02605], [121.5436, 25.02605]]",0.0,"[121.5436, 25.02605]","[121.5436, 25.02605]",69.5
1,U101001,U101002,捷運科技大樓站,復興南路二段273號前,47,大安區,大安區,1.471795,"[[121.5436, 25.02605], [121.54357, 25.02565]]",0.023506,"[121.5436, 25.02605]","[121.54357, 25.02565]",23.5
2,U101001,U101003,捷運科技大樓站,國北教大實小東側門,24,大安區,大安區,1.235897,"[[121.5436, 25.02605], [121.54124, 25.02429]]",0.281683,"[121.5436, 25.02605]","[121.54124, 25.02429]",12.0
3,U101001,U101004,捷運科技大樓站,和平公園東側,121,大安區,大安區,2.230769,"[[121.5436, 25.02605], [121.54282, 25.02351]]",0.171329,"[121.5436, 25.02605]","[121.54282, 25.02351]",60.5
4,U101001,U101005,捷運科技大樓站,辛亥復興路口西北側,185,大安區,大安區,2.887179,"[[121.5436, 25.02605], [121.54299, 25.02153]]",0.27154,"[121.5436, 25.02605]","[121.54299, 25.02153]",92.5


In [9]:
df_weekday = handle_preprocess("週間起訖站點統計_202307.geojson", True)
df_weekday.head()

Unnamed: 0,on_stop_id,off_stop_id,on_stop,off_stop,sum_of_txn_times,district_origin,district_destination,width,coordinates,distance,on_stop_coordinates,off_stop_coordinates,txn_times_per_day
0,U101001,U101001,捷運科技大樓站,捷運科技大樓站,332,大安區,大安區,2.392846,"[[121.5436, 25.02605], [121.5436, 25.02605]]",0.0,"[121.5436, 25.02605]","[121.5436, 25.02605]",66.4
1,U101001,U101002,捷運科技大樓站,復興南路二段273號前,109,大安區,大安區,1.454463,"[[121.5436, 25.02605], [121.54357, 25.02565]]",0.023506,"[121.5436, 25.02605]","[121.54357, 25.02565]",21.8
2,U101001,U101003,捷運科技大樓站,國北教大實小東側門,98,大安區,大安區,1.408176,"[[121.5436, 25.02605], [121.54124, 25.02429]]",0.281683,"[121.5436, 25.02605]","[121.54124, 25.02429]",19.6
3,U101001,U101004,捷運科技大樓站,和平公園東側,345,大安區,大安區,2.44755,"[[121.5436, 25.02605], [121.54282, 25.02351]]",0.171329,"[121.5436, 25.02605]","[121.54282, 25.02351]",69.0
4,U101001,U101005,捷運科技大樓站,辛亥復興路口西北側,1028,大安區,大安區,5.321611,"[[121.5436, 25.02605], [121.54299, 25.02153]]",0.27154,"[121.5436, 25.02605]","[121.54299, 25.02153]",205.6
