# Raw preprocessing of PTV data
This notebook preprocesses the landing PTV data: it reads the `stops.txt` file for each service type and writes it out as a shapefile in the `../../data/raw/ptv` directory.

In [1]:
# Import dependencies
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os

In [2]:
# Input (landing) and output (raw) directories for the PTV data
LANDING_DATA_DIR_PTV = "../../data/landing/ptv"
RAW_DATA_DIR_PTV = "../../data/raw/ptv"

# Name of the file read from every sub-folder listed below
filename = "stops.txt"

# Sub-folders of the landing directory, each holding its own stops file
folders = [
    "reg_trains",
    "metro_trains",
    "metro_trams",
    "metro_buses",
    "reg_coaches",
    "reg_buses",
]

In [3]:
# Read the stops file (e.g. bus stops) found in every landing sub-folder.
# Each table is bound to a notebook-level variable called "<folder>_stops",
# and those variable names are collected in `ptv_vars` so that later cells
# can look the tables up again through `globals()`.
ptv_vars = []

for folder in folders:
    stops_path = f"{LANDING_DATA_DIR_PTV}/{folder}/{filename}"
    var_name = f"{folder}_stops"

    ptv_vars.append(var_name)
    # header=0: the first line of the file supplies the column names
    globals()[var_name] = pd.read_csv(stops_path, header=0)

## Transform text files to shapefiles

In [4]:
# Create the `ptv` output folder, including any missing parent folders.
# `exist_ok=True` makes this a no-op when the folder is already there, so no
# separate existence check is needed and the cell can be re-run safely.
os.makedirs(RAW_DATA_DIR_PTV, exist_ok=True)

In [5]:
# Convert every loaded stops table into a point layer and save it as a
# shapefile named "<variable name>.shp" in the raw data directory.
#
# NOTE(review): no CRS is attached to the GeoDataFrame, so the shapefiles are
# written without projection metadata. These look like GTFS stops files, and
# GTFS defines stop_lat / stop_lon as WGS84, so passing crs="EPSG:4326" is
# probably correct — confirm how the notebooks reading these files set their
# CRS before adding it here.
for var_name in ptv_vars:
    df = globals()[var_name]

    # Build all point geometries in one vectorised call (longitude = x,
    # latitude = y) instead of creating one Point per row in a Python loop.
    # This also drops the assumption that the index runs 0..n-1.
    # As before, the column is added to the loaded table itself.
    df["location"] = gpd.points_from_xy(df["stop_lon"], df["stop_lat"])

    # Use `location` as the geometry column and write the layer to disk
    stops_gdf = gpd.GeoDataFrame(df, geometry="location")
    stops_gdf.to_file(f"{RAW_DATA_DIR_PTV}/{var_name}.shp", driver="ESRI Shapefile")