In [1]:
import pandas as pd
import os
import numpy as np
import geopandas as gdp
import folium

In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, to_timestamp,hour,\
    dayofmonth,date_format,desc

def landing_to_raw(df, cols, time_col = None, boolean_weather_dataset = None):
    """
    Project a landing-layer dataframe onto the given columns and apply the
    raw-layer typing rules.

    df: Spark dataframe read from the landing layer
    cols: names of the columns to retain
    time_col: name of the time column, in any casing (it is lower-cased here
        to match the lower-cased output columns); when None the string -->
        timestamp conversion is skipped
    boolean_weather_dataset: truthy --> weather dataset, falsy --> mta dataset

    Returns the selected dataframe with lower-cased column names, the time
    column parsed to a timestamp and, for the mta dataset, `ridership` cast
    to integer.
    """
    # select columns
    intermediate = df.select(*cols)

    # consistent column casing: every retained column is renamed to lowercase
    consistent_col_casing = \
        [F.col(col_name).alias(col_name.lower()) for col_name in intermediate.columns]
    intermediate = intermediate.select(*consistent_col_casing)

    # pick the timestamp pattern used by the source dataset
    if boolean_weather_dataset:
        # Y-M-D, H:m:s, 24 hour format --> weather dataset
        timestamp_format = "yyyy-MM-dd'T'HH:mm:ss"
    else:
        # M-D-Y, h:m:s a: 12 hour format --> mta dataset
        timestamp_format = "MM/dd/yyyy hh:mm:ss a"

    # convert date data type: string --> timestamp
    if time_col is not None:
        # the columns were lower-cased above, so the time column has to be
        # addressed (and written back) under its lower-cased name as well
        time_col = time_col.lower()
        intermediate = intermediate.withColumn(
            time_col, to_timestamp(col(time_col), timestamp_format)
        )

    if not boolean_weather_dataset:
        # Parse the "ridership" column from string to integer
        # NOTE(review): a null ridership shows up after this cast in the mta
        # output -- presumably a source value that is not a plain integer
        # string; confirm before relying on this column downstream
        intermediate = \
            intermediate.withColumn("ridership", col("ridership").cast("int"))
    return intermediate


def statistics(df):
    """
    Report the shape of a dataframe: row count, column count and the
    column names are printed to stdout. Nothing is returned.
    """
    # df.count() plays the role of shape[0] in Pandas
    row_total = df.count()

    # df.columns is the list of column names, as in Pandas
    header = df.columns

    print(
        f"Number of rows: {row_total}",
        f"Number of columns: {len(header)}",
        sep="\n",
    )
    print("Column names:", header)


def save_to_parquet(df, file_path):
    """
    Write the dataframe to file_path in parquet format, overwriting any
    existing output. The dataframe is coalesced to a single partition
    before it is written.
    """
    single_partition = df.coalesce(1)
    writer = single_partition.write.mode('overwrite')
    writer.parquet(file_path)

## Weather dataset
Obtained from Integrated Weather Surface (pulled on 08/08/2023)
Note that the shape may differ when this notebook is rerun, because the source links are still updated daily.

In [3]:
# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession
    .builder
    .appName("MAST30034 mta weather")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

# Load the landing-layer weather CSVs (first row is the header)
weather_2022 = spark.read.csv(
    "../data/landing/weather_link_2022.csv",
    header=True,
)
weather_2023 = spark.read.csv(
    "../data/landing/weather_link_2023.csv",
    header=True,
)

# (rows, columns) of each landing file
print(weather_2022.count(), len(weather_2022.columns))
print(weather_2023.count(), len(weather_2023.columns))

# Columns kept for the raw layer.
# DEW (dew temperature) is dropped on purpose: the author argues it is not
# commonly reported in everyday weather forecasts (television, smartphones),
# and TMP already provides the normal temperature measurement. Keeping both
# would put two highly positively correlated features in the dataset, whereas
# the assumption is that the retained columns are independent of each other
# and correlated with the response variable.
columns_to_retain = ['DATE', 'WND', 'TMP']

# Landing --> raw for both weather years, restricted to the retained columns;
# the time column is 'date' and the weather timestamp format is used
weather_2022 = landing_to_raw(
    weather_2022,
    columns_to_retain,
    time_col='date',
    boolean_weather_dataset=True,
)
weather_2023 = landing_to_raw(
    weather_2023,
    columns_to_retain,
    time_col='date',
    boolean_weather_dataset=True,
)

weather_2022

your 131072x1 screen size is bogus. expect trouble
23/08/21 03:25:20 WARN Utils: Your hostname, DESKTOP-LHMPQFC resolves to a loopback address: 127.0.1.1; using 172.19.194.216 instead (on interface eth0)
23/08/21 03:25:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/21 03:25:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


13344 100
8475 95


date,wnd,tmp
2022-01-01 00:00:00,"200,1,N,0031,1",941
2022-01-01 00:51:00,"200,5,N,0031,5",945
2022-01-01 01:51:00,"200,5,N,0031,5",895
2022-01-01 02:41:00,"200,5,N,0036,5",945
2022-01-01 02:51:00,"220,5,N,0026,5",945
2022-01-01 03:00:00,"220,1,N,0026,1",941
2022-01-01 03:16:00,"200,5,N,0015,5",895
2022-01-01 03:51:00,"190,5,N,0026,5",895
2022-01-01 04:17:00,"180,5,N,0026,5",945
2022-01-01 04:51:00,"170,5,N,0036,5",945


In [4]:
# The schema is handled later, as it requires in-depth analysis on splitting.
# Example: WND: 200,1,N,0031,1, the information that we want to extract is 200
# therefore, still retain the original schema, and deal with this later
# in the curated layer to design our own features.
#
# printSchema() and statistics() print their own output and return None, so
# they are called directly; wrapping them in print() only adds a stray "None".
weather_2022.printSchema()
statistics(weather_2022)
statistics(weather_2023)

# persist both years to the raw layer
save_to_parquet(weather_2022, file_path="../data/raw/weather_2022")
save_to_parquet(weather_2023, file_path="../data/raw/weather_2023")


root
 |-- date: timestamp (nullable = true)
 |-- wnd: string (nullable = true)
 |-- tmp: string (nullable = true)

None
Number of rows: 13344
Number of columns: 3
Column names: ['date', 'wnd', 'tmp']
None
Number of rows: 8475
Number of columns: 3
Column names: ['date', 'wnd', 'tmp']
None


                                                                                

In [5]:
# quality check: read the saved raw-layer parquet back and inspect some rows.
# `header` is a CSV reader option and has no meaning for parquet (the schema
# is stored in the files themselves), so it is not passed here.
sample_2022 = spark.read.parquet("../data/raw/weather_2022")
sample_2022.show(20, truncate = False)

+-------------------+--------------+-------+
|date               |wnd           |tmp    |
+-------------------+--------------+-------+
|2022-01-01 00:00:00|200,1,N,0031,1|+0094,1|
|2022-01-01 00:51:00|200,5,N,0031,5|+0094,5|
|2022-01-01 01:51:00|200,5,N,0031,5|+0089,5|
|2022-01-01 02:41:00|200,5,N,0036,5|+0094,5|
|2022-01-01 02:51:00|220,5,N,0026,5|+0094,5|
|2022-01-01 03:00:00|220,1,N,0026,1|+0094,1|
|2022-01-01 03:16:00|200,5,N,0015,5|+0089,5|
|2022-01-01 03:51:00|190,5,N,0026,5|+0089,5|
|2022-01-01 04:17:00|180,5,N,0026,5|+0094,5|
|2022-01-01 04:51:00|170,5,N,0036,5|+0094,5|
|2022-01-01 04:59:00|999,9,9,9999,9|+9999,9|
|2022-01-01 04:59:00|999,9,9,9999,9|+9999,9|
|2022-01-01 05:49:00|180,5,N,0041,5|+0090,5|
|2022-01-01 05:51:00|180,5,N,0036,5|+0094,5|
|2022-01-01 06:00:00|180,1,N,0036,1|+0094,1|
|2022-01-01 06:18:00|180,5,N,0026,5|+0089,5|
|2022-01-01 06:51:00|160,5,N,0021,5|+0094,5|
|2022-01-01 07:09:00|130,5,N,0015,5|+0094,5|
|2022-01-01 07:51:00|130,5,N,0026,5|+0094,5|
|2022-01-0

## MTA Subway

In [6]:
# The mta data goes through the same landing --> raw process
mta = spark.read.csv(
    "../data/landing/mta_2023.csv",
    header = True,
)
print(mta.count(), len(mta.columns))

# Columns kept for the raw layer: the timestamp, borough, ridership and
# georeference columns.
# `transfers` is left out because, per the author, it is already included in
# ridership (transfers: those entering the subway either bus-to-subway or
# out-of-network).
columns_to_retain = [
    'transit_timestamp',
    'borough',
    'ridership',
    'Georeference',
]

# Landing --> raw: parse the timestamp strings (mta format) and cast ridership
mta = landing_to_raw(
    mta,
    columns_to_retain,
    time_col='transit_timestamp',
    boolean_weather_dataset=False,
)

# Display the converted dataframe without truncating cell contents
mta.show(truncate=False)

                                                                                

5553521 11
+-------------------+-------+---------+----------------------------+
|transit_timestamp  |borough|ridership|georeference                |
+-------------------+-------+---------+----------------------------+
|2023-08-12 11:00:00|M      |283      |POINT (-73.937965 40.851696)|
|2023-08-12 13:00:00|BK     |148      |POINT (-73.92261 40.66472)  |
|2022-10-08 21:00:00|M      |117      |POINT (-73.94748 40.7906)   |
|2022-05-20 10:00:00|M      |356      |POINT (-73.94748 40.7906)   |
|2022-03-14 08:00:00|M      |845      |POINT (-73.968376 40.799446)|
|2022-05-03 22:00:00|M      |470      |POINT (-73.98163 40.730953) |
|2023-01-28 18:00:00|M      |null     |POINT (-73.98163 40.730953) |
|2023-07-07 08:00:00|M      |730      |POINT (-73.968376 40.799446)|
|2022-02-28 20:00:00|M      |482      |POINT (-73.98163 40.730953) |
|2022-03-03 00:00:00|M      |43       |POINT (-73.968376 40.799446)|
|2022-09-08 13:00:00|M      |396      |POINT (-73.94748 40.7906)   |
|2022-12-18 02:00:00|M 

In [7]:
# Print the schema of the converted mta dataframe to check the column types
# produced by landing_to_raw
mta.printSchema()

root
 |-- transit_timestamp: timestamp (nullable = true)
 |-- borough: string (nullable = true)
 |-- ridership: integer (nullable = true)
 |-- georeference: string (nullable = true)



In [8]:
# Persist the mta dataframe to the raw layer as parquet
# (save_to_parquet overwrites any existing output at this path)
save_to_parquet(mta, file_path="../data/raw/mta")

                                                                                

In [9]:
# Stop the Spark session now that the raw-layer outputs have been written
spark.stop()