In [53]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import OneHotEncoder

In [5]:
spark = (
    SparkSession
    .builder
    .appName('Yello Taxi')
    .master('local[*]')
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

24/05/31 18:40:02 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.Con

In [6]:
spark

## Q1. Read the data for January. How many columns are there?

In [9]:
df = spark.read.parquet('data/yellow_tripdata_2023-01.parquet')

                                                                                

In [12]:
len(df.columns)

19

## Answer: 19

## Q2. What's the standard deviation of the trips duration in January?

In [35]:
df.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee']

In [16]:
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [23]:
df.select('tpep_pickup_datetime', 'tpep_dropoff_datetime').show()

+--------------------+---------------------+
|tpep_pickup_datetime|tpep_dropoff_datetime|
+--------------------+---------------------+
| 2023-01-01 00:32:10|  2023-01-01 00:40:36|
| 2023-01-01 00:55:08|  2023-01-01 01:01:27|
| 2023-01-01 00:25:04|  2023-01-01 00:37:49|
| 2023-01-01 00:03:48|  2023-01-01 00:13:25|
| 2023-01-01 00:10:29|  2023-01-01 00:21:19|
| 2023-01-01 00:50:34|  2023-01-01 01:02:52|
| 2023-01-01 00:09:22|  2023-01-01 00:19:49|
| 2023-01-01 00:27:12|  2023-01-01 00:49:56|
| 2023-01-01 00:21:44|  2023-01-01 00:36:40|
| 2023-01-01 00:39:42|  2023-01-01 00:50:36|
| 2023-01-01 00:53:01|  2023-01-01 01:01:45|
| 2023-01-01 00:43:37|  2023-01-01 01:17:18|
| 2023-01-01 00:34:44|  2023-01-01 01:04:25|
| 2023-01-01 00:09:29|  2023-01-01 00:29:23|
| 2023-01-01 00:33:53|  2023-01-01 00:49:15|
| 2023-01-01 00:13:04|  2023-01-01 00:22:10|
| 2023-01-01 00:45:11|  2023-01-01 01:07:39|
| 2023-01-01 00:04:33|  2023-01-01 00:19:22|
| 2023-01-01 00:03:36|  2023-01-01 00:09:36|
| 2023-01-

In [37]:
df_with_duration = df.select(
    *df.columns,
    ((
        F.col('tpep_dropoff_datetime') - F.col('tpep_pickup_datetime')
    ).cast('long') / 60).alias('duration')
)

In [39]:
df_with_duration.select(F.stddev('duration')).show()

+------------------+
|  stddev(duration)|
+------------------+
|42.594351241955756|
+------------------+



                                                                                

## Answer: 42.59

## Q3. Dropping outliers (keep only the records where the duration was between 1 and 60 minutes (inclusive))

In [41]:
current_rown = df_with_duration.count()
current_rown

3066766

In [43]:
df_no_outliers = df_with_duration.filter((F.col('duration') > 1) & (F.col('duration') <= 60))

## What fraction of the records left after you dropped the outliers?

In [47]:
df_no_outliers.count() / current_rown * 100

                                                                                

98.11286547457485

## ANSWER: 98%

## Q4. One-hot encoding

In [51]:
df_no_outliers.select('PULocationID', 'DOLocationID')

+------------+------------+
|PULocationID|DOLocationID|
+------------+------------+
|         161|         141|
|          43|         237|
|          48|         238|
|         138|           7|
|         107|          79|
|         161|         137|
|         239|         143|
|         142|         200|
|         164|         236|
|         141|         107|
|         234|          68|
|          79|         264|
|         164|         143|
|         138|          33|
|          33|          61|
|          79|         186|
|          90|          48|
|         113|         255|
|         237|         239|
|         143|         229|
+------------+------------+
only showing top 20 rows



                                                                                

In [None]:
ohe = 