# Save the raw files as parquet to save space

We just move the data to parquet 

In [20]:
!hdfs dfs -du -s -h /taxi/raw/

237.2 G  237.2 G  /taxi/raw


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Convert To Parquet") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-04-26 09:33:03,030 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Prepare HDSF 

In [22]:
!hdfs dfs -ls /taxi/raw_parquet/

ls: `/taxi/raw_parquet/': No such file or directory


In [23]:
!hdfs dfs -mkdir /taxi/raw_parquet/

In [24]:
!hdfs dfs -ls /taxi/raw_parquet/

### For every year and month load the csv and save it as parquet

In [41]:
def read_and_write(year, month):
    spark.read.csv(f"/taxi/raw/{year}/yellow_tripdata_{year}-{month}.csv", header=True)\
    .withColumnRenamed(" pickup_datetime", "pickup_datetime")\
    .withColumnRenamed(" dropoff_datetime", "dropoff_datetime")\
    .withColumnRenamed(" passenger_count", "passenger_count")\
    .withColumnRenamed(" trip_distance", "trip_distance")\
    .withColumnRenamed(" pickup_longitude", "pickup_longitude")\
    .withColumnRenamed(" pickup_latitude", "pickup_latitude")\
    .withColumnRenamed(" rate_code", "rate_code")\
    .withColumnRenamed(" store_and_fwd_flag", "store_and_fwd_flag")\
    .withColumnRenamed(" dropoff_longitude", "dropoff_longitude")\
    .withColumnRenamed(" dropoff_latitude", "dropoff_latitude")\
    .withColumnRenamed(" payment_type", "payment_type")\
    .withColumnRenamed(" fare_amount", "fare_amount")\
    .withColumnRenamed(" surcharge", "surcharge")\
    .withColumnRenamed(" mta_tax", "mta_tax")\
    .withColumnRenamed(" tip_amount", "tip_amount")\
    .withColumnRenamed(" tolls_amount", "tolls_amount")\
    .withColumnRenamed(" total_amount", "total_amount")\
    .repartition(55).write.parquet(f"/taxi/raw_parquet/{year}/{month}.parquet")

### First attempt until 2013

In [None]:
for year in ["2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022"]:
    for month in ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]:
        !echo processing {year}/{month}
        read_and_write(year, month)
        ! hdfs dfs -rm -r /taxi/raw/{year}/yellow_tripdata_{year}-{month}.csv

### Second Try
Got the error:
- `Column name " pickup_datetime" contains invalid character(s). Please use alias to rename it`
- `Column name " dropoff_datetime" contains invalid character(s). Please use alias to rename it`
- `Column name " passenger_count" contains invalid character(s). Please use alias to rename it`
- `Column name "  trip_distance" contains invalid character(s). Please use alias to rename it`
- `Column name "  pickup_longitude" contains invalid character(s). Please use alias to rename it`
- `Column name "  pickup_latitude" contains invalid character(s). Please use alias to rename it`
- `Column name "  rate_code" contains invalid character(s). Please use alias to rename it`
- `Column name "  store_and_fwd_flag" contains invalid character(s). Please use alias to rename it`

etc.

In [None]:
for year in ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022"]:
    for month in ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]:
        !echo processing {year}/{month}
        read_and_write(year, month)
        ! hdfs dfs -rm -r /taxi/raw/{year}/yellow_tripdata_{year}-{month}.csv

### Stopping Spark 

In [43]:
spark.stop()