## Azure Databricks: Extract the data using Spark

By Selman Karaosmanoglu

### Read Payments

In [0]:
# The file has no header row, so column names and types are declared here.
# Supplying the schema up front avoids the additional pass over the file that
# inferSchema=True needs, and keeps the column types stable if the data changes.
# The types are the ones printSchema() reports for this file.
PAYMENT_SCHEMA = "payment_id INT, `date` DATE, amount DOUBLE, rider_id INT"

df_payment = spark.read.csv(
    "dbfs:/FileStore/data/payments.csv",
    header=False,
    schema=PAYMENT_SCHEMA,
)

In [0]:
# Print the column names, Spark data types and nullability of the payment DataFrame.
df_payment.printSchema()

root
 |-- payment_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- amount: double (nullable = true)
 |-- rider_id: integer (nullable = true)



In [0]:
# Summary statistics (count, mean, stddev, min, max) for the payment data.
# describe() only covers numeric and string columns, which is why `date`
# does not appear in the output.
df_payment.describe().show()

+-------+-----------------+------------------+-----------------+
|summary|       payment_id|            amount|         rider_id|
+-------+-----------------+------------------+-----------------+
|  count|          1946607|           1946607|          1946607|
|   mean|         973304.0|  9.99539467904919|38575.52808296693|
| stddev|561937.1820657537|3.4687957778018923|21674.45772403217|
|    min|                1|               3.0|             1000|
|    max|          1946607|              25.0|            75999|
+-------+-----------------+------------------+-----------------+



In [0]:
# Pandas-style overview via the pandas API on Spark: row count plus
# per-column non-null counts and dtypes.
df_payment.pandas_api().info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 1946607 entries, 0 to 1946606
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   payment_id  1946607 non-null  int32  
 1   date        1946607 non-null  object 
 2   amount      1946607 non-null  float64
 3   rider_id    1946607 non-null  int32  
dtypes: float64(1), int32(2), object(1)

In [0]:
# Preview the first rows of the payment data.
df_payment.pandas_api().head()

Unnamed: 0,payment_id,date,amount,rider_id
0,1,2019-05-01,9.0,1000
1,2,2019-06-01,9.0,1000
2,3,2019-07-01,9.0,1000
3,4,2019-08-01,9.0,1000
4,5,2019-09-01,9.0,1000


### Read Riders

In [0]:
# Load the headerless riders CSV with inferred column types, then name the columns.
df_rider = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/data/riders.csv")
    .toDF(
        "rider_id",
        "first_name",
        "last_name",
        "address",
        "birthdate",
        "account_start_date",
        "account_end_date",
        "is_member",
    )
)

In [0]:
# Pandas-style overview of the rider data: row count plus per-column
# non-null counts and dtypes (shows which columns contain missing values).
df_rider.pandas_api().info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 75000 entries, 0 to 74999
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   rider_id            75000 non-null  int32 
 1   first_name          75000 non-null  object
 2   last_name           75000 non-null  object
 3   address             75000 non-null  object
 4   birthdate           75000 non-null  object
 5   account_start_date  75000 non-null  object
 6   account_end_date    14954 non-null  object
 7   is_member           75000 non-null  bool  
dtypes: bool(1), int32(1), object(6)

In [0]:
# Preview the first rows of the rider data.
# NOTE(review): this output contains names, addresses and birthdates —
# confirm the data is synthetic before sharing the notebook with outputs saved.
df_rider.pandas_api().head()

Unnamed: 0,rider_id,first_name,last_name,address,birthdate,account_start_date,account_end_date,is_member
0,1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True
1,1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True
2,1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True
3,1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False
4,1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True


### Read Stations

In [0]:
# Read the headerless stations CSV with inferred column types, then assign column names.
df_station = (
    spark.read
    .csv("dbfs:/FileStore/data/stations.csv", header=False, inferSchema=True)
    .toDF("station_id", "name", "latitude", "longitude")
)

In [0]:
# Pandas-style overview of the station data: row count plus per-column
# non-null counts and dtypes.
df_station.pandas_api().info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 838 entries, 0 to 837
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   station_id  838 non-null    object 
 1   name        838 non-null    object 
 2   latitude    838 non-null    float64
 3   longitude   838 non-null    float64
dtypes: float64(2), object(2)

In [0]:
# Preview the first rows of the station data.
df_station.pandas_api().head()

Unnamed: 0,station_id,name,latitude,longitude
0,525,Glenwood Ave & Touhy Ave,42.012701,-87.666058
1,KA1503000012,Clark St & Lake St,41.885795,-87.631101
2,637,Wood St & Chicago Ave,41.895634,-87.672069
3,13216,State St & 33rd St,41.834733,-87.625827
4,18003,Fairbanks St & Superior St,41.895808,-87.620253


### Read Trips

In [0]:
# Load the headerless trips CSV with inferred column types, then name the columns.
df_trip = (
    spark.read
    .options(header=False, inferSchema=True)
    .csv("dbfs:/FileStore/data/trips.csv")
    .toDF(
        "trip_id",
        "rideable_type",
        "started_at",
        "ended_at",
        "start_station_id",
        "end_station_id",
        "rider_id",
    )
)

In [0]:
# Pandas-style overview of the trip data: row count plus per-column
# non-null counts and dtypes.
df_trip.pandas_api().info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 4584921 entries, 0 to 4584920
Data columns (total 7 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   trip_id           4584921 non-null  object        
 1   rideable_type     4584921 non-null  object        
 2   started_at        4584921 non-null  datetime64[ns]
 3   ended_at          4584921 non-null  datetime64[ns]
 4   start_station_id  4584921 non-null  object        
 5   end_station_id    4584921 non-null  object        
 6   rider_id          4584921 non-null  int32         
dtypes: datetime64[ns](2), int32(1), object(4)

In [0]:
# Preview the first rows of the trip data.
df_trip.pandas_api().head()

Unnamed: 0,trip_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,rider_id
0,89E7AA6C29227EFF,classic_bike,2021-02-12 16:14:56,2021-02-12 16:21:43,525,660,71934
1,0FEFDE2603568365,classic_bike,2021-02-14 17:52:38,2021-02-14 18:12:09,525,16806,47854
2,E6159D746B2DBB91,electric_bike,2021-02-09 19:10:18,2021-02-09 19:19:10,KA1503000012,TA1305000029,70870
3,B32D3199F1C2E75B,classic_bike,2021-02-02 17:49:41,2021-02-02 17:54:06,637,TA1305000034,58974
4,83E463F23575F4BF,electric_bike,2021-02-23 15:07:23,2021-02-23 15:22:37,13216,TA1309000055,39608


### Writing Delta Files

In [0]:
# Each entry pairs a source DataFrame with the table name used when it is saved.
dataframes = list(
    zip(
        (df_payment, df_rider, df_station, df_trip),
        ("payment", "rider", "station", "trip"),
    )
)


In [0]:
def save_to_delta(df, name: str, base_path: str = "/delta") -> None:
    """Write a DataFrame in Delta format, replacing any existing data.

    Args:
        df: Spark DataFrame to persist.
        name: Table name; the data is written to ``{base_path}/{name}_bronze``.
        base_path: Root directory for the Delta output. Defaults to
            ``"/delta"``, the location that was previously hard-coded.

    Raises:
        ValueError: If ``name`` is empty, which would otherwise silently
            write to ``{base_path}/_bronze``.
    """
    if not name:
        raise ValueError("name must be a non-empty string")
    # Strip trailing slashes so "/delta" and "/delta/" resolve to the same path.
    path = f"{base_path.rstrip('/')}/{name}_bronze"
    # "overwrite" lets the notebook be re-run: existing data at the path is
    # replaced instead of the write failing because the path already exists.
    df.write.format("delta").mode("overwrite").save(path)

In [0]:
# Write every (DataFrame, table name) pair out as a Delta table
for df, name in dataframes:
    save_to_delta(df=df, name=name)