## Spark Initialization

In [1]:
# Import findspark and call init() before pyspark is imported in the next cell.
# Per the original note, this reads SPARK_HOME and HADOOP_HOME.
# NOTE(review): the HADOOP_HOME part is carried over from the original comment — confirm.
import findspark
findspark.init()

In [2]:
# Entry point for the Spark SQL / DataFrame API
from pyspark.sql import SparkSession

# Build the session (or reuse one that is already running in this kernel).
# The builder chain is wrapped in parentheses so that no line-continuation
# backslashes are needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [3]:
# Print the SparkSession object (its default repr, including the memory address)
# as a quick check that the session exists
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000022BED843BE0>


## Loading Data using Spark

In [4]:
# Load the CSV file into a DataFrame: the first line is used as the header and
# column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("data.csv")

In [5]:
# Display the first rows of the loaded DataFrame as a text table
df.show()

+-------+----+-----+----+---+----+----------+------+-------------------+-------------------+------------------+-----------+-------+---------------+--------------------+------------------+------------------+----------------+-------------+--------------------+------------------+------------------+--------------+
|trip_id|year|month|week|day|hour|  usertype|gender|          starttime|           stoptime|      tripduration|temperature| events|from_station_id|   from_station_name|    latitude_start|   longitude_start|dpcapacity_start|to_station_id|     to_station_name|      latitude_end|     longitude_end|dpcapacity_end|
+-------+----+-----+----+---+----+----------+------+-------------------+-------------------+------------------+-----------+-------+---------------+--------------------+------------------+------------------+----------------+-------------+--------------------+------------------+------------------+--------------+
|2355134|2014|    6|  27|  0|  23|Subscriber|  Male|2014-06-30 2

In [6]:
# Count the total number of rows in the DataFrame
df.count()

9495235

In [7]:
# Register the DataFrame as a temporary view named "divvy" so it can be
# referenced by name in spark.sql(...) queries
df.createOrReplaceTempView("divvy")

## Data Mining Process

### Dataset
Dataset diambil dari https://www.kaggle.com/yingwurenjian/chicago-divvy-bicycle-sharing-data dan merupakan data Bicycle Sharing di Chicago. Dataset berformat .csv dengan ukuran ±2 GB dan memiliki 23 kolom, yaitu:
- trip_id (number)
- year (number)
- month (number)
- week (number)
- day (number)
- hour (number : 0-23)
- usertype (text : customer, subscriber, dependent)
- gender (text : male, female)
- starttime (datetime)
- stoptime (datetime)
- tripduration (number in minutes)
- temperature (number)
- events (text : unknown, clear, cloudy, rain or snow, tstorms, not clear)
- from_station_id (number)
- from_station_name (text)
- latitude_start (number)
- longitude_start (number)
- dpcapacity_start (number)
- to_station_id (number)
- to_station_name (text)
- latitude_end (number)
- longitude_end (number)
- dpcapacity_end (number)

In [8]:
# Stations that are most often the trip destination, together with their dock
# capacity, ordered from the most frequent destination down.
# The SQL is a triple-quoted string rather than a literal joined with
# backslash continuations: a stray space after a backslash would break the
# cell, and the long runs of indentation no longer end up inside the statement.
# The aggregate is intentionally left unaliased so the result column keeps its
# auto-generated name, count(to_station_name).
query1 = spark.sql("""
    SELECT to_station_id, to_station_name, dpcapacity_end, COUNT(to_station_name)
    FROM divvy
    GROUP BY to_station_id, to_station_name, dpcapacity_end
    ORDER BY COUNT(to_station_name) DESC
""")

In [9]:
# Display the first rows of the query1 result as a text table
query1.show()

+-------------+--------------------+--------------+----------------------+
|to_station_id|     to_station_name|dpcapacity_end|count(to_station_name)|
+-------------+--------------------+--------------+----------------------+
|           91|Clinton St & Wash...|          31.0|                161152|
|           77|Clinton St & Madi...|          31.0|                102144|
|          192| Canal St & Adams St|          47.0|                 79442|
|           43|Michigan Ave & Wa...|          43.0|                 76968|
|          287|Franklin St & Mon...|          27.0|                 63659|
|          174|Canal St & Madiso...|          23.0|                 63509|
|           74|Kingsbury St & Er...|          23.0|                 60969|
|          133|Kingsbury St & Ki...|          27.0|                 60035|
|          117|Wilton Ave & Belm...|          23.0|                 58660|
|          289|Wells St & Concor...|          19.0|                 58628|
|          177| Theater o

In [10]:
# Number of trips per user type and gender, together with the average trip
# duration, ordered from the longest average duration down.
# The SQL is a triple-quoted string rather than a literal joined with
# backslash continuations: a stray space after a backslash would break the
# cell, and the long runs of indentation no longer end up inside the statement.
# The aggregates are intentionally left unaliased so the result columns keep
# their auto-generated names, count(usertype) and avg(tripduration).
query2 = spark.sql("""
    SELECT usertype, gender, COUNT(usertype), AVG(tripduration)
    FROM divvy
    GROUP BY usertype, gender
    ORDER BY AVG(tripduration) DESC
""")

In [11]:
# Display the query2 result as a text table
query2.show()

+----------+------+---------------+------------------+
|  usertype|gender|count(usertype)| avg(tripduration)|
+----------+------+---------------+------------------+
|  Customer|Female|            194|21.541323024054982|
|  Customer|  Male|           1083| 14.10594028931979|
|Subscriber|Female|        2378481|12.779013790734535|
| Dependent|  Male|            178|12.285205992509361|
|Subscriber|  Male|        7115299| 11.00085300467806|
+----------+------+---------------+------------------+



In [12]:
# Number of trips per year and month, ordered from the busiest month down.
# The SQL is a triple-quoted string rather than a literal joined with
# backslash continuations: a stray space after a backslash would break the
# cell, and the long runs of indentation no longer end up inside the statement.
# Keyword casing is also made consistent (ORDER BY / COUNT).
# The aggregate is intentionally left unaliased so the result column keeps its
# auto-generated name, count(trip_id).
query3 = spark.sql("""
    SELECT year, month, COUNT(trip_id)
    FROM divvy
    GROUP BY year, month
    ORDER BY COUNT(trip_id) DESC
""")

In [13]:
# Display the first rows of the query3 result as a text table
query3.show()

+----+-----+--------------+
|year|month|count(trip_id)|
+----+-----+--------------+
|2017|    8|        406964|
|2017|    7|        381960|
|2017|    6|        368841|
|2017|    9|        365210|
|2016|    8|        345086|
|2016|    6|        344473|
|2016|    7|        340516|
|2015|    8|        324861|
|2016|    9|        319003|
|2015|    7|        311632|
|2015|    9|        295226|
|2017|   10|        291880|
|2016|   10|        275132|
|2017|    5|        260717|
|2015|    6|        259727|
|2016|    5|        256118|
|2015|   10|        252321|
|2014|    7|        245397|
|2014|    8|        227792|
|2014|    9|        210532|
+----+-----+--------------+
only showing top 20 rows

