In [1]:
!pip install apyori



In [2]:
import pandas as pd
import apyori

In [3]:
sample_2019 = pd.read_csv('/home/felipe/repos/tcc/nyc_data/csv/2019_preprocessed_sample.csv')
sample_2020 = pd.read_csv('/home/felipe/repos/tcc/nyc_data/csv/2020_preprocessed_sample.csv')
sample_2021 = pd.read_csv('/home/felipe/repos/tcc/nyc_data/csv/2021_preprocessed_sample.csv')

In [4]:
sample_2019 = sample_2019.drop(['dropoff_datetime', 'pickup_datetime', 'PROBABLE_CASE_COUNT', 'CASE_COUNT_7DAY_AVG', 'ALL_CASE_COUNT_7DAY_AVG'], axis=1)
sample_2020 = sample_2020.drop(['dropoff_datetime', 'pickup_datetime', 'PROBABLE_CASE_COUNT', 'CASE_COUNT_7DAY_AVG', 'ALL_CASE_COUNT_7DAY_AVG'], axis=1)
sample_2021 = sample_2021.drop(['dropoff_datetime', 'pickup_datetime', 'PROBABLE_CASE_COUNT', 'CASE_COUNT_7DAY_AVG', 'ALL_CASE_COUNT_7DAY_AVG'], axis=1)

In [5]:
def process_pu(x):
    return str(x)+' pickup'

def process_do(x):
    return str(x)+' dropoff'

def process_sr_flag(x):
    if x == 0:
        return 'not shared'
    return 'shared'

def process_pu_day(x):
    if x == 0:
        return 'dawn pickup'
    elif x == 1:
        return 'morning pickup'
    elif x == 2:
        return 'noon pickup'
    return 'night pickup'

def process_do_day(x):
    if x == 0:
        return 'dawn dropoff'
    elif x == 1:
        return 'morning dropoff'
    elif x == 2:
        return 'noon dropoff'
    return 'night dropoff'

def process_duration(x):
    duration_minutes = x/60
    
    if duration_minutes <= 15:
        return 'short trip'
    elif 15 < duration_minutes <= 30:
        return 'medium-short trip'
    elif 30 < duration_minutes <= 45:
        return 'medium-long trip'
    elif 45 < duration_minutes <= 60:
        return 'long trip'
    return 'longer trip'

def process_case(x):
    if x <= 500:
        return 'low count'
    elif 500 < x <= 1000:
        return 'mid-low count'
    elif 1000 < x <= 2000:
        return 'mid-high count'
    elif 2000 < x <= 3000:
        return 'high count'
    return 'higher count'

In [6]:
sample_2019['PULocationID'] = sample_2019['PULocationID'].apply(process_pu)
sample_2019['DOLocationID'] = sample_2019['DOLocationID'].apply(process_do)
sample_2019['SR_Flag'] = sample_2019['SR_Flag'].apply(process_sr_flag)
sample_2019['pickup_day_period'] = sample_2019['pickup_day_period'].apply(process_pu_day)
sample_2019['dropoff_day_period'] = sample_2019['dropoff_day_period'].apply(process_do_day)
sample_2019['trip_duration'] = sample_2019['trip_duration'].apply(process_duration)
sample_2019['CASE_COUNT'] = sample_2019['CASE_COUNT'].apply(process_case)

sample_2020['PULocationID'] = sample_2020['PULocationID'].apply(process_pu)
sample_2020['DOLocationID'] = sample_2020['DOLocationID'].apply(process_do)
sample_2020['SR_Flag'] = sample_2020['SR_Flag'].apply(process_sr_flag)
sample_2020['pickup_day_period'] = sample_2020['pickup_day_period'].apply(process_pu_day)
sample_2020['dropoff_day_period'] = sample_2020['dropoff_day_period'].apply(process_do_day)
sample_2020['trip_duration'] = sample_2020['trip_duration'].apply(process_duration)
sample_2020['CASE_COUNT'] = sample_2020['CASE_COUNT'].apply(process_case)

sample_2021['PULocationID'] = sample_2021['PULocationID'].apply(process_pu)
sample_2021['DOLocationID'] = sample_2021['DOLocationID'].apply(process_do)
sample_2021['SR_Flag'] = sample_2021['SR_Flag'].apply(process_sr_flag)
sample_2021['pickup_day_period'] = sample_2021['pickup_day_period'].apply(process_pu_day)
sample_2021['dropoff_day_period'] = sample_2021['dropoff_day_period'].apply(process_do_day)
sample_2021['trip_duration'] = sample_2021['trip_duration'].apply(process_duration)
sample_2021['CASE_COUNT'] = sample_2021['CASE_COUNT'].apply(process_case)

In [7]:
sample_2019_transactions = sample_2019.values.tolist()
sample_2020_transactions = sample_2020.values.tolist()
sample_2021_transactions = sample_2021.values.tolist()

In [8]:
rules_2019 = list(apyori.apriori(sample_2019_transactions, min_support=0.2, min_confidence=0.3))
rules_2020 = list(apyori.apriori(sample_2020_transactions, min_support=0.2, min_confidence=0.3))
rules_2021 = list(apyori.apriori(sample_2021_transactions, min_support=0.2, min_confidence=0.3))

In [9]:
for i in range(len(rules_2019)):
    LHS=list(rules_2019[i][2][0][0])
    RHS=list(rules_2019[i][2][0][1])
    support=rules_2019[i][1]
    confidence=rules_2019[i][2][0][2]
    lift=rules_2019[i][2][0][3]
    print("LHS:",LHS,"--","RHS:",RHS)
    print("Support:",support)
    print("Confidence:",confidence)
    print("Lift:",lift)
    print(10*"----")

LHS: [] -- RHS: ['low count']
Support: 1.0
Confidence: 1.0
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['medium-short trip']
Support: 0.3602934102934103
Confidence: 0.3602934102934103
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['night dropoff']
Support: 0.3491622574955908
Confidence: 0.3491622574955908
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['night pickup']
Support: 0.3410373577040244
Confidence: 0.3410373577040244
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['not shared']
Support: 0.820386403719737
Confidence: 0.820386403719737
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['short trip']
Support: 0.48890492223825555
Confidence: 0.48890492223825555
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['low count', 'medium-short trip']
Support: 0.3602934102934103
Confidence: 0.3602934102934103
Lift: 1.0
----------------------------------------
LHS: ['morni

In [10]:
for i in range(len(rules_2020)):
    LHS=list(rules_2020[i][2][0][0])
    RHS=list(rules_2020[i][2][0][1])
    support=rules_2020[i][1]
    confidence=rules_2020[i][2][0][2]
    lift=rules_2020[i][2][0][3]
    print("LHS:",LHS,"--","RHS:",RHS)
    print("Support:",support)
    print("Confidence:",confidence)
    print("Lift:",lift)
    print(10*"----")

LHS: [] -- RHS: ['low count']
Support: 0.6625428392481404
Confidence: 0.6625428392481404
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['medium-short trip']
Support: 0.3467392910676209
Confidence: 0.3467392910676209
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['night dropoff']
Support: 0.32773168380236595
Confidence: 0.32773168380236595
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['night pickup']
Support: 0.31878030414986525
Confidence: 0.31878030414986525
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['noon dropoff']
Support: 0.3093152053820874
Confidence: 0.3093152053820874
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['noon pickup']
Support: 0.3159935538806905
Confidence: 0.3159935538806905
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['not shared']
Support: 0.9520763400679798
Confidence: 0.9520763400679798
Lift: 1.0
------------------------------------

In [11]:
for i in range(len(rules_2021)):
    LHS=list(rules_2021[i][2][0][0])
    RHS=list(rules_2021[i][2][0][1])
    support=rules_2021[i][1]
    confidence=rules_2021[i][2][0][2]
    lift=rules_2021[i][2][0][3]
    print("LHS:",LHS,"--","RHS:",RHS)
    print("Support:",support)
    print("Confidence:",confidence)
    print("Lift:",lift)
    print(10*"----")

LHS: [] -- RHS: ['low count']
Support: 0.33137499068722925
Confidence: 0.33137499068722925
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['medium-short trip']
Support: 0.35508796577157636
Confidence: 0.35508796577157636
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['night dropoff']
Support: 0.3275860233936801
Confidence: 0.3275860233936801
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['night pickup']
Support: 0.31843290015645453
Confidence: 0.31843290015645453
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['noon dropoff']
Support: 0.31827325265813083
Confidence: 0.31827325265813083
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['noon pickup']
Support: 0.323541620102813
Confidence: 0.323541620102813
Lift: 1.0
----------------------------------------
LHS: [] -- RHS: ['not shared']
Support: 0.9995317006715838
Confidence: 0.9995317006715838
Lift: 1.0
----------------------------------

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, asc, desc, to_timestamp, unix_timestamp, from_unixtime
from pyspark.ml.fpm import FPGrowth

spark = SparkSession\
    .builder\
    .master('local[*]')\
    .config("spark.driver.memory", "4g")\
    .appName('process_tripdata')\
    .getOrCreate()

21/11/30 22:51:08 WARN Utils: Your hostname, LAPTOP-VD4O2HIL resolves to a loopback address: 127.0.1.1; using 172.30.167.142 instead (on interface eth0)
21/11/30 22:51:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/11/30 22:51:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [13]:
spark_2020_df = spark.createDataFrame([(i, sample_2020_transactions[i]) for i in range(len(sample_2020_transactions))], ['id', 'trip_info'])
spark_2021_df = spark.createDataFrame([(i, sample_2021_transactions[i]) for i in range(len(sample_2021_transactions))], ['id', 'trip_info'])

In [14]:
fp_growth_2020 = FPGrowth(itemsCol="trip_info", minSupport=0.3, minConfidence=0.5)
fp_growth_2021 = FPGrowth(itemsCol="trip_info", minSupport=0.3, minConfidence=0.5)

In [15]:
fp_growth_2020_model = fp_growth_2020.fit(spark_2020_df)
fp_growth_2021_model = fp_growth_2021.fit(spark_2021_df)

                                                                                

In [16]:
fp_growth_2020_model.associationRules.show(fp_growth_2020_model.associationRules.count())

+--------------------+---------------+------------------+------------------+-------------------+
|          antecedent|     consequent|        confidence|              lift|            support|
+--------------------+---------------+------------------+------------------+-------------------+
|        [short trip]|    [low count]|0.6505008786219104|0.9818246309327027|0.35689404015453796|
|        [short trip]|   [not shared]|0.9641624873337352|1.0126945148798598| 0.5289829065242327|
|[low count, not s...|   [short trip]|0.5483929917536606|0.9995407118913705|  0.337886432889283|
|         [low count]|   [not shared]|0.9299613374686664|0.9767718179009318| 0.6161392249174883|
|         [low count]|   [short trip]|0.5386731529081871|0.9818246309327024|0.35689404015453796|
| [medium-short trip]|   [not shared]|0.9429696379282351|0.9904349034247668| 0.3269646237535274|
|[short trip, not ...|    [low count]|0.6387473559579082|0.9640846117705604|  0.337886432889283|
|[short trip, low ...|   [not 

In [17]:
fp_growth_2021_model.associationRules.show(fp_growth_2021_model.associationRules.count())

+--------------------+---------------+------------------+------------------+-------------------+
|          antecedent|     consequent|        confidence|              lift|            support|
+--------------------+---------------+------------------+------------------+-------------------+
|        [short trip]|   [not shared]|0.9996369064428935|1.0001052550621845| 0.5274327617952893|
|[night pickup, no...|[night dropoff]|0.9701337792642141|  2.96146266929815|0.30872633225837354|
|[noon dropoff, no...|  [noon pickup]|0.9579862853319954| 2.960936787754146|0.30480964696616536|
|         [low count]|   [not shared]| 0.999421872490766|0.9998901203626218| 0.3311834136892408|
|[noon dropoff, no...|   [not shared]|0.9997207386462806|1.0001891265425298|0.30480964696616536|
| [medium-short trip]|   [not shared]|0.9996403201150975| 1.000108670333758| 0.3549602477729174|
|[night pickup, ni...|   [not shared]| 0.999345414456005| 0.999813626505839|0.30872633225837354|
|      [noon dropoff]|  [noon 