# Import packages and initialize spark session

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.session import SparkSession
import numpy as np

In [2]:
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

### Read data

In [3]:
lines = sc.textFile("mining.dat")

### Combine all columns into a list and create spark data frame 

In [4]:
data = []
i = 0
for line in lines.collect():
    data.append((i,list(set(map(lambda x : x ,str(line).split(','))))))
    i = i + 1

In [5]:
df = spark.createDataFrame(data, ["id", "items"])
print(df.show())

+---+--------------------+
| id|               items|
+---+--------------------+
|  0|[40.769, -73.9549...|
|  1|[-74.0345, 0, 40....|
|  2|[2, 0, -73.9873, ...|
|  3|[-73.9776, 3, 0, ...|
|  4|[0, 40.7594, B025...|
|  5|[-74.0403, 0, B02...|
|  6|[6, 0, -73.9887, ...|
|  7|[-73.979, 7, 0, B...|
|  8|[0, B02512, 8, 40...|
|  9|[-73.9846, 9, 1, ...|
| 10|[1, B02512, 10, 4...|
| 11|[11, 1, B02512, 4...|
| 12|[-73.9803, 40.727...|
| 13|[2, B02512, 13, 4...|
| 14|[2, 40.7564, B025...|
| 15|[2, -73.9531, B02...|
| 16|[2, 16, B02512, -...|
| 17|[3, 17, B02512, 4...|
| 18|[-74.0039, 40.753...|
| 19|[19, 40.7389, 3, ...|
+---+--------------------+
only showing top 20 rows

None


## Perform FPgrowth and extract frequent itemsets

In [6]:
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.001)
model = fpGrowth.fit(df)

In [7]:
model.freqItemsets.show()

+----------+----+
|     items|freq|
+----------+----+
| [40.7463]| 570|
| [40.7459]| 788|
|[-73.9867]|1180|
|[-74.0063]|1050|
| [40.7398]| 696|
|[-74.0024]| 870|
| [40.7393]| 605|
|[-74.0095]| 824|
|[-73.9858]|1079|
| [40.7302]| 640|
|[-73.9724]| 853|
| [40.7234]| 807|
|[-73.9941]|1113|
|  [40.723]| 677|
|[-73.9869]|1012|
|[-73.9745]| 862|
|[-73.9836]|1152|
| [40.7217]| 688|
| [40.7568]| 840|
| [-73.997]| 819|
+----------+----+
only showing top 20 rows



## Extract association rules

In [8]:
l=model.associationRules
l.show()

+----------+----------+--------------------+------------------+
|antecedent|consequent|          confidence|              lift|
+----------+----------+--------------------+------------------+
|      [11]|  [B02617]| 0.20085219707057256|1.0498447133035003|
|      [11]|  [B02598]| 0.31355525965379494|0.9658630545103032|
|      [11]|  [B02512]| 0.07360852197070572|1.1693265530395913|
|      [11]|  [B02682]| 0.39462050599201065|0.9778830838275472|
| [40.6449]|  [B02682]| 0.41411764705882353|1.0261976649066706|
|[-74.0057]|  [B02682]| 0.43115438108484005| 1.068415273355148|
|[-73.9873]|  [B02682]|  0.4260089686098655|1.0556647656086127|
|      [15]|  [B02617]| 0.19125265392781315|0.9996683658921063|
|      [15]|  [B02598]| 0.32467091295116773|1.0001032674109962|
|      [15]|  [B02512]| 0.06859164897381458|1.0896297645233541|
|      [15]|  [B02682]| 0.39547062986553433|0.9799897198042736|
|      [15]|  [B02764]|0.020014154281670204|1.1403219941937157|
|       [8]|  [B02617]| 0.20416739625284

## Convert all columns into list

In [9]:
antecedent = np.array(l.select("antecedent").collect()).reshape(-1)
consequent = np.array(l.select("consequent").collect()).reshape(-1)
confidence = np.array(l.select("confidence").collect()).reshape(-1)
lift = np.array(l.select("lift").collect()).reshape(-1)

In [10]:
antecedent = antecedent.tolist()
consequent = consequent.tolist()
confidence = confidence.tolist()
lift = lift.tolist()

## Extract useful rules 

## lat -> Base and lon -> Base

In [27]:
print("antecedent","-->","consequent","  ","confidence","       ","lift")
for i in  range(len(antecedent)):
    if '.' in antecedent[i]:
        if 'B' in consequent[i]:
            print(antecedent[i]," -->  ",consequent[i],"  ",confidence[i]," ",lift[i])

antecedent --> consequent    confidence         lift
40.6449  -->   B02682    0.41411764705882353   1.0261976649066706
-74.0057  -->   B02682    0.43115438108484005   1.068415273355148
-73.9873  -->   B02682    0.4260089686098655   1.0556647656086127
-73.9889  -->   B02682    0.39154160982264663   0.970253474024798
-73.9919  -->   B02682    0.40750853242320817   1.0098200532440467
-73.9872  -->   B02682    0.43870967741935485   1.087137555564618
-73.9918  -->   B02682    0.39303482587064675   0.9739537143611902
-73.992  -->   B02682    0.4228094575799722   1.0477362680644033
-73.9888  -->   B02598    0.3450327186198691   1.062824957489586
-73.9888  -->   B02682    0.40690065437239736   1.0083137106848234
40.774  -->   B02682    0.4115151515151515   1.0197485921158487
40.7741  -->   B02682    0.4079696394686907   1.0109626922421837
-73.9823  -->   B02682    0.4328018223234624   1.072497688978226
-73.9916  -->   B02682    0.402787456445993   0.9981210658232643


## time(hour) -> Base

In [31]:
print("antecedent","-->","consequent","  ","confidence","         ","lift")
for i in  range(len(antecedent)):
    if '.' not in antecedent[i]:
        if 'B' not in antecedent[i]:
            if 'B' in consequent[i]:
                print(antecedent[i],"      -->    ",consequent[i],"  ",confidence[i],"  ",lift[i])

antecedent --> consequent    confidence           lift
11       -->     B02617    0.20085219707057256    1.0498447133035003
11       -->     B02598    0.31355525965379494    0.9658630545103032
11       -->     B02512    0.07360852197070572    1.1693265530395913
11       -->     B02682    0.39462050599201065    0.9778830838275472
15       -->     B02617    0.19125265392781315    0.9996683658921063
15       -->     B02598    0.32467091295116773    1.0001032674109962
15       -->     B02512    0.06859164897381458    1.0896297645233541
15       -->     B02682    0.39547062986553433    0.9799897198042736
15       -->     B02764    0.020014154281670204    1.1403219941937157
8       -->     B02617    0.20416739625284538    1.067173098981225
8       -->     B02598    0.31281736998774295    0.9635900887576908
8       -->     B02512    0.05673262125722291    0.9012402189791324
8       -->     B02682    0.4117054806513745    1.0202202342121056
2       -->     B02617    0.17585089141004862    0.91

## Base -> time(hour)

In [39]:
print("antecedent","-->","consequent","  ","confidence","          ","lift")
for i in  range(len(antecedent)):
    if 'B' in antecedent[i]:
        if '.' not in consequent[i]:
            if 'B' not in consequent[i]:
                print(antecedent[i],"   -->  ",consequent[i],"       ",confidence[i],"  ",lift[i])

antecedent --> consequent    confidence            lift
B02617    -->   20         0.062267941963500335    0.9698234108281792
B02617    -->   9         0.03365709576763178    1.0590841178573256
B02617    -->   18         0.07354561531837668    0.9654375541128436
B02617    -->   23         0.034925602540717214    0.9547729512772648
B02617    -->   2         0.008036962620716475    0.9191640986216332
B02617    -->   14         0.04790696382440903    0.9946029050163689
B02617    -->   5         0.01625910871195637    0.9685055411669055
B02617    -->   12         0.03662929046953269    1.0644404683773663
B02617    -->   16         0.07474004870325275    1.004474653217918
B02617    -->   7         0.047203266636420035    1.0690872324383267
B02617    -->   1         0.012768400292589884    0.9276661852730593
B02617    -->   10         0.03434227460856844    1.0851205358183489
B02617    -->   4         0.010407311043416264    0.9637620736524238
B02617    -->   8         0.0431847853260618    