# Frequent itemset mining (FIM) is performed using the Frequent Pattern Growth (FP-Growth) algorithm; association rules are then generated and the interesting rules are mined

# The dataset is read from the .dat file created by the "dataset_for_mining.ipynb" notebook

# This algorithm is implemented using PySpark

## Importing packages and initializing spark session

In [2]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.session import SparkSession
import numpy as np

In [3]:
# Reuse the running SparkContext if there is one, otherwise start it.
sc = SparkContext.getOrCreate()
# Obtain the session through the builder: getOrCreate() returns the existing
# SparkSession when one is active, so re-running this cell does not construct
# another session object by hand.
spark = SparkSession.builder.getOrCreate()

## Reading data

In [4]:
# Load the dataset file as an RDD of text lines (one element per line).
lines = sc.textFile("mining.dat")

## Combining all columns into a list and creating a Spark DataFrame

In [5]:
# Build one (id, items) record per input line; the id is the line's position
# in the file.
# - collect() brings every line to the driver, so this assumes the whole file
#   fits in driver memory.
# - The comma-separated fields are de-duplicated with set() (Spark's FPGrowth
#   expects the items of a transaction to be unique), so the order of the
#   items is not the column order.
# NOTE(review): the first line of the file looks like a header row (see the
# id 0 row in the output below) and is kept as a transaction here — confirm
# whether it should be skipped.
data = [
    (row_id, list(set(str(line).split(","))))
    for row_id, line in enumerate(lines.collect())
]

In [6]:
# Turn the (id, items) records into a two-column Spark DataFrame.
df = spark.createDataFrame(data, ["id", "items"])
# show() prints the first 20 rows itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df.show()

+---+--------------------+
| id|               items|
+---+--------------------+
|  0|[, hour, Lat, Bas...|
|  1|[B02512, -73.9549...|
|  2|[B02512, -74.0345...|
|  3|[40.7316, B02512,...|
|  4|[B02512, 40.7588,...|
|  5|[B02512, -73.9722...|
|  6|[B02512, 0, 40.73...|
|  7|[6, B02512, 40.72...|
|  8|[B02512, 40.762, ...|
|  9|[8, B02512, 0, -7...|
| 10|[-73.9846, B02512...|
| 11|[B02512, 40.7256,...|
| 12|[B02512, -73.9684...|
| 13|[40.7271, B02512,...|
| 14|[B02512, -73.7896...|
| 15|[B02512, -73.9167...|
| 16|[B02512, -73.9531...|
| 17|[B02512, 16, 2, 4...|
| 18|[-73.9821, B02512...|
| 19|[B02512, 40.7531,...|
+---+--------------------+
only showing top 20 rows

None


## Implementing FP-Growth and extracting frequent itemsets

In [7]:
# Configure FP-Growth on the "items" column. minSupport=0.001 keeps itemsets
# that occur in at least 0.1% of the transactions; minConfidence=0.001 is the
# confidence threshold applied when association rules are generated.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.001)
# Fit the model on the transactions DataFrame.
model = fpGrowth.fit(df)

In [8]:
# Display the frequent itemsets found by the model together with their counts.
model.freqItemsets.show()

+------------------+-----+
|             items| freq|
+------------------+-----+
|         [40.7463]|  570|
|         [40.7459]|  788|
|        [-73.9943]| 1050|
|         [40.7398]|  696|
|        [-74.0024]|  870|
|        [-74.0095]|  824|
|         [40.7302]|  640|
|        [-73.9812]| 1113|
|        [-73.9836]| 1152|
|         [40.7568]|  840|
|        [-74.0075]| 1069|
|        [-73.9726]|  884|
|        [-73.9918]| 1608|
|[-73.9918, B02682]|  632|
|         [40.7545]|  711|
|         [40.7349]|  667|
|        [-73.9982]|  771|
|         [40.7615]|  896|
|              [20]|36245|
|      [20, B02617]| 6725|
+------------------+-----+
only showing top 20 rows



## Extracting association rules

In [9]:
# Association rules produced by the fitted model; the DataFrame is kept in `l`
# and its first rows are displayed (antecedent, consequent, confidence and
# lift columns, as shown in the output below).
l=model.associationRules
l.show()

+----------+----------+--------------------+------------------+
|antecedent|consequent|          confidence|              lift|
+----------+----------+--------------------+------------------+
|  [B02617]|      [20]|0.062267941963500335|0.9698251288014711|
|  [B02617]|      [23]|0.034925602540717214|0.9547746425897365|
|  [B02617]|       [5]| 0.01625910871195637|0.9685072568056846|
|  [B02617]|      [12]| 0.03662929046953269|1.0644423539580556|
|  [B02617]|       [1]|0.012768400292589884|0.9276678285678203|
|  [B02617]|       [4]|0.010407311043416264|0.9637637808884875|
|  [B02617]|      [15]| 0.06255497634281164|0.9996701367336164|
|  [B02617]|      [11]|0.034916343367191044|1.0498465730288462|
|  [B02617]|      [22]|0.053221729428431216|0.9803749602476572|
|  [B02617]|       [6]| 0.03436079295562078|1.0485567737136157|
|  [B02617]|       [3]|0.008675845594022277|0.9715656272963051|
|  [B02617]|      [17]| 0.08056406885121434| 1.000083262724975|
|  [B02617]|       [9]| 0.03365709576763

## Converting all columns into lists

In [10]:
# Collect the rules once so that all four columns come from the same result
# set. Issuing a separate collect() per column triggered a separate Spark job
# for each one and relied on every job returning the rows in the same order
# to keep the columns aligned.
rule_rows = l.select("antecedent", "consequent", "confidence", "lift").collect()

# One NumPy array per column. reshape(-1) flattens the one-element item lists
# of antecedent/consequent into plain strings.
# NOTE(review): this assumes every antecedent and consequent holds exactly one
# item (true for the rules shown above); multi-item rules would break the
# alignment between the four arrays.
antecedent = np.array([row["antecedent"] for row in rule_rows]).reshape(-1)
consequent = np.array([row["consequent"] for row in rule_rows]).reshape(-1)
confidence = np.array([row["confidence"] for row in rule_rows]).reshape(-1)
lift = np.array([row["lift"] for row in rule_rows]).reshape(-1)

In [11]:
# Replace each of the four rule columns with the result of its tolist() call,
# so the rule filters below work on plain Python lists.
antecedent, consequent, confidence, lift = (
    column.tolist() for column in (antecedent, consequent, confidence, lift)
)

## Extracting useful rules 

## Rule1: lat -> Base & lon -> Base

In [12]:
# Rule 1: print the rules whose antecedent contains a '.' (a latitude or
# longitude value) and whose consequent contains 'B' (a base code).
print("antecedent","-->","consequent","  ","confidence","       ","lift")
for ante, cons, conf_value, lift_value in zip(antecedent, consequent, confidence, lift):
    if '.' in ante and 'B' in cons:
        print(ante," -->  ",cons,"  ",conf_value," ",lift_value)

antecedent --> consequent    confidence         lift
40.6449  -->   B02682    0.41411764705882353   1.0261994827429497
-74.0057  -->   B02682    0.43115438108484005   1.068417165976922
-73.9889  -->   B02682    0.39154160982264663   0.9702551927599162
-73.9918  -->   B02682    0.39303482587064675   0.9739554396510214
-73.9919  -->   B02682    0.40750853242320817   1.0098218420685499
-73.9888  -->   B02598    0.3450327186198691   1.0628268402085128
-73.9888  -->   B02682    0.40690065437239736   1.0083154968409478
-73.9873  -->   B02682    0.4260089686098655   1.0556666356437676
-73.992  -->   B02682    0.4228094575799722   1.047738124054788
-73.9872  -->   B02682    0.43870967741935485   1.0871394813515853
40.774  -->   B02682    0.4115151515151515   1.0197503985280534
40.7741  -->   B02682    0.4079696394686907   1.0109644830907907
-73.9823  -->   B02682    0.4328018223234624   1.0724995888317093
-73.9916  -->   B02682    0.402787456445993   0.9981228339238422


## Rule2: time(hour) -> Base

In [13]:
# Rule 2: print the rules whose antecedent is an hour (contains neither '.'
# nor 'B') and whose consequent contains 'B' (a base code).
print("antecedent","-->","consequent","  ","confidence","         ","lift")
for ante, cons, conf_value, lift_value in zip(antecedent, consequent, confidence, lift):
    # Skip coordinate and base antecedents.
    if '.' in ante or 'B' in ante:
        continue
    if 'B' in cons:
        print(ante,"      -->    ",cons,"  ",conf_value,"  ",lift_value)

antecedent --> consequent    confidence           lift
2       -->     B02617    0.17585089141004862    0.9191657268555514
2       -->     B02598    0.34380064829821716    1.0590316134482392
2       -->     B02682    0.41774716369529985    1.0351935647904358
4       -->     B02617    0.18438320209973752    0.9637637808884874
4       -->     B02598    0.3243110236220472    0.9989964483940962
4       -->     B02682    0.4288057742782152    1.0625972278331541
7       -->     B02617    0.20453360080240723    1.0690891262504285
7       -->     B02598    0.30680040120361085    0.9450573333747608
7       -->     B02512    0.06066198595787362    0.9636628300028407
7       -->     B02682    0.41255767301905716    1.022333806976485
9       -->     B02617    0.20261984392419174    1.0590859939496204
9       -->     B02598    0.3093645484949833    0.9529558439114415
9       -->     B02512    0.0653288740245262    1.0377999768601829
9       -->     B02682    0.40780379041248604    1.010553502740401

## Rule3: Base -> time(hour)

In [14]:
# Rule 3: print the rules whose antecedent contains 'B' (a base code) and
# whose consequent is an hour (contains neither '.' nor 'B').
print("antecedent","-->","consequent","  ","confidence","          ","lift")
for ante, cons, conf_value, lift_value in zip(antecedent, consequent, confidence, lift):
    antecedent_is_base = 'B' in ante
    consequent_is_hour = not ('.' in cons or 'B' in cons)
    if antecedent_is_base and consequent_is_hour:
        print(ante,"   -->  ",cons,"       ",conf_value,"  ",lift_value)

antecedent --> consequent    confidence            lift
B02617    -->   20         0.062267941963500335    0.9698251288014711
B02617    -->   23         0.034925602540717214    0.9547746425897365
B02617    -->   5         0.01625910871195637    0.9685072568056846
B02617    -->   12         0.03662929046953269    1.0644423539580556
B02617    -->   1         0.012768400292589884    0.9276678285678203
B02617    -->   4         0.010407311043416264    0.9637637808884875
B02617    -->   15         0.06255497634281164    0.9996701367336164
B02617    -->   11         0.034916343367191044    1.0498465730288462
B02617    -->   22         0.053221729428431216    0.9803749602476572
B02617    -->   6         0.03436079295562078    1.0485567737136157
B02617    -->   3         0.008675845594022277    0.9715656272963051
B02617    -->   17         0.08056406885121434    1.000083262724975
B02617    -->   9         0.03365709576763178    1.0590859939496204
B02617    -->   18         0.07354561531837668 