In [2]:
import pyspark
import math
# Reuse the active SparkContext if one exists; otherwise create a new one.
sc = pyspark.SparkContext.getOrCreate()
# SQL entry point bound to the same SparkContext.
# NOTE(review): `sqlContext` is not referenced in the cells visible here — confirm it is used later.
sqlContext = pyspark.sql.SQLContext(sc)

# Loading the dataset as a PySpark RDD of key-value pairs
The key is the anonymous user ID and the value is the query string.

In [91]:
# Read the raw tab-separated query log, one string per line.
logs_rdd = sc.textFile("/usr/data/user-ct-test-collection-01.txt")

# The first line is the column header; remember it so it can be filtered out.
headers = logs_rdd.first()

# Drop the header, split each line on tabs, and keep only
# (user_id as int, query string) as the key-value pair.
logs_rdd = (
    logs_rdd
    .filter(lambda row: row != headers)
    .map(lambda row: row.split("\t"))
    .map(lambda fields: (int(fields[0]), fields[1]))
)
logs_rdd.take(5)

[(142, 'rentdirect.com'),
 (142, 'www.prescriptionfortime.com'),
 (142, 'staple.com'),
 (142, 'staple.com'),
 (142, 'www.newyorklawyersite.com')]

# Count the number of queries per user
We do this by mapping each query to the value 1, then reducing by key (user ID) and summing the values. <br> <br>
Essentially, this step is about calculating **|T|** for each user, in the following **"support"** equation <br>
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/1c6acacd3b17051205704b5d323c83fc737e5db1 "Support equation")


In [92]:
from operator import add 
# |T| per user: emit (user_id, 1) for every logged query, then sum the ones
# for each user ID.
queries_count_per_user = (
    logs_rdd
    .map(lambda record: (record[0], 1))
    .reduceByKey(add)
)
queries_count_per_user.take(5)

[(217, 29), (1337, 48), (3745, 164), (4781, 413), (6356, 13)]

In [94]:
# Join on user ID so every (user_id, query) record also carries that user's
# total query count: (user_id, (query, total_queries)).
logs_with_count_per_user = logs_rdd.join(queries_count_per_user)
# Peek at two joined records.
logs_with_count_per_user.take(2)

[(6356, ('cwe', 13)), (6356, ('www.crazy shit .com', 13))]

# Counting occurrences of each query per user
Let's iterate over the RDD and:
1. set each pair (user id, query) as a key with a value of 1
2. reduce by key on the returned RDD, summing the occurrences of each query per user

Essentially, this step is about calculating **|X|** for each query, in the following **"support"** equation <br>
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/1c6acacd3b17051205704b5d323c83fc737e5db1 "Support equation")

In [95]:
from operator import add
# |X| per (user, query): use the whole (user_id, query) pair as the key with a
# count of 1, then sum the counts of identical pairs.
counted = (
    logs_rdd
    .map(lambda user_query: (user_query, 1))
    .reduceByKey(add)
)
counted.take(10)

[((142, 'dfdf'), 2),
 ((142, 'vaniqa.comh'), 1),
 ((142, '207 ad2d 530'), 2),
 ((142, 'attornyleslie.com'), 1),
 ((217, 'mizuno.com'), 1),
 ((217, "p; .; p;' p; ' ;' ;';"), 2),
 ((217, 'yahoo.com'), 1),
 ((217, '-'), 3),
 ((1268, 'sstack.com'), 1),
 ((1268, 'www.raindanceexpress.com'), 1)]

# Calculate Support of X, i.e. (|X|/|T|)

In [109]:
# Bring |T| and |X| together under the user-ID key.
#
# `queries_count_per_user` already holds (user_id, total_queries) pairs, so it
# is joined as-is; mapping each pair onto itself first would only add a pass
# over the data without changing any record.
#
# `counted` is re-keyed from ((user_id, query), occurrences) to
# (user_id, (query, occurrences)) so both sides share the user-ID key.
#
# Resulting record shape: (user_id, (total_queries, (query, occurrences)))
joined_counted_with_user_total = queries_count_per_user.join(
    counted.map(lambda x: (x[0][0], (x[0][1], x[1])))
)

joined_counted_with_user_total.take(10)

[(6356, (13, ('http', 1))),
 (6356, (13, ('www.crazy shit .com', 1))),
 (6356, (13, ('x-men costom.com', 1))),
 (6356, (13, ('ccbg.co', 1))),
 (6356, (13, ('ca', 1))),
 (6356, (13, ('cwe', 1))),
 (6356, (13, ('www.ccbg.co', 1))),
 (6356, (13, ('www.emphaze.com', 2))),
 (6356, (13, ('www.ccbg.com', 1))),
 (6356, (13, ('cam', 1)))]

In [113]:
# Input record shape: (user_id, (total_queries, (query, occurrences))).
# Support of a query for a user = occurrences / total_queries, i.e. |X| / |T|.
# Output record shape: (user_id, (query, support)).
support = joined_counted_with_user_total.map(
    lambda rec: (rec[0], (rec[1][1][0], rec[1][1][1] / rec[1][0]))
)
support.take(5)

[(6356, ('http', 0.07692307692307693)),
 (6356, ('www.crazy shit .com', 0.07692307692307693)),
 (6356, ('x-men costom.com', 0.07692307692307693)),
 (6356, ('ccbg.co', 0.07692307692307693)),
 (6356, ('ca', 0.07692307692307693))]

# Calculate Confidence
According to the following equation <br>
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/90324dedc399441696116eed3658fd17c5da4329 "Confidence equation")