In [7]:
import pyspark
import math
import itertools
from operator import add
# Reuse the SparkContext that is already running, or start one if none exists.
sc = pyspark.SparkContext.getOrCreate()
# SQL entry point built on top of the same SparkContext.
sqlContext = pyspark.sql.SQLContext(sc)

# Loading the dataset as a PySpark RDD of key-value pairs
The key is the anonymous user ID and the value is the query string.

In [3]:
# Read the raw log; every RDD element is one line of text.
raw_lines = sc.textFile("/usr/data/user-ct-test-collection-02.txt")

# The first line is the header row, so every line equal to it is dropped.
headers = raw_lines.first()
logs_rdd = (
    raw_lines
    .filter(lambda line: line != headers)
    .map(lambda line: line.split("\t"))
    # Keep only the first two tab-separated fields: (user ID as int, query string).
    .map(lambda fields: (int(fields[0]), fields[1]))
)
logs_rdd.take(5)

[(142, 'rentdirect.com'),
 (142, 'www.prescriptionfortime.com'),
 (142, 'staple.com'),
 (142, 'staple.com'),
 (142, 'www.newyorklawyersite.com')]

# Calculating Support
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/1c6acacd3b17051205704b5d323c83fc737e5db1 "Support equation")
<br>

### General Idea: Users as transactions and queries as groceries
All the queries that a user made will be treated as groceries bought in a transaction, <br>
and a user will be treated as if it was a transaction. <br>

### 1st step: Get query frequency across users (|X|)
We'll do it by giving the value of 1 for each query and then reduce by key and sum <br> <br>
Essentially, this step is about calculating **|X|** for each query, in the **"support"** equation <br>

In [4]:
# A user may repeat a query; keep each (user, query) pair once so that
# a user contributes at most 1 to any query's count.
distinct_queries = logs_rdd.distinct()
# |X| per query: emit (query, 1) for every remaining pair, then sum the ones per query.
query_ones = distinct_queries.map(lambda user_query: (user_query[1], 1))
query_freq = query_ones.reduceByKey(add)  # add(a, b) is the same as a + b
query_freq.take(5)

[('rentdirect.com', 1),
 ('staple.com', 2),
 ('www.newyorklawyersite.com', 1),
 ('207 ad2d 530', 1),
 ('frankmellace.com', 1)]

### 2nd Step: Get |T| - The Amount Of Users

In [5]:
# |T|: the number of distinct user IDs, i.e. distinct keys of logs_rdd.
users_amount = logs_rdd.keys().distinct().count()
users_amount

3248

### Calculate |X| / |T| for each query X

In [6]:
# supp(X) = |X| / |T|: only the count changes, the query key is kept as is.
support_x = query_freq.mapValues(lambda users_with_query: users_with_query / users_amount)
support_x.take(10)

[('rentdirect.com', 0.0003078817733990148),
 ('staple.com', 0.0006157635467980296),
 ('www.newyorklawyersite.com', 0.0003078817733990148),
 ('207 ad2d 530', 0.0003078817733990148),
 ('frankmellace.com', 0.0003078817733990148),
 ('ucs.ljx.com', 0.0003078817733990148),
 ('attornyleslie.com', 0.0003078817733990148),
 ('merit release appearance', 0.0003078817733990148),
 ('www.bonsai.wbff.org', 0.0003078817733990148),
 ('loislaw.com', 0.0003078817733990148)]

# Calculate Confidence
According to the following equation <br>
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/90324dedc399441696116eed3658fd17c5da4329 "Confidence equation")


As before, we'll treat each user ID as if it were a "transaction ID".  <br>
So in our case, Supp(xUy) is going to be the number of users who made both the x and y queries, <br>
divided by the total number of users. <br> <br>

### Supp(xUy)
1. Create a list of queries per user
2. Sort that list of queries per user
3. Calculate all the X & Y query pairs for each user.
4. Set the pairs to be the new key, with value of 1 (for counting)
5. Reduce by key to count the amount of users who queried both X & Y. ==> |xUy|
6. Divide |xUy| by |T|, which is the number of users ==> This will give us supp(xUy)

In [8]:
# Wrap each query in a one-element list, then concatenate the lists per user
# to obtain (user ID, [that user's distinct queries]).
single_query_lists = distinct_queries.mapValues(lambda query: [query])
user_queries = single_query_lists.reduceByKey(add)  # add on two lists concatenates them
user_queries.take(10)

[(142,
  ['rentdirect.com',
   'staple.com',
   'www.newyorklawyersite.com',
   '207 ad2d 530',
   'frankmellace.com',
   'ucs.ljx.com',
   'attornyleslie.com',
   'merit release appearance',
   'www.bonsai.wbff.org',
   'loislaw.com',
   'www.prescriptionfortime.com',
   'westchester.gov',
   'space.comhttp',
   'dfdf',
   'vaniqa.comh',
   'www.collegeucla.edu',
   'www.elaorg',
   'broadway.vera.org',
   'vera.org',
   'rapny.com',
   'whitepages.com']),
 (1268,
  ['sstack.com',
   'www.victoriacostumiere.com',
   'osteen-schatzberg.com',
   'www.buckmountianestates.com',
   'idx.techsolsc.com',
   'www.bridleandbit.com',
   'http www.flickr.com photos 88145967 n00 24368586 in pool-32148876 n00',
   'href a href alt a http www.flickr.com photos 88145967 n00 24368586 in pool-32148876 n00',
   'www.acevedoarabians.com',
   'adbuyer3.lycos.com',
   'www.pinerplantation.com',
   'ozark horse blankets',
   'www.ghostrockranch.com',
   'openrangeht.zachsairforce.com',
   'www.mecab.org',


In [9]:
def generate_pairs(tup):
    """Build every query pair of one user, each tagged with a count of 1.

    Args:
        tup: a (user_id, queries) pair; queries is a sequence of query strings.

    Returns:
        A list of ((query_a, query_b), 1) items, one per 2-combination of the
        queries taken in sorted order. Fewer than two queries yield an empty list.
    """
    # sorted() works on a copy, so the caller's list is not reordered as a side
    # effect. Sorting gives each pair one canonical order, so the same two
    # queries always produce the same key regardless of input order.
    ordered_queries = sorted(tup[1])
    return [(pair, 1) for pair in itertools.combinations(ordered_queries, 2)]

In [10]:
# Flatten the per-user pair lists into one RDD of ((query_a, query_b), 1) records.
query_pairs = user_queries.flatMap(generate_pairs)
query_pairs.take(100)

[(('207 ad2d 530', 'attornyleslie.com'), 1),
 (('207 ad2d 530', 'broadway.vera.org'), 1),
 (('207 ad2d 530', 'dfdf'), 1),
 (('207 ad2d 530', 'frankmellace.com'), 1),
 (('207 ad2d 530', 'loislaw.com'), 1),
 (('207 ad2d 530', 'merit release appearance'), 1),
 (('207 ad2d 530', 'rapny.com'), 1),
 (('207 ad2d 530', 'rentdirect.com'), 1),
 (('207 ad2d 530', 'space.comhttp'), 1),
 (('207 ad2d 530', 'staple.com'), 1),
 (('207 ad2d 530', 'ucs.ljx.com'), 1),
 (('207 ad2d 530', 'vaniqa.comh'), 1),
 (('207 ad2d 530', 'vera.org'), 1),
 (('207 ad2d 530', 'westchester.gov'), 1),
 (('207 ad2d 530', 'whitepages.com'), 1),
 (('207 ad2d 530', 'www.bonsai.wbff.org'), 1),
 (('207 ad2d 530', 'www.collegeucla.edu'), 1),
 (('207 ad2d 530', 'www.elaorg'), 1),
 (('207 ad2d 530', 'www.newyorklawyersite.com'), 1),
 (('207 ad2d 530', 'www.prescriptionfortime.com'), 1),
 (('attornyleslie.com', 'broadway.vera.org'), 1),
 (('attornyleslie.com', 'dfdf'), 1),
 (('attornyleslie.com', 'frankmellace.com'), 1),
 (('attorn

In [11]:
# Sum the 1s per pair key to get |xUy|: each user emits a given pair at most once,
# so the sum is the number of users who issued both queries of the pair.
pairs_frequency = query_pairs.reduceByKey(add)
pairs_frequency.take(50)

[(('.army.mil', 'cato'), 1),
 (('aim', 'mom rings'), 1),
 (('aol.com', 'home depot'), 1),
 (('bed bath beyond', 'georgia summer camp'), 1),
 (('billygene king movie', 'disney world vacation packages'), 1),
 (('examples of acronyms', 'hallmark free ecards'), 1),
 (('dayton daily news.com', 'www.sbcyahoo.net'), 1),
 (('69tgp', 'indian jokes'), 1),
 (('ac vs. dc', 'simply beautiful'), 1),
 (('ap exam dates', 'www.pacsun.co'), 1),
 (('dirtyfrog.com', 'spanish accent'), 1),
 (('pentium d', 'you belong to me lyrics judy garland'), 1),
 (('qss', "schindler';s list"), 1),
 (('beneficial credit', 'god old testamentanimals'), 1),
 (('beneficial credit', 'spiegel what motivates people to oppress others'), 1),
 (('ccbc', 'chem lawn'), 1),
 (('animal drum muppet show', 'meditrim alabama'), 1),
 (('california foods inc', 'internation flights'), 1),
 (('cocoa beach hotels', 'matthew gregory'), 1),
 (('anthem arizona pop warner football', 'national weather service'), 1),
 (('anthem arizona rentals', '

In [12]:
# supp(xUy) = |xUy| / |T|: the (x, y) key is kept unchanged.
support_xy = pairs_frequency.mapValues(lambda users_with_both: users_with_both / users_amount)

In [13]:
# Preview the first 100 ((x, y), supp(xUy)) records.
support_xy.take(100)

[(('foods to avoid when breast feeding', 'yahoo.com'), 0.0003078817733990148),
 (('lily pads breast', 'roth ira'), 0.0003078817733990148),
 (('auto tune up clinic', 'lucky shoals park'), 0.0003078817733990148),
 (('greater lilburn athletic association', 'ingles grocery'),
  0.0003078817733990148),
 (('how to remove adware from computer', 'mailbox'), 0.0003078817733990148),
 (('ingles grocery', 'youth track and field georgia'), 0.0003078817733990148),
 (('lowes', 'martha liing'), 0.0003078817733990148),
 (('meadowcreek youth football georgia', 'party city'), 0.0003078817733990148),
 (('shimmering butterfly party', 'wamu'), 0.0003078817733990148),
 (('10-item edinburgh postnatal depression scale epds .',
   'www.household bank.com'),
  0.0003078817733990148),
 (('cause of baby blues', 'outback menu'), 0.0003078817733990148),
 (('ediets.com', 'mrsawebmd'), 0.0003078817733990148),
 (('pc dominoes game', 'www.househol'), 0.0003078817733990148),
 (('cat in hat photos', 'pier 1 imports'), 0.0

# Supp(xUy) / Supp(x) = Confidence (X=>Y)
### Prepare the data and divide

In [16]:
# Re-key every pair by its first query X so it can be matched with supp(x):
#   ((X, Y), supp(xUy))  ->  (X, (Y, supp(xUy)))
support_xy_by_x = support_xy.map(lambda rec: (rec[0][0], (rec[0][1], rec[1])))
# After the join each record is a 2-tuple nested as (X, ((Y, supp(xUy)), supp(x))).
joined_support = support_xy_by_x.join(support_x)
joined_support.take(10)

[('my son in law asked me for a handjob',
  (('should i let my son inlaw see me naked', 0.0003078817733990148),
   0.0003078817733990148)),
 ('my son in law asked me for a handjob',
  (('stiff cock for my mother inlaw', 0.0003078817733990148),
   0.0003078817733990148)),
 ('my son in law asked me for a handjob',
  (('wifes mother taught us about oral sex', 0.0003078817733990148),
   0.0003078817733990148)),
 ('my son in law asked me for a handjob',
  (('son inlaws velvet tongue', 0.0003078817733990148),
   0.0003078817733990148)),
 ('my son in law asked me for a handjob',
  (('true mother inlaw sex stories', 0.0003078817733990148),
   0.0003078817733990148)),
 ('my son in law asked me for a handjob',
  (('son inlaw needed sexual release', 0.0003078817733990148),
   0.0003078817733990148)),
 ('my son in law asked me for a handjob',
  (('my son nlaw stares at my tits', 0.0003078817733990148),
   0.0003078817733990148)),
 ('my son in law asked me for a handjob',
  (('sex with my mother in

In [20]:
# Each joined_support record is a 2-tuple nested as (X, ((Y, supp(xUy)), supp(x))):
# rec[1][0] is the (Y, supp(xUy)) pair and rec[1][1] is supp(x); there is no rec[2].
# Confidence(X => Y) = supp(xUy) / supp(x), emitted as (X, Y, confidence).
conf_xy = joined_support.map(lambda rec: (rec[0], rec[1][0][0], rec[1][0][1] / rec[1][1]))