In [1]:
import pyspark
import math
import itertools
from decimal import Decimal
from operator import add
sc = pyspark.SparkContext.getOrCreate()
sqlContext = pyspark.sql.SQLContext(sc)

# Loading the dataset as pyspark RDD of Key-value
Where the key is the anonymous user ID and the value is the query string

In [2]:
logs_rdd = sc.textFile("/usr/data/user-ct-test-collection-01.txt")

headers = logs_rdd.first()
logs_rdd = logs_rdd.filter(lambda line: line != headers)
logs_rdd = logs_rdd.map(lambda line: line.split("\t"))
logs_rdd = logs_rdd.map(lambda line: (int(line[0]), line[1]))
logs_rdd.take(5)

[(142, 'rentdirect.com'),
 (142, 'www.prescriptionfortime.com'),
 (142, 'staple.com'),
 (142, 'staple.com'),
 (142, 'www.newyorklawyersite.com')]

# Calculating Support
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/1c6acacd3b17051205704b5d323c83fc737e5db1 "Support equation")
<br>

### General Idea: Users as transactions and queries as groceries
All the queries that a user made will be treated as groceries bought in a transaction, <br>
and a user will be treated as if it was a transaction. <br>

### 1st step: Get query frequency across users (|X|)
We'll do it by giving the value of 1 for each query and then reduce by key and sum <br> <br>
Essentially, this step is about calculating **|X|** for each query, in the **"support"** equation <br>

In [3]:
#Remove duplicates of query for a certain user
distinct_queries = logs_rdd.distinct()
#Count for each query - how many users made it
query_freq = distinct_queries.map(lambda k: (k[1],1)).reduceByKey(add) # add is equal to <lambda x,y: x+y>
query_freq.take(5)

[('mizuno.com', 6),
 ('first american chesapeake title services', 1),
 ('babycenter.com', 8),
 ('jesse mccartney', 13),
 ('kidsonlilne', 1)]

### 2nd Step: Get |T| - The Amount Of Users

In [4]:
users_amount = logs_rdd.map(lambda record: record[0]).distinct().count()
users_amount

65516

### Calculate |X| / |T| for each query X

In [5]:
support_x = query_freq.map(lambda qf: (qf[0], round(Decimal(qf[1] / users_amount), 8)))
support_x.take(10)

[('mizuno.com', Decimal('0.00009158')),
 ('first american chesapeake title services', Decimal('0.00001526')),
 ('babycenter.com', Decimal('0.00012211')),
 ('jesse mccartney', Decimal('0.00019842')),
 ('kidsonlilne', Decimal('0.00001526')),
 ('charles drew', Decimal('0.00001526')),
 ('ft mcpherson', Decimal('0.00001526')),
 ('new a c for hyundai elantra', Decimal('0.00001526')),
 ('anticholinergic', Decimal('0.00003053')),
 ('rxonline', Decimal('0.00003053'))]

# Calculate Confidence

**For both versions** we'll perform as follows:
1. Create a list of queries per user
2. Sort that list of queries per user
3. Calculate all the X & Y query pairs for each user.
4. Set the pairs to be the new key, with value of 1 (for counting)
5. Reduce by key to count the amount of users who queried both X & Y. ==> |xUy|


**Version 1** follows this equation <br> 
![](https://miro.medium.com/max/1400/1*E3mNKHcudWzHySGMvo_vPg.png "Confidence equation 1")
So whats left to do is to iterate over the X&Y pairs and devide their frequency in X's frequency

**Version** 2 follows this equation <br>
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/90324dedc399441696116eed3658fd17c5da4329 "Confidence equation 2")

As before, we'll treat each user ID as it was a "transaction ID".  <br>
So in our case, Supp(xUy) is going to be the amount of users where both x & y queries took place, <br>
devided by the total amount of users. <br> <br>

**Supp(xUy)**
6. Devide |xUy| by |T| which is the amount of users ==> This will give us supp(xUy)

In [6]:
user_queries = distinct_queries.map(lambda query: (query[0], [query[1]])).reduceByKey(lambda queries_list, new_query: queries_list + new_query)
user_queries.take(10)

[(217,
  ['mizuno.com',
   "p; .; p;' p; ' ;' ;';",
   'yahoo.com',
   '-',
   'susheme',
   'united.com',
   'bestasiancompany.com',
   'weather.com',
   'ask.com',
   'vietnam',
   'wellsfargo.com',
   'www.ngo-quen.org',
   'asiansexygoddess.com',
   'www.tabiecummings.com',
   'wanttickets.com',
   'lottery',
   'ameriprise.com',
   'buddylis']),
 (1337,
  ['cbc companies',
   'first american chesapeake title services',
   'ford',
   'first american lenders advantage',
   'www.americantitleinc.com',
   'credit plus solutions group',
   'www.mygeisinger.com',
   'shamokin dam kentucky fried chicken menu',
   'titlesourceinc',
   'cheasapeake appraisal and settlement services',
   'first american chesapeake',
   'www.national-reis.com',
   'searchtec',
   'integrated loan services',
   'pennsylvania real estate settlement services',
   'www.aculinkms.com',
   'security search and abstract',
   'integrated real estate',
   'michael keaton date of birth',
   'titlesourcein.com',
   'se

In [7]:
def generate_pairs(tup):
    tup[1].sort()
    generated_tupes = []
    for comb in itertools.combinations(tup[1],2):
        generated_tupes.append((comb, 1))
    return generated_tupes

In [8]:
query_pairs = user_queries.flatMap(generate_pairs)
query_pairs.take(100)

[(('-', 'ameriprise.com'), 1),
 (('-', 'asiansexygoddess.com'), 1),
 (('-', 'ask.com'), 1),
 (('-', 'bestasiancompany.com'), 1),
 (('-', 'buddylis'), 1),
 (('-', 'lottery'), 1),
 (('-', 'mizuno.com'), 1),
 (('-', "p; .; p;' p; ' ;' ;';"), 1),
 (('-', 'susheme'), 1),
 (('-', 'united.com'), 1),
 (('-', 'vietnam'), 1),
 (('-', 'wanttickets.com'), 1),
 (('-', 'weather.com'), 1),
 (('-', 'wellsfargo.com'), 1),
 (('-', 'www.ngo-quen.org'), 1),
 (('-', 'www.tabiecummings.com'), 1),
 (('-', 'yahoo.com'), 1),
 (('ameriprise.com', 'asiansexygoddess.com'), 1),
 (('ameriprise.com', 'ask.com'), 1),
 (('ameriprise.com', 'bestasiancompany.com'), 1),
 (('ameriprise.com', 'buddylis'), 1),
 (('ameriprise.com', 'lottery'), 1),
 (('ameriprise.com', 'mizuno.com'), 1),
 (('ameriprise.com', "p; .; p;' p; ' ;' ;';"), 1),
 (('ameriprise.com', 'susheme'), 1),
 (('ameriprise.com', 'united.com'), 1),
 (('ameriprise.com', 'vietnam'), 1),
 (('ameriprise.com', 'wanttickets.com'), 1),
 (('ameriprise.com', 'weather.co

In [9]:
pairs_frequency = query_pairs.reduceByKey(add)
# pairs_frequency is an RDD of this form: [((x1, y2), 1) , ((x3, y7), 12), ((query 1, query 2), freqency over users)]

### Conf Version 1 specific:

In [17]:
xycount_and_xcount = pairs_frequency.map(lambda x: (x[0][0], (x[0][1], x[1]))).join(query_freq)

In [21]:
xycount_and_xcount.take(5)

[('house bugs that bite', (('texas cyclones', 1), 1)),
 ('house bugs that bite', (('myspace.com', 1), 1)),
 ('house bugs that bite', (('pace makers recalled', 1), 1)),
 ('house bugs that bite', (('texas farm bureau', 1), 1)),
 ('house bugs that bite', (('terrys furniture', 1), 1))]

In [27]:
confidence_version_a = xycount_and_xcount.map(lambda x: ((x[0], x[1][0][0]), x[1][0][1] / x[1][1]))

In [28]:
confidence_version_a.take(5)

[(('destination wedding guides', 'www.target'), 1.0),
 (('destination wedding guides', 'underwriters lab'), 1.0),
 (('destination wedding guides', 'www.actsfullgospelcogic.com'), 1.0),
 (('destination wedding guides', 'www.nsbe.org'), 1.0),
 (('destination wedding guides', 'firewood pizz'), 1.0)]

# Results with Confidence of minimum threshold

In [30]:
conf_version_a_06 = confidence_version_a.filter(lambda x: 1> x[1] >= 0.6)
conf_version_a_08 = conf_version_a_06.filter(lambda x: x[1] > 0.8)
conf_version_a_09 = conf_version_a_08.filter(lambda x: x[1] > 0.9)

print(f"Confidence: 0.6 --- {conf_version_a_06.count()} , 0.8 --- {conf_version_a_08.count()} , 0.9 --- {conf_version_a_09.count()} ")

Confidence: 0.6 --- 7908 , 0.8 --- 40 , 0.9 --- 4 


In [31]:
conf_version_a_09.collect()

[(('gooe', 'google'), 0.9090909090909091),
 (('eay', 'ebay'), 0.9285714285714286),
 (('mspace.com', 'myspace.com'), 0.9375),
 (('www.mypace.com', 'www.myspace.com'), 0.9090909090909091)]

# Top 3:
1. mspace.com --> myspace.com | with confidence of **0.9375**
2. eay --> ebay | with confidence of **0.9285**
3. both: gooe --> google  AND  www.mypace.com --> www.myspace.com | with confidence of **0.9090**

# Write results to file:

In [32]:
def toCSVLine(data):
  return ','.join(str(d) for d in data)

csv_lines = conf_version_a_06.map(toCSVLine)
csv_lines.saveAsTextFile('/usr/output/confidence_xy_version1.csv')

# Version 2 confidence (that didn't work for me)

## Supp(xUy) / Supp(x) = Confidence (X=>Y)
### Prepare the data and devide

In [10]:
support_xy = pairs_frequency.map(lambda q: (q[0], round(Decimal(q[1]/users_amount), 8)))
# support_xy is an RDD of this form: [((x,y), support)]

In [11]:
joined_support = support_xy.map(lambda x: (x[0][0], (x[0][1], x[1]))).join(support_x).distinct()
# joined_support will be an RDD of the form: (X, (Y, supp(xUy)), supp(x))

In [12]:
joined_support.take(10)

[('j.a.w. iglehart',
  (('marianne koch nude photos', Decimal('0.00001526')),
   Decimal('0.00001526'))),
 ('african gay pron',
  (('wwww.randyblue.com', Decimal('0.00001526')), Decimal('0.00001526'))),
 ('apjuydguldid92pp5upp5c77p.affrontgl.com',
  (('product liability insurance in california', Decimal('0.00001526')),
   Decimal('0.00003053'))),
 ('st michael the archangel prayer for the sick',
  (('www.purplefrogpub.com', Decimal('0.00001526')), Decimal('0.00001526'))),
 ('daily kos',
  (('social security deaths', Decimal('0.00001526')), Decimal('0.00004579'))),
 ('americanweldingassociation',
  (('macdbobs.resterantcom', Decimal('0.00001526')), Decimal('0.00001526'))),
 ('can you get pregnate on your period',
  (('child moleters', Decimal('0.00001526')), Decimal('0.00001526'))),
 ('pungo va', (('va pilot', Decimal('0.00001526')), Decimal('0.00001526'))),
 ('google.com',
  (('orion.csuchico.edu', Decimal('0.00003053')), Decimal('0.04386715'))),
 ('google.com',
  (('supermanreturns.co

In [13]:
conf_xy = joined_support.map(lambda x: ((x[0], x[1][0][0]), round(Decimal(x[1][0][1] / x[1][1]))))

# High Confidence Count

In [14]:
conf_prepared = conf_xy.filter(lambda x: 1 > x[1] >= 0.6)
conf_06 = conf_prepared.filter(lambda x: 0.8 > x[1])
conf_08 = conf_prepared.filter(lambda x: 0.9 > x[1] >= 0.8)
conf_09 = conf_prepared.filter(lambda x: 1   > x[1] >= 0.9)

In [15]:
print(f"Queries with correlation confidence >= 0.6: {conf_06.count()}")

Queries with correlation confidence >= 0.6: 0


In [16]:
print(f"Queries with correlation confidence >= 0.8: {conf_08.count()}")

Queries with correlation confidence >= 0.8: 0


In [17]:
print(f"Queries with correlation confidence >= 0.9: {conf_09.count()}")

Queries with correlation confidence >= 0.9: 0


# Deeper Dive To Highest Confidence

In [18]:
top_3 = conf_09.top(3, key=lambda x: x[2])
top_3

[]

# Write Confidence Results To File

In [19]:
def toCSVLine(data):
  return ','.join(str(d) for d in data)

csv_lines = conf_xy.map(toCSVLine)
csv_lines.saveAsTextFile('/usr/output/confidence_xy.csv')