##Using the MRJob Class below  calculate the  KL divergence of the following two objects.

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
%%writefile kltext.txt
1.Data Science is an interdisciplinary field about processes and systems to extract knowledge or insights from large volumes of data in various forms (data in various forms, data in various forms, data in various forms), either structured or unstructured,[1][2] which is a continuation of some of the data analysis fields such as statistics, data mining and predictive analytics, as well as Knowledge Discovery in Databases.
2.Machine learning is a subfield of computer science[1] that evolved from the study of pattern recognition and computational learning theory in artificial intelligence.[1] Machine learning explores the study and construction of algorithms that can learn from and make predictions on data.[2] Such algorithms operate by building a model from example inputs in order to make data-driven predictions or decisions,[3]:2 rather than following strictly static program instructions.

Writing kltext.txt


##MRjob class for calculating pairwise similarity using K-L Divergence as the similarity measure

Job 1: create inverted index (assume just two objects) <P>
Job 2: calculate the similarity of each pair of objects 

In [4]:
import numpy as np
np.log(3)

1.0986122886681098

In [43]:
%%writefile kldivergence.py
from mrjob.job import MRJob
import re
import numpy as np
class kldivergence(MRJob):
    def mapper1(self, _, line):
        index = int(line.split('.',1)[0])
        letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
        count = {}
        for l in letter_list:
            if count.has_key(l):
                count[l] += 1
            else:
                count[l] = 1
        for key in count:
            yield key, [index, count[key]*1.0/len(letter_list)]


    def reducer1(self, key, values): #Ted added below
        newValues = [i for i in values] #put into list for reference
        pi = float(newValues[0][1]) #get pi value
        qi = float(newValues[1][1]) #get qi value
        yield None, pi * np.log(pi/qi) #yield so all will go to same reducer and calculate appropriate metric
    
    def reducer2(self, key, values):
        kl_sum = 0
        for value in values:
            kl_sum = kl_sum + value
        yield None, kl_sum
            
    def steps(self):
        return [self.mr(mapper=self.mapper1,
                        reducer=self.reducer1),
                self.mr(reducer=self.reducer2)]

if __name__ == '__main__':
    kldivergence.run()

Overwriting kldivergence.py


In [44]:
from kldivergence import kldivergence
mr_job = kldivergence(args=['kltext.txt'])
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access of the output 
    for line in runner.stream_output():
        print mr_job.parse_output_line(line)



(None, 0.08088278445318145)


For testing

In [17]:
%%writefile mapper.py
#!/usr/bin/python
import sys
import re
# input comes from STDIN (standard input)
for line in sys.stdin:
    index = int(line.split('.',1)[0])
    letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
    count = {}
    for l in letter_list:
        if count.has_key(l):
            count[l] += 1
        else:
            count[l] = 1
    for key in count:
        print key, [index, count[key]*1.0/len(letter_list)]

Overwriting mapper.py


In [None]:
%%writefile reducer.py
#!/usr/bin/python
for line in sys.stdin:
        pi = float(values[0][1])
        qi = float(values[1][1])
        yield None, pi * np.log(pi/qi)

In [21]:
!cat kltext.txt | python mapper.py | sort

a [1, 0.11078717201166181]
a [2, 0.08483290488431877]
b [1, 0.0058309037900874635]
b [2, 0.007712082262210797]
c [1, 0.04081632653061224]
c [2, 0.04884318766066838]
d [1, 0.05539358600583091]
d [2, 0.04113110539845758]
e [1, 0.07580174927113703]
e [2, 0.08997429305912596]
f [1, 0.029154518950437316]
f [2, 0.02313624678663239]
g [1, 0.014577259475218658]
g [2, 0.02570694087403599]
h [1, 0.01749271137026239]
h [2, 0.030848329048843187]
i [1, 0.09620991253644315]
i [2, 0.09254498714652956]
k [1, 0.0058309037900874635]
k [2, 0.005141388174807198]
l [1, 0.03206997084548105]
l [2, 0.04884318766066838]
m [1, 0.026239067055393587]
m [2, 0.03598971722365039]
n [1, 0.0641399416909621]
n [2, 0.08997429305912596]
o [1, 0.06997084548104957]
o [2, 0.07969151670951156]
p [1, 0.008746355685131196]
p [2, 0.02570694087403599]
r [1, 0.06705539358600583]
r [2, 0.07455012853470437]
s [1, 0.11078717201166181]
s [2, 0.04884318766066838]
t [1, 0.08163265306122448]
t [2, 0.09

Version using smoother

In [49]:
%%writefile kldivergenceSmooth.py
from mrjob.job import MRJob
import re
import numpy as np
from string import ascii_lowercase
class kldivergenceSmooth(MRJob):
    def mapper1(self, _, line):
        index = int(line.split('.',1)[0])
        letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
        count = {}
        for l in letter_list:
            if count.has_key(l):
                count[l] += 1
            else:
                count[l] = 1 + 1
        for l in ascii_lowercase:
            if count.has_key(l):
                continue
            else:
                count[l] = 1
        for key in count:
            yield key, [index, count[key]*1.0/(len(letter_list) + 24)]


    def reducer1(self, key, values): #Ted added below
        newValues = [i for i in values]
        pi = float(newValues[0][1])
        qi = float(newValues[1][1])
        yield None, pi * np.log(pi/qi)
    
    def reducer2(self, key, values):
        kl_sum = 0
        for value in values:
            kl_sum = kl_sum + value
        yield None, kl_sum
            
    def steps(self):
        return [self.mr(mapper=self.mapper1,
                        reducer=self.reducer1),
                self.mr(reducer=self.reducer2)]

if __name__ == '__main__':
    kldivergenceSmooth.run()

Overwriting kldivergenceSmooth.py


In [50]:
from kldivergenceSmooth import kldivergenceSmooth
mr_job = kldivergenceSmooth(args=['kltext.txt'])
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access of the output 
    for line in runner.stream_output():
        print mr_job.parse_output_line(line)



(None, 0.06823525136041805)
