In [1]:
import nltk
import codecs
import string
import re, math, collections
import random

In [2]:
def kldiv(_s, _t):
    if (len(_s) == 0):
        return 1e33
 
    if (len(_t) == 0):
        return 1e33
 
    ssum = 0. + sum(_s.values())
    slen = len(_s)
 
    tsum = 0. + sum(_t.values())
    tlen = len(_t)
 
    vocabdiff = set(_s.keys()).difference(set(_t.keys()))
    lenvocabdiff = len(vocabdiff)
 
    """ epsilon """
    epsilon = min(min(_s.values())/ssum, min(_t.values())/tsum) * 0.001
 
    """ gamma """
    gamma = 1 - lenvocabdiff * epsilon
 
    # print "_s: %s" % _s
    # print "_t: %s" % _t
 
    """ Check if distribution probabilities sum to 1"""
    sc = sum([v/ssum for v in _s.itervalues()])
    st = sum([v/tsum for v in _t.itervalues()])
 
    if sc < 9e-6:
        print ("Sum P: %e, Sum Q: %e" % (sc, st))
        print ("*** ERROR: sc does not sum up to 1. Bailing out ..")
        sys.exit(2)
    if st < 9e-6:
        print ("Sum P: %e, Sum Q: %e" % (sc, st))
        print ("*** ERROR: st does not sum up to 1. Bailing out ..")
        sys.exit(2)
 
    div = 0.
    for t, v in _s.iteritems():
        pts = v / ssum
 
        ptt = epsilon
        if t in _t:
            ptt = gamma * (_t[t] / tsum)
 
        ckl = (pts - ptt) * math.log(pts / ptt)
 
        div +=  ckl
 
    return div

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [4]:
def process(lst):
    prccd_item_list=[]
    for tweet in lst:
        # Normalizing utf8 formatting
        tweet = tweet.decode("unicode-escape").encode("utf8").decode("utf8")
        tweet = tweet.encode("ascii","ignore")
        tweet = tweet.strip('\t\n\r')
        # 1. Lowercasing
        tweet = tweet.lower()
        # Word-Level
        tweet = re.sub(' +',' ',tweet) # replace multiple spaces with a single space
        #  2. Normalizing digits
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words if word.isdigit()]:
            tweet = tweet.replace(word, "D" * len(word))
        # 3. Normalizing URLs
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words if '/' in word or '.' in word and  len(word) > 3]:
            tweet = tweet.replace(word, "httpAddress")
        # 4. Normalizing username
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words if word[0] == '@' and len(word) > 1]:
            tweet = tweet.replace(word, "usrId")
        # 5. Removing special Characters
        punc = '@$%^&*()_+-={}[]:"|\'\~`<>/,'
        trans = string.maketrans(punc, ' '*len(punc))
        tweet = tweet.translate(trans)
        # 6. Normalizing +2 elongated char
        tweet = re.sub(r"(.)\1\1+",r'\1\1', tweet.decode('utf-8'))
        #print("[elong]", tweet)
        # 7. tokenization using tweetNLP
        tweet = ' '.join(simpleTokenize(tweet))
        #8. fix \n char
        tweet = tweet.replace('\n', ' ')

        prccd_item_list.append(tweet.strip())
    return prccd_item_list

In [5]:
def tokenize(_str):
    stopwords = ['and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will', 'in', 'it', 'to', 'that']
    unwanted_words = ["httpaddress", "usrid", "dd", "rt", "amp", "pm", " ", "'s", "n't", "\t", '``', "''", "", "//", "\\", "\\'s", "\\?"]
    tokens = collections.defaultdict(lambda: 0.)
    for m in re.finditer(r"(\w+)", _str, re.UNICODE):
        m = m.group(1).lower()
        if len(m) < 2: continue
        if m in stopwords : continue
        tokens[m] += 1
    return tokens

In [6]:
fp = codecs.open('Data/amazon/books/processed/neg_books.txt', 'r', 'utf-8',errors='ignore')
neg_books_doc_lst = fp.readlines()

In [7]:
fp = codecs.open('Data/amazon/books/processed/pos_books.txt', 'r', 'utf-8',errors='ignore')
pos_books_doc_lst = fp.readlines()

In [8]:
print(tokenize(neg_books_doc_lst[0]))

defaultdict(<function <lambda> at 0x7f045fdcdd70>, {u'slowly': 1.0, u'imbibe': 1.0, u'certain': 1.0, u'one': 1.0, u'clear': 1.0, u'anger': 1.0, u'children': 1.0, u'needs': 1.0, u'entities': 1.0, u'perhaps': 1.0, u'interesting': 1.0, u'readable': 1.0, u'heady': 1.0, u'its': 2.0, u'religion': 1.0, u'book': 1.0, u'various': 1.0, u'pretty': 1.0, u'speech': 1.0, u'you': 1.0, u'reading': 1.0, u'ofcourse': 1.0, u'very': 1.0, u'but': 1.0, u'thoughts': 2.0, u'cool': 1.0, u'on': 1.0, u'about': 1.0, u'has': 2.0, u'like': 1.0, u'of': 1.0, u'consumed': 1.0, u'author': 1.0, u'thin': 1.0, u'things': 1.0, u'leaves': 1.0, u'revisited': 1.0, u'page': 1.0, u'silence': 1.0})


In [9]:
print(tokenize(pos_books_doc_lst[0]))

defaultdict(<function <lambda> at 0x7f045fdcd668>, {u'inspiring': 1.0, u'help': 1.0, u'morals': 1.0, u'question': 1.0, u'who': 1.0, u'allows': 1.0, u'discover': 1.0, u'book': 1.0, u'spiritually': 1.0, u'you': 3.0, u'your': 1.0, u'mentally': 1.0, u'really': 1.0})


In [17]:
books_neg_f_dist_dict_sample1 = random.sample(neg_books_doc_lst, 10000)
books_pos_f_dist_dict_sample2 = random.sample(pos_books_doc_lst, 10000)

KL_DIV_scores_books_pos_neg_samples = []

for i in range(len(books_neg_f_dist_dict_sample1)):
    KL_DIV_scores_books_pos_neg_samples.append(kldiv(tokenize(books_neg_f_dist_dict_sample1[i])
                                                 ,tokenize(books_pos_f_dist_dict_sample2[i])))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_books_pos_neg_samples)/len(KL_DIV_scores_books_pos_neg_samples)
print("mean KL- div between books pos samples ",KL_DIV_mean)


('mean KL- div between books pos samples ', 3e+29)
