In [82]:
import nltk
import codecs
import string
import re, math, collections
import random

In [83]:
# %load twokenize.py
"""
Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
This tokenizer code has gone through a long history:

(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
       TweetMotif: Exploratory Search and Topic Summarization for Twitter.
       Brendan O'Connor, Michel Krieger, and David Ahn.
       ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
(2b) Jason Baldridge and David Snyder ported it to Scala
(3) Brendan bugfixed the Scala port and merged with POS-specific changes
    for the CMU ARK Twitter POS Tagger  
(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)

Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP

There have been at least 2 other Java ports, but they are not in the lineage for the code here.

Ported to Python by Myle Ott <myleott@gmail.com>.
"""

from __future__ import print_function

import operator
import re
import HTMLParser

def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'

Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)

punctChars = r"['\"“”‘’.?!…,:;]"
#punctSeq   = punctChars+"+"	#'anthem'. => ' anthem '.
punctSeq   = r"['\"“”‘’]+|[.?!,…]+|[:;]+"	#'anthem'. => ' anthem ' .
entity     = r"&(?:amp|lt|gt|quot);"
#  URLs


# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
# If you actually empirically test it the results are bad.
# Please see https://github.com/brendano/ark-tweet-nlp/pull/9

urlStart1  = r"(?:https?://|\bwww\.)"
commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
ccTLDs	 = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf_basic_trials|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"	#TODO: remove obscure country domains?
urlStart2  = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody    = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlEnd     = r"(?:\.\.+|[<>]|\s|$)"
url        = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"


# Numeric
timeLike   = r"\d+(?::\d+){1,2}"
#numNum     = r"\d+\.\d+"
numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
numComb	 = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?".encode('utf-8')

# Abbreviations
boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
aa1  = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
aa2  = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
separators  = "(?:--+|―|—|~|–|=)"
decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)".encode('utf-8')
thingsThatSplitWords = r"[^\s\.,?\"]"
embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"

#  Emoticons
# myleott: in Python the (?iu) flags affect the whole expression
#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
normalEyes = "[:=]" # 8 and x are eyes but cause problems
wink = "[;]"
noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
happyMouths = r"[D\)\]\}]+"
sadMouths = r"[\(\[\{]+"
tongue = "[pPd3]+"
otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned

# mouth repetition examples:
# @aliciakeys Put it in a love song :-))
# @hellocalyclops =))=))=)) Oh well

# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfCenter = r"(?:[\.]|[_-]+)"
bfRight = r"\2"
s3 = r"(?:--['\"])"
s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5

eeLeft = r"[＼\\ƪԄ\(（<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ）/／ノﾉ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight

oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"


emoticon = regex_or(
        # Standard version  :) :( :] :D :P
        "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

        # reversed version (: D:  use positive lookbehind to remove "(word):"
        # because eyes on the right side is more ambiguous with the standard usage of : ;
        regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

        #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
        eastEmote.replace("2", "1", 1), basicface,
        # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]  
        # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

        # myleott: o.O and O.o are two of the biggest sources of differences
        #          between this and the Java version. One little hack won't hurt...
        oOEmote
)

Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))

# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
# "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
# "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
# "hello (@person)" ==> "hello (@person )"  WRONG
# "hello (@person)" ==> "hello ( @person )"  RIGHT
# ... Some sort of weird interaction with edgepunct I guess, because edgepunct 
# has poor content-symbol detection.

# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
# If you want good hashtag identification, use a different regex.
Hashtag = "#[a-zA-Z0-9_]+"  #optional: lookbehind for \b
#optional: lookbehind for \b, max length 15
AtMention = "[@＠][a-zA-Z0-9_]+"

# I was worried this would conflict with at-mentions
# but seems ok in sample of 5800: 7 changes all email fixes
# http://www.regular-expressions.info/email.html
Bound = r"(?:\W|^|$)"
Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"

# We will be tokenizing using these regexps as delimiters
# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
Protected  = re.compile(
    unicode(regex_or(
        Hearts,
        url,
        Email,
        timeLike,
        #numNum,
        numberWithCommas,
        numComb,
        emoticon,
        Arrows,
        entity,
        punctSeq,
        arbitraryAbbrev,
        separators,
        decorations,
        embeddedApostrophe,
        Hashtag,  
        AtMention
    ).decode('utf-8')), re.UNICODE)

# Edge punctuation
# Want: 'foo' => ' foo '
# While also:   don't => don't
# the first is considered "edge punctuation".
# the second is word-internal punctuation -- don't want to mess with it.
# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.  
# I remember it causing lots of trouble in the past as well.  Would be good to revisit or eliminate.

# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars    = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars    = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct    = "[" + edgePunctChars + "]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)"  # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft  = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)

def splitEdgePunct(input):
    input = EdgePunctLeft.sub(r"\1\2 \3", input)
    input = EdgePunctRight.sub(r"\1 \2\3", input)
    return input

# The main work of tokenizing a tweet.
def simpleTokenize(text):

    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)

    textLength = len(splitPunctText)
    
    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain.  Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if (match.start() != match.end()): #unnecessary?
            bads.append( [splitPunctText[match.start():match.end()]] )
            badSpans.append( (match.start(), match.end()) )

    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like 
    #     List((2,5), (8,10)) 
    # to create 
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength
    # has an even length and no indices are the same
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)

    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i+1]]
        splitstr = goodstr.strip().split(" ")
        splitGoods.append(splitstr)

    #  Reinterpolate the 'good' and 'bad' Lists, ensuring that
    #  additonal tokens from last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    #splitStr = []
    #for tok in zippedStr:
    #    splitStr.extend(splitToken(tok))
    #zippedStr = splitStr
    
    return zippedStr

def addAllnonempty(master, smaller):
    for s in smaller:
        strim = s.strip()
        if (len(strim) > 0):
            master.append(strim)
    return master

# "foo   bar " => "foo bar"
def squeezeWhitespace(input):
    return Whitespace.sub(" ", input).strip()

# Final pass tokenization based on special patterns
def splitToken(token):
    m = Contractions.search(token)
    if m:
        return [m.group(1), m.group(2)]
    return [token]

# Assume 'text' has no HTML escaping.
def tokenize(text):
    return simpleTokenize(squeezeWhitespace(text))


# Twitter text comes HTML-escaped, so unescape it.
# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
def normalizeTextForTagger(text):
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text

# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
# 
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text.
def tokenizeRawTweetText(text):
    tokens = tokenize(normalizeTextForTagger(text))
    return tokens



In [84]:
def process(lst):
    prccd_item_list=[]
    for tweet in lst:
        # Normalizing utf8 formatting
        tweet = tweet.decode("unicode-escape").encode("utf8").decode("utf8")
        tweet = tweet.encode("ascii","ignore")
        tweet = tweet.strip('\t\n\r')
        # 1. Lowercasing
        tweet = tweet.lower()
        # Word-Level
        tweet = re.sub(' +',' ',tweet) # replace multiple spaces with a single space
        #  2. Normalizing digits
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words if word.isdigit()]:
            tweet = tweet.replace(word, "D" * len(word))
        # 3. Normalizing URLs
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words if '/' in word or '.' in word and  len(word) > 3]:
            tweet = tweet.replace(word, "httpAddress")
        # 4. Normalizing username
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words if word[0] == '@' and len(word) > 1]:
            tweet = tweet.replace(word, "usrId")
        # 5. Removing special Characters
        punc = '@$%^&*()_+-={}[]:"|\'\~`<>/,'
        trans = string.maketrans(punc, ' '*len(punc))
        tweet = tweet.translate(trans)
        # 6. Normalizing +2 elongated char
        tweet = re.sub(r"(.)\1\1+",r'\1\1', tweet.decode('utf-8'))
        #print("[elong]", tweet)
        # 7. tokenization using tweetNLP
        tweet = ' '.join(simpleTokenize(tweet))
        #8. fix \n char
        tweet = tweet.replace('\n', ' ')

        prccd_item_list.append(tweet.strip())
    return prccd_item_list

In [85]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [86]:
def generat_word_dist_bigrams(filename):
    
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    
    fp = codecs.open(filename, 'r', 'utf-8',errors='ignore')

    words = nltk.word_tokenize(fp.read())

    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    # Remove numbers
    words = [word for word in words if not word.isnumeric()]

    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]

    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in default_stopwords]
    
    words = [clean_str(word) for word in words]
    
    words = nltk.bigrams(words)

    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    return fdist
    # Output top 50 words

    #for word, frequency in fdist.most_common(50):
        #print(u'{};{}'.format(word, frequency))

In [87]:
def process_tweets (filename):
    with open(filename) as f:
        content = list(line for line in (l.strip() for l in f) if line)
        # you may also want to remove whitespace characters like `\n` at the end of each line
    tweets = [x.strip() for x in content] 
    processed_tweets = process(tweets)
    return processed_tweets


In [88]:
def generate_words_tweets_dis(tweet):
    
    words = []
    
    unwanted_words = ["httpaddress", "usrid", "dd", "rt", "amp", "pm", " ", "'s", "n't", "\t", '``', "''", "", "//", "\\", "\\'s", "\\?"]

    
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    
    words = nltk.word_tokenize(tweet)

    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    # Remove numbers
    words = [word for word in words if not word.isnumeric()]

    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]

    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]


    # Remove stopwords
    words = [word for word in words if word not in default_stopwords]
    
    words = [clean_str(word) for word in words]
    
    words = [word for word in words if word not in unwanted_words]

    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    return fdist

In [89]:
def generat_word_dist(filename):
    
    unwanted_words = ["'s"," ","n't","\t", '``', "''", "", "//", "\\", "\\'s", "\\?"]
    
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    
    fp = codecs.open(filename, 'r', 'utf-8',errors='ignore')
    
    words = nltk.word_tokenize(fp.read())
    
    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    # Remove numbers
    words = [word for word in words if not word.isnumeric()]

    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]

    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in default_stopwords]
    
    words = [clean_str(word) for word in words]
    
    words = [word for word in words if word not in unwanted_words]
    
    print(len(words))    
    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    return fdist

In [90]:
def generat_word_dist_for_doc(document):
    
    unwanted_words = ["'s"," ","n't","\t", '``', "''", "", "//", "\\", "\\'s", "\\?"]
    
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    
    words = nltk.word_tokenize(document)
   
    
    #words = [unicode(word,'utf-8') for word in words]
    
    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    # Remove numbers
    words = [word for word in words if not word.isnumeric()]

    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]

    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in default_stopwords]
    
    words = [clean_str(word) for word in words]
    
    words = [word for word in words if word not in unwanted_words]
    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    return fdist

In [91]:
def kldiv(_s, _t):
    if (len(_s) == 0):
        return 1e33
 
    if (len(_t) == 0):
        return 1e33
 
    ssum = 0. + sum(_s.values())
    slen = len(_s)
 
    tsum = 0. + sum(_t.values())
    tlen = len(_t)
 
    vocabdiff = set(_s.keys()).difference(set(_t.keys()))
    lenvocabdiff = len(vocabdiff)
 
    """ epsilon """
    epsilon = min(min(_s.values())/ssum, min(_t.values())/tsum) * 0.001
 
    """ gamma """
    gamma = 1 - lenvocabdiff * epsilon
 
    # print "_s: %s" % _s
    # print "_t: %s" % _t
 
    """ Check if distribution probabilities sum to 1"""
    sc = sum([v/ssum for v in _s.itervalues()])
    st = sum([v/tsum for v in _t.itervalues()])
 
    if sc < 9e-6:
        print ("Sum P: %e, Sum Q: %e" % (sc, st))
        print ("*** ERROR: sc does not sum up to 1. Bailing out ..")
        sys.exit(2)
    if st < 9e-6:
        print ("Sum P: %e, Sum Q: %e" % (sc, st))
        print ("*** ERROR: st does not sum up to 1. Bailing out ..")
        sys.exit(2)
 
    div = 0.
    for t, v in _s.iteritems():
        pts = v / ssum
 
        ptt = epsilon
        if t in _t:
            ptt = gamma * (_t[t] / tsum)
 
        ckl = (pts - ptt) * math.log(pts / ptt)
 
        div +=  ckl
 
    return div

In [92]:
# word freq distribution in PV dataset
fp = codecs.open('Data/All_Pos_PV_DS.text', 'r', 'utf-8',errors='ignore')
doc_lst = fp.readlines()
Pv_doc_word_freq_dist = map (generat_word_dist_for_doc, doc_lst)

In [93]:
Pv_doc_word_freq_dist[0]

FreqDist({u'affiliated': 1,
          u'beni': 1,
          u'day': 1,
          u'group': 1,
          u'members': 1,
          u'monday': 1,
          u'oulbane': 1,
          u'protest': 2,
          u'protesters': 1,
          u'skikda': 1,
          u'staged': 2,
          u'unpef': 2})

In [94]:
# word freq distribution in reuters violence negative dataset
fp = codecs.open('Data/reuters_negative_path.txt', 'r', 'utf-8',errors='ignore')
doc_lst = fp.readlines()
reuters_f_dist = map (generat_word_dist_for_doc, doc_lst)

In [117]:
# word freq distribution in books neg reviews
fp = codecs.open('Data/amazon/books/processed/neg_books.txt', 'r', 'utf-8',errors='ignore')
doc_lst = fp.readlines()
books_neg_f_dist = map (generat_word_dist_for_doc, doc_lst)

In [118]:
# word freq distribution in books po reviews
fp = codecs.open('Data/amazon/books/processed/pos_books.txt', 'r', 'utf-8',errors='ignore')
doc_lst = fp.readlines()
books_pos_f_dist = map (generat_word_dist_for_doc, doc_lst)

In [119]:
# word freq distribution in movies neg reviews
fp = codecs.open('Data/amazon/movies/processed/neg_movies.txt', 'r', 'utf-8',errors='ignore')
doc_lst = fp.readlines()
movies_neg_f_dist = map (generat_word_dist_for_doc, doc_lst)

In [120]:
# word freq distribution in movies pos reviews
fp = codecs.open('Data/amazon/movies/processed/pos_movies.txt', 'r', 'utf-8',errors='ignore')
doc_lst = fp.readlines()
movies_pos_f_dist = map (generat_word_dist_for_doc, doc_lst)

In [95]:
# word freq distribution in turkish tweets dataset
turkish_tweets = process_tweets('Data/turkish_protests_unduplicated_100000_text.txt')

In [96]:
turkish_tweets = process_tweets('Data/turkish_protests_unduplicated_100000_text.txt')

In [97]:
# word freq distribution in unlabeled turkish tweets
turkish_tweets_f_dist = map(generate_words_tweets_dis, turkish_tweets)

In [121]:
CF_pos_tweets = process_tweets('Data/turkish_protest_test_pos_prccd2.txt')

In [123]:
CF_pos_tweets_f_dist = map (generate_words_tweets_dis, CF_pos_tweets)

In [124]:
CF_neg_tweets = process_tweets('Data/turkish_protest_test_neg_prccd2.txt')

In [125]:
CF_neg_tweets_f_dist = map(generate_words_tweets_dis, CF_neg_tweets)

In [98]:
def print_word_frequencies_file(freq_dist, filePath):
    term_dict = {}
    with open(filePath,'w') as f:
        for word, frequency in freq_dist.most_common(1000000):
            f.write(u'{} {}'.format(word, frequency)+ '\n')
            term_dict[word] = frequency
    return term_dict        

In [99]:
def print_word_frequencies(freq_dist):
    term_dict = {}
    for word, frequency in freq_dist.most_common(1000000):
        term_dict[word] = frequency
    return term_dict    

In [100]:
def merge_dict (x , y):
    z = x.copy()
    z.update(y)
    return z

In [101]:
pv_f_dist_dict = map(print_word_frequencies, Pv_doc_word_freq_dist)

In [102]:
turkish_tweets_f_dist_dict = map(print_word_frequencies, turkish_tweets_f_dist)

In [103]:
reuters_f_dist_dict = map(print_word_frequencies, reuters_f_dist)

In [126]:
books_pos_f_dist_dict = map(print_word_frequencies, books_pos_f_dist)

In [127]:
books_neg_f_dist_dict = map(print_word_frequencies, books_neg_f_dist)

In [277]:
books_neg_f_dist_dict[0]

{u'1 2': 1,
 u'anger': 1,
 u'author': 1,
 u'book': 1,
 u'certain': 1,
 u'children': 1,
 u'clear': 1,
 u'consumed': 1,
 u'cool': 1,
 u'entities': 1,
 u'heady': 1,
 u'imbibe': 1,
 u'interesting': 1,
 u'leaves': 1,
 u'like': 1,
 u'needs': 1,
 u'ofcourse': 1,
 u'one': 1,
 u'page': 1,
 u'perhaps': 1,
 u'pretty': 1,
 u'readable': 1,
 u'reading': 1,
 u'religion': 1,
 u'revisited': 1,
 u'silence': 1,
 u'slowly': 1,
 u'speech': 1,
 u'thin': 1,
 u'things': 1,
 u'thoughts': 2,
 u'various': 1}

In [128]:
movies_neg_f_dist_dict = map(print_word_frequencies, movies_neg_f_dist)

In [129]:
movies_pos_f_dist_dict = map(print_word_frequencies, movies_pos_f_dist)

In [130]:
CF_pos_tweets_f_dist_dict = map(print_word_frequencies, CF_pos_tweets_f_dist)

In [131]:
CF_neg_tweets_f_dist_dict = map(print_word_frequencies, CF_neg_tweets_f_dist)

In [104]:
KL_DIV_scores_PV_sample = []

for i in range(len(pv_f_dist_dict)):
    KL_DIV_scores_PV_sample.append(kldiv(pv_f_dist_dict[i],pv_f_dist_dict[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_PV_sample)/len(KL_DIV_scores_PV_sample)
print("mean KL- div between PV samples",KL_DIV_mean)

mean KL- div between PV samples 0.0


In [276]:
print(encode(pv_f_dist_dict[0].keys()))

NameError: name 'encode' is not defined

In [222]:
pv_f_dist_dict_sample1 = random.sample(pv_f_dist_dict[0],25000)
pv_f_dist_dict_sample2 = random.sample(pv_f_dist_dict,25000)

KL_DIV_scores_PV_samples = []

for i in range(len(pv_f_dist_dict_sample1)):
    KL_DIV_scores_PV_samples.append(kldiv(pv_f_dist_dict_sample1[i],pv_f_dist_dict_sample2[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_PV_samples)/len(KL_DIV_scores_PV_samples)
print("mean KL- div between PV samples",KL_DIV_mean)

mean KL- div between PV samples 7.10225080582


In [115]:
reuters_f_dist_dict_sample1 = random.sample(reuters_f_dist_dict,25000)
reuters_f_dist_dict_sample2 = random.sample(reuters_f_dist_dict,25000)

KL_DIV_scores_reuters_samples = []

for i in range(len(reuters_f_dist_dict_sample1)):
    KL_DIV_scores_reuters_samples.append(kldiv(reuters_f_dist_dict_sample1[i],reuters_f_dist_dict_sample2[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_reuters_samples)/len(KL_DIV_scores_reuters_samples)
print("mean KL- div between reuters samples",KL_DIV_mean)

mean KL- div between reuters samples 6.85191514129


In [114]:
KL_DIV_scores_tweets_sample1 = random.sample(turkish_tweets_f_dist_dict,25000)
KL_DIV_scores_tweets_sample2 = random.sample(turkish_tweets_f_dist_dict,25000)

KL_DIV_scores_tweets_samples = []

for i in range(len(KL_DIV_scores_tweets_sample1)):
    KL_DIV_scores_tweets_samples.append(kldiv(KL_DIV_scores_tweets_sample1[i],KL_DIV_scores_tweets_sample2[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_tweets_samples)/len(KL_DIV_scores_tweets_samples)
print("mean KL- div between tweets samples",KL_DIV_mean)

mean KL- div between tweets samples 2.848e+31


In [109]:
pv_f_dist_dict_samples = random.sample(pv_f_dist_dict,25000)
KL_DIV_scores_PV_reuters = []

for i in range(len(pv_f_dist_dict_samples)):
    KL_DIV_scores_PV_reuters.append(kldiv(pv_f_dist_dict_samples[i],reuters_f_dist_dict[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_PV_reuters)/len(KL_DIV_scores_PV_reuters)
print("mean KL- div between PV DS (positive examples) and Reuters (negative examples)",KL_DIV_mean)

mean KL- div between PV DS (positive examples) and Reuters (negative examples) 7.01718673072


In [77]:
pv_f_dist_dict_samples = random.sample(pv_f_dist_dict,25000)
turkish_tweets_f_dist_dict_sample = random.sample(turkish_tweets_f_dist_dict, 25000)

KL_DIV_scores_PV_tweets = []

for i in range(len(pv_f_dist_dict_samples)):
    KL_DIV_scores_PV_tweets.append(kldiv(pv_f_dist_dict_samples[i],turkish_tweets_f_dist_dict_sample[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_PV_tweets)/len(KL_DIV_scores_PV_tweets)
print(" mean KL-DIV between PV (labeled) and tweets (unlabeled):",KL_DIV_mean)

 mean KL-DIV between PV (labeled) and tweets (unlabeled): 1.432e+31


In [116]:
turkish_tweets_f_dist_dict_sample = random.sample(turkish_tweets_f_dist_dict, 25000)

KL_DIV_scores_tweets_reuters = []

for i in range(len(turkish_tweets_f_dist_dict_sample)):
    KL_DIV_scores_tweets_reuters.append(kldiv(turkish_tweets_f_dist_dict_sample[i],reuters_f_dist_dict[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_tweets_reuters)/len(KL_DIV_scores_tweets_reuters)
print("mean KL- div between tweets and Reuters (negative examples)",KL_DIV_mean)

mean KL- div between tweets and Reuters (negative examples) 1.536e+31


In [147]:
books_pos_f_dist_dict_sample = random.sample(books_pos_f_dist_dict, 10000)
books_neg_f_dist_dict_sample = random.sample(books_neg_f_dist_dict, 10000)

KL_DIV_scores_books = []

for i in range(len(books_pos_f_dist_dict_sample)):
    KL_DIV_scores_books.append(kldiv(books_pos_f_dist_dict_sample[i],books_neg_f_dist_dict_sample[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_books)/len(KL_DIV_scores_books)
print("mean KL- div between books pos and neg ",KL_DIV_mean)

mean KL- div between books pos and neg  7e+29


In [232]:
movies_pos_f_dist_dict_sample = random.sample(movies_pos_f_dist_dict, 9445)
movies_neg_f_dist_dict_sample = random.sample(movies_neg_f_dist_dict, 9445)

KL_DIV_scores_movies = []

for i in range(len(movies_pos_f_dist_dict_sample)):
    KL_DIV_scores_movies.append(kldiv(movies_pos_f_dist_dict_sample[i],movies_neg_f_dist_dict_sample[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_movies)/len(KL_DIV_scores_movies)
print("mean KL- div between movies pos and neg ",KL_DIV_mean)

mean KL- div between movies pos and neg  4.23504499735e+29


In [184]:
books_pos_f_dist_dict_sample1 = random.sample(books_pos_f_dist_dict, 10000)
books_pos_f_dist_dict_sample2 = random.sample(books_pos_f_dist_dict, 10000)

KL_DIV_scores_books_pos_samples = []

for i in range(len(books_pos_f_dist_dict_sample1)):
    KL_DIV_scores_books_pos_samples.append(kldiv(books_pos_f_dist_dict_sample1[i],books_pos_f_dist_dict_sample2[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_books_pos_samples)/len(KL_DIV_scores_books_pos_samples)
print("mean KL- div between books pos samples ",KL_DIV_mean)

mean KL- div between books pos samples  4e+29


In [215]:
movies_pos_f_dist_dict_sample1 = random.sample(movies_pos_f_dist_dict, 10000)
movies_pos_f_dist_dict_sample2 = random.sample(movies_pos_f_dist_dict, 10000)

KL_DIV_scores_movies_pos_samples = []

for i in range(len(movies_pos_f_dist_dict_sample1)):
    KL_DIV_scores_movies_pos_samples.append(kldiv(movies_pos_f_dist_dict_sample1[i],movies_pos_f_dist_dict_sample2[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_movies_pos_samples)/len(KL_DIV_scores_movies_pos_samples)
print("mean KL- div between movies pos samples ",KL_DIV_mean)

mean KL- div between movies pos samples  1e+29


In [243]:
books_pos_f_dist_dict_sample = random.sample(books_pos_f_dist_dict, 10000)
movies_pos_f_dist_dict_sample = random.sample(movies_pos_f_dist_dict, 10000)

KL_DIV_scores_books_movies = []

for i in range(len(books_pos_f_dist_dict_sample)):
    KL_DIV_scores_books_movies.append(kldiv(books_pos_f_dist_dict_sample[i],movies_pos_f_dist_dict_sample[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_books_movies)/len(KL_DIV_scores_books_movies)
print("mean KL- div between pos books and movies ",KL_DIV_mean)

mean KL- div between books and movies  1e+29


In [248]:
books_neg_f_dist_dict_sample = random.sample(books_neg_f_dist_dict, 9445)
movies_neg_f_dist_dict_sample = random.sample(movies_neg_f_dist_dict, 9445)

KL_DIV_scores_books_movies_neg = []

for i in range(len(books_neg_f_dist_dict_sample)):
    KL_DIV_scores_books_movies_neg.append(kldiv(books_neg_f_dist_dict_sample[i],movies_neg_f_dist_dict_sample[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_books_movies_neg)/len(KL_DIV_scores_books_movies_neg)
print("mean KL- div between neg books and movies ",KL_DIV_mean)

mean KL- div between pos books and movies  5.29380624669e+29


In [254]:
CF_tweets_pos_f_dist_dict_sample = random.sample(CF_pos_tweets_f_dist_dict, 54)
CF_tweets_neg_f_dist_dict_sample = random.sample(CF_neg_tweets_f_dist_dict, 54)

KL_DIV_scores_CF_tweets = []

for i in range(len(CF_tweets_pos_f_dist_dict_sample)):
    KL_DIV_scores_CF_tweets.append(kldiv(CF_tweets_pos_f_dist_dict_sample[i],CF_tweets_neg_f_dist_dict_sample[i]))
    
KL_DIV_mean= reduce(lambda x, y: x+y,KL_DIV_scores_CF_tweets)/len(KL_DIV_scores_CF_tweets)
print("mean KL- div between CF tweets pos and neg ",KL_DIV_mean)

mean KL- div between CF tweets pos and neg  6.78159870939


In [255]:
def tokenize(_str):
    stopwords = ['and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will', 'in', 'it', 'to', 'that']
    tokens = collections.defaultdict(lambda: 0.)
    for m in re.finditer(r"(\w+)", _str, re.UNICODE):
        m = m.group(1).lower()
        if len(m) < 2: continue
        if m in stopwords: continue
        tokens[m] += 1
 
    return tokens

In [278]:
d1 = "kill me"
d2 = "money bank"

print(tokenize(d1))
print ("KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2)))
print ("KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1)))

defaultdict(<function <lambda> at 0x7f3d4e1318c0>, {'me': 1.0, 'kill': 1.0})
KL-divergence between d1 and d2: 6.9008475237
KL-divergence between d2 and d1: 6.9008475237


In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer

text_files = ['Data/All_Pos_PV_DS.text', 'Data/reuters_negative_path.txt','Data/turkish_protests_unduplicated_100000_text.txt']

pv_file = codecs.open('Data/All_Pos_PV_DS.text', 'r', 'utf-8',errors='ignore').read()
reuters_file = codecs.open('Data/reuters_negative_path.txt', 'r', 'utf-8',errors='ignore').read()
pv_retures = pv_file + reuters_file
tweets_file = codecs.open('Data/turkish_protests_unduplicated_100000_text.txt', 'r', 'utf-8',errors='ignore').read()

documents= [pv_retures, tweets_file]
tfidf = TfidfVectorizer().fit_transform(documents)
pairwise_similarity = tfidf * tfidf.T

documents2= [pv_file, reuters_file, tweets_file]
tfidf2 = TfidfVectorizer().fit_transform(documents2)
pairwise_similarity2 = tfidf2 * tfidf2.T

In [102]:
cf_tweets_pos = codecs.open('Data/turkish_protest_test_pos_prccd2.txt', 'r', 'utf-8',errors='ignore').read()
cf_tweets_neg = codecs.open('Data/turkish_protest_test_neg_prccd2.txt', 'r', 'utf-8',errors='ignore').read()
cf_tweets = cf_tweets_pos + cf_tweets_neg

In [100]:
CF_tweets_docs= [cf_tweets_pos, cf_tweets_neg]
tfidf_cf_tweets = TfidfVectorizer().fit_transform(CF_tweets_docs)
cf_tweets_pairwise_similarity = tfidf_cf_tweets * tfidf_cf_tweets.T

In [103]:
source_target_test= [pv_retures, tweets_file, cf_tweets]
tfidf_source_target_test = TfidfVectorizer().fit_transform(source_target_test)
tfidf_source_target_test_pairwise_similarity = tfidf_source_target_test * tfidf_source_target_test.T

In [84]:
pairwise_similarity.A

array([[ 1.        ,  0.49462647],
       [ 0.49462647,  1.        ]])

In [85]:
pairwise_similarity2.A

array([[ 1.        ,  0.58096961,  0.45686109],
       [ 0.58096961,  1.        ,  0.32611657],
       [ 0.45686109,  0.32611657,  1.        ]])

In [101]:
cf_tweets_pairwise_similarity.A

array([[ 1.        ,  0.89386843],
       [ 0.89386843,  1.        ]])

In [104]:
tfidf_source_target_test_pairwise_similarity.A

array([[ 1.        ,  0.47272487,  0.26941537],
       [ 0.47272487,  1.        ,  0.35833355],
       [ 0.26941537,  0.35833355,  1.        ]])