In [28]:
# Read in some tweets
import csv
jack_reader = csv.reader(open('data/jack.csv', 'r'))

columns = next(jack_reader)

In [29]:
jack_tweets = list(jack_reader)

In [30]:
# Filter out retweets
jack_tweets_no_rts = list(filter(lambda x: not x[1].startswith('RT'), jack_tweets))

for i in range(10):
    print(jack_tweets_no_rts[i][1])


Wow this is so great #MannequinChallenge
Whoa https://t.co/W8Uhbf9mVi
Great update. So much faster https://t.co/eIKzlrQaj0
@ericfranchi @adambain thx!
@mcnees @Support not sure how this got past us. Fixed (and thanks for reporting)
@dangillmor @mcnees @Support fixed
@stephgrimes @twitter @safety @cwarzel @BuzzFeedNews fixed
@NathanHChan never said this (but seems like good advice)
@cwarzel not sure how this slipped past us, but now it's fixed


In [31]:
# Canonicalize the tweet text as lowercase
jack_tweets_no_rts_lowercase = [tweet[1].lower() for tweet in jack_tweets_no_rts]

# Canonicalize links to "URL" and @mentions to "USER"
jack_tweets_no_rts_lowercase = [re.sub(r'(https?:\/\/t\.co\/\w+)', 'URL', tweet) for tweet in jack_tweets_no_rts_lowercase]
jack_tweets_normalized = [re.sub(r'(@\w+)', 'USER', tweet) for tweet in jack_tweets_no_rts_lowercase]

for i in range(10):
    print(jack_tweets_no_rts_lowercase[i])
    print("  -> {}".format(jack_tweets_normalized[i]))

    



  -> 
wow this is so great #mannequinchallenge
  -> wow this is so great #mannequinchallenge
whoa URL
  -> whoa URL
great update. so much faster URL
  -> great update. so much faster URL
@ericfranchi @adambain thx!
  -> USER USER thx!
@mcnees @support not sure how this got past us. fixed (and thanks for reporting)
  -> USER USER not sure how this got past us. fixed (and thanks for reporting)
@dangillmor @mcnees @support fixed
  -> USER USER USER fixed
@stephgrimes @twitter @safety @cwarzel @buzzfeednews fixed
  -> USER USER USER USER USER fixed
@nathanhchan never said this (but seems like good advice)
  -> USER never said this (but seems like good advice)
@cwarzel not sure how this slipped past us, but now it's fixed
  -> USER not sure how this slipped past us, but now it's fixed


In [32]:
import nltk
from nltk import word_tokenize
# note: need to nltk.download() all the models the first time aroudn

# Frequency distribution of words.
main_dist = nltk.FreqDist([])
for tweet in jack_tweets_no_rts_lowercase:
    tokens = word_tokenize(tweet)
    
    # We don't want to count the mentions and hashtags
    tokens = list(filter(lambda x: not (x[0] == '@' or x[0] == '#'), tokens))
    
    main_dist.update(tokens)

In [34]:
main_dist.most_common()

[('URL', 368),
 ('!', 221),
 ('the', 153),
 ('to', 129),
 ('and', 126),
 ('.', 116),
 (',', 91),
 ('for', 88),
 (':', 85),
 ('you', 81),
 ('on', 79),
 ('twitter', 68),
 ('of', 57),
 ('a', 56),
 ('in', 53),
 ('thank', 50),
 ('this', 47),
 ('is', 43),
 ('``', 40),
 ("'s", 38),
 ('your', 37),
 ('all', 34),
 ("''", 32),
 ('by', 32),
 ('we', 32),
 ('us', 30),
 ('lovetwitter', 30),
 ('it', 30),
 ('with', 30),
 (')', 29),
 ('(', 28),
 ('square', 27),
 ('now', 26),
 ('periscope', 24),
 ('our', 23),
 ('watch', 21),
 ('live', 21),
 ('first', 17),
 ('listen', 17),
 ('i', 17),
 ('⚡️', 17),
 ("n't", 16),
 ('?', 16),
 (';', 15),
 ('an', 15),
 ('&', 15),
 ('amp', 15),
 ('people', 15),
 ('soundcloud', 14),
 ('vote', 14),
 ('great', 14),
 ('that', 13),
 ('help', 13),
 ('np', 13),
 ('vraa', 13),
 ('world', 13),
 ('at', 12),
 ('right', 12),
 ('about', 12),
 ('what', 12),
 ('from', 12),
 ('so', 12),
 ('or', 12),
 ('tap', 11),
 ('one', 11),
 ('go', 11),
 ('new', 11),
 ('can', 11),
 ('more', 11),
 ('my', 11

In [76]:
import hmac
import binascii
import struct
from nltk import word_tokenize

hash_key = "legomystego"

def tweet_hash(tweet):
    """
    Implement a keyed hash function according to section 3.3:
    
    1. Generate a keyed hash digest (HMAC-MD5) for each word
    2. Get the last four bits of the hash
    3. Bitwise rotate each value according to its position in the tweet
    4. XOR all the values together
    """
    tokens = word_tokenize(tweet)
    
    token_hashes = []
    tweet_hash = 0     # Start with 0, the XOR identity
    for n, token in enumerate(tokens):
        # Generate the keyed hash with the given key
        m = hmac.new(hash_key.encode(), msg=token.encode())
        
        m_hash = m.digest()
        
        # Get the last nibble of the hash
        m_bits = ord(m_hash[:1]) & 0x0f
        
        
        # ROT-N for the position in the tweet
        for i in range(n):
            m_bits_shifted = m_bits << 1
            m_bits_overflow = m_bits_shifted & 0xf0
            m_bits_lower = m_bits_shifted & 0x0f
            
            m_bits = m_bits_lower + (m_bits_overflow >> 4)
            
        tweet_hash ^= m_bits
        token_hashes.append(m_bits)
        
    return hex(tweet_hash)
    

In [77]:
tweet_hash("hello world")

'0xf'

In [78]:
tweet_hash("world hello")

'0x5'

In [80]:
tweet_hash("magical magical magic")

'0xa'