In [8]:
import re

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

tweet = 'RT @JordiTorresBCN: just an example! :D http://JordiTorres.Barcelona #masterMEI'
print(preprocess(tweet))

['RT', '@JordiTorresBCN', ':', 'just', 'an', 'example', '!', ':D', 'http://JordiTorres.Barcelona', '#masterMEI']


In [9]:
tweet = 'SUPER PROMO! #FusionContigo con mas de 30% de #descuento y una linea de #regalo http://bit.ly/2mRX1xr'
print(preprocess(tweet))

['SUPER', 'PROMO', '!', '#FusionContigo', 'con', 'mas', 'de', '30', '%', 'de', '#descuento', 'y', 'una', 'linea', 'de', '#regalo', 'http://bit.ly/2mRX1xr']


In [15]:
tweet = '2 vs 4 ! ! ! @krimzCSGO @flusha_csgo #ESLProLeague http://youtube.com/eslcs'
print(preprocess(tweet))

['2', 'vs', '4', '!', '!', '!', '@krimzCSGO', '@flusha_csgo', '#ESLProLeague', 'http://youtube.com/eslcs']


In [16]:
tweet = 'Instant Delivery & Cheap Prices - #ForHonor ⚔️ on Kinguin, check it out TODAY!'
print(preprocess(tweet))

['Instant', 'Delivery', '&', 'Cheap', 'Prices', '-', '#ForHonor', '\xe2', '\x9a', '\x94', '\xef', '\xb8', '\x8f', 'on', 'Kinguin', ',', 'check', 'it', 'out', 'TODAY', '!']
