In [57]:
# Credit for some parts to: https://www.kaggle.com/kyakovlev/preprocessing-bert-public
# Number extraction and hashtags is my baby

# General imports|  
import pandas as pd
import re, warnings, pickle, itertools, emoji, unicodedata

# custom imports
from gensim.utils import deaccent
from collections import Counter
from bs4 import BeautifulSoup
from utils.datasets import *
from pandarallel import pandarallel
import fasttext

pandarallel.initialize()
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 200


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [58]:
## Initial vars

HELPER_PATH             = '../../data/helpers/'
LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
verbose = True
WPLACEHOLDER = 'word_placeholder'
URL_TAG = '@URL'
USER_TAG = '@USR'
NUMBER_TAG = '@NUM'
HASH_TAG = '@HTAG'
CURRENCY_TAG = '@CURR'
IMMUTABLES = [WPLACEHOLDER, URL_TAG, USER_TAG, NUMBER_TAG, HASH_TAG, CURRENCY_TAG]

SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

In [59]:
## Preprocess helpers
def place_hold(w, tag=WPLACEHOLDER):
    return tag + '[' + re.sub(' ', '___', w) + ']'

## Helpers
def check_replace(w):
    return not bool(re.search('|'.join(IMMUTABLES), w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s

def make_dict_cleaning(s, w_dict, skip_check=False):
    # Replaces a word using dict if it is mutable
    if skip_check or check_replace(s):
        s = w_dict.get(s, s)
    return s

In [60]:
## Get basic helper data

bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
global_vocabulary       = load_helper_file('helper_global_vocabulary')
global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')
helper_custom_synonyms     = load_helper_file('helper_custom_synonyms')
emoji_dict = set(e for lang in emoji.UNICODE_EMOJI.values() for e in lang)

In [61]:
## Load Data
good_cols       = ['_id', 'text']
data = pd.read_parquet('../../data/bitcoin_twitter_raw/part_0.parquet')
data = data.iloc[:20000][good_cols]

In [62]:
## Start preprocessing
texts = data['text']
local_vocab = bert_uncased_vocabulary
global_lower=True
texts = texts.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(texts, local_vocab)

#################### Initial State:
Unknown words: 63451 | Known words: 6880


In [63]:
def lower(texts):
    texts = texts.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(texts, local_vocab)
    return texts

if global_lower:
    texts = texts.pipe(lower)

########## Step - Lowering everything:
Unknown words: 54216 | Known words: 7938


In [64]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
# Global
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
texts = texts.apply(lambda x: re.sub('\(dot\)', '.', x))
texts = texts.apply(lambda x: deaccent(x))
if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(texts, local_vocab)

########## Step - Normalize chars and dots:
Unknown words: 53957 | Known words: 7946


In [65]:
# Remove 'control' chars
global_chars_list = list(set([c for line in texts for c in line]))
chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(texts, local_vocab)

########## Step - Control Chars:
Unknown words: 53957 | Known words: 7946


In [66]:
# Remove hrefs
texts = texts.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(texts, local_vocab)

########## Step - Remove hrefs:
Unknown words: 53957 | Known words: 7946


In [67]:
# Convert or remove Bad Symbols
global_chars_list = list(set([c for line in texts for c in line]))
chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji_dict) and (c not in white_list_chars)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(texts, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols:
Unknown words: 53826 | Known words: 7956
ùêõÎπÑ‰ª∑ùíïÂùóùñéùê•ùñó‚Ç∫üÜÉùü≤„ÄëÎÇòÔºÑùñì—µÁïôùüè‡πÑùêÄùë≥üÖ∑ùüµùêí‚Å©‡∏ÑùñîÈìæüá∏Î°úÈôÜùüé‚ñàÍ∑∏¬ØÂØÜ‡∏àÏßÄÏùµùíìùëªÌöåüá≥ùêûüá∑ùêÖùê¶ùñÜùñûüá®ÂÜÜ‚ùØüáªùê†ùïΩÏ§ëùêÇ‚ã∞„ÖúË∑åùíåùíÇÂèãÂè∞‚ü∂Í∏∏ùñöÌñâùëæùü†ùíçÛ†Å¢üáøùüî‚ñ¥Ïãú∆ÄÊ®°üá≠Êù°Î∞òÛ†Å¥üáÆÍπåÁÇÆüáßüÖ≥ùêØüáΩÏàò‚ãØùñâùê¨ùíÖ‰∫§‚ñìÏóêÎçî‚Ç≥‚úì‚ÄçÏù∏ùñäùêöË¥ßÁ∫¶üá±Îã§‚ñ∫ùêùÍÆ§‰∫ÜÿüÌÉëÂê¥Á¢≥Ïä§Î∞î‚ñëüÖºùíéùñãüá©‚ÄåÂøåÎ†§„Ö†üáπÎÇ¥üá≤‚ü†Îç∞‡∏ÇÊ∂®ùê≠Ìè¨ÍÆÜÌä∏üá¥ÎèÑùü∞‡∏øùüô‡πÜùê´ÏïÑÎäîùüòÎû¨‡∏úÎçïùüöüÖΩùñïùíêÏ†ïÛ†ÅßüÖª‚Å¶Í∏∞‡πÅ‡∏î‚Çøùñòüá∫ùñôùëºùñàÂ∏ÅÊÉ≥ùíâÛ†ÅøÂÄº‚ìúüáµË≤®ÎãàùíäÂä°‡∏∞‚Ç¶Ï§çüá¨Î¶¨‚û§ÏÑúÎ©¥„ÄêÏΩîÁâπÔø•Û†Å≥Í∞ÄÂØíÂïÜ‡∏äùë≤ÏÖòùíî‚É£·µõüá™ùê®‚Å†üá¶„ÜîÍ∂åùü≠ùíÜÎ†áùíÑÛ†Å£ùíèüÖ¥üá∞üÜÇÈÄöùñëÍÆáùü¨ùêÑùê°ùïÆÔøºùêÆùüìŸ™ùíó
119835 --- b
48708 --- 
20215 --- 
119957 --- t
22359 --- 
120206 --- i
119845 --- l
120215 --- r
8378 --- 
127363 --- t


In [68]:
# Remove Bad Symbols PART 2
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = '¬∑' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji_dict) and (c not in white_list_punct) and (ord(c)>256)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(texts, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 53659 | Known words: 7949
¬∑ÿ≥–∏–∫€å—Å„Ää‚òÜ—ç⁄∫ÿØÔºÅ‡∏≤ŒæÔºü—å–ªÀ¢‡∏ï‡∏ûŸÑÿ±ŸÜŒπ‡∏•‡§ï—èÂ≠¶„ÉàÁîü„É≥Âå∫‚àöÿ™„É´‡∏ó‚Äû‡∏ô‡∏ß„Ç≥„Ç§–≥‚Ä¶–µÿ´ÿ∏–º‡πÄÿÆ—Ñ„Éº„ÄÇÿµ–∑—É⁄©ŸÇ„Ç´⁄Ü—Ü„Éí‚óè–≤‰ªÆ‚â•‡∏°—éÿπ€ÅœÄ—àŸàÿ¥ÿ°—ã–±‚Ä∫ÂÆâ‡∏¢–¥„ÉÉÊØîŸÖ–∂Âä†Ïù¥‰∏ãÿ©–∞‡§öŸπ‚Ç¨‚ÜíŸÄ‚âàŸÉ‚Çπ‚Ä¢⁄ØŸæ„Äã—Ä„Çøÿ≠‡§¨‡∏≠„Éé⁄æ‰∏äÿ∂ÿß‡§Ø‡∏Å‡§æ‡§Ö—Ç—áŸáÔºåÿ∞–øÿ®Œ≤ÿ¨ŸäÂπ≥Â§ß–æŸÅ„ÉÑ‚àû–Ω
183 --- 
1587 --- 
1080 --- i
1082 --- 
1740 --- 
1089 --- 
12298 --- 
9734 --- 
1101 --- e
1722 --- 


In [69]:
# Remove html tags
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if ('<' in word) and ('>' in word):
        for tag in html_tags:
            if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                temp_dict[word] = BeautifulSoup(word, 'html5lib').text
texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - HTML tags:
Unknown words: 53659 | Known words: 7949


In [70]:
# Remove links (There is valuable information in links (probably you will find a way to use it))
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
url_rule = r'(?P<url>https?://[^\s]+)'
temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}

for word in temp_dict:
    new_value = temp_dict[word]
    if word.find('http')>2:
        temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value, URL_TAG)
    else:
        temp_dict[word] = place_hold(new_value, URL_TAG)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 1:
Unknown words: 39204 | Known words: 7949
https://t.co/q8zs9n3kaa --- @URL[t.co]
https://t.co/niqxaxb9ih --- @URL[t.co]
https://t.co/blcrihln2i --- @URL[t.co]
https://t.co/hxstfnzmqv --- @URL[t.co]
https://t.co/b3vuuavpog --- @URL[t.co]
https://t.co/fomn3ewz1w --- @URL[t.co]
https://t.co/fh8f2dkcde --- @URL[t.co]
https://t.co/f71875vzeu --- @URL[t.co]
https://t.co/rlwrm3ovxj --- @URL[t.co]
https://t.co/bbdycnjgy0 --- @URL[t.co]


In [71]:
# Remove twitter links
temp_dict = {
    f'{URL_TAG}[t.co]': ''
}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1.5:'); check_vocab(texts, local_vocab);

########## Step - Convert urls part 1.5:
Unknown words: 39203 | Known words: 7949


In [72]:
# Remove escaped html
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
symbols = {
    '&quot;': '',
    '&amp;': ' and ',
    '&lt;': '',
    '&gt;': '',
}
temp_dict = {}
for word in temp_vocab:
    if any([rep in word for rep in symbols.keys()]):
        new_word = word
        for rep, to in symbols.items():
            new_word = new_word.replace(rep, to)
        temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove escaped html:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove escaped html:
Unknown words: 39129 | Known words: 7951
&amp;&amp; ---  and  and 
#jpmorganchase&amp;amp;co --- #jpmorganchase and amp;co
-------------&gt; --- -------------
&gt;coin --- coin
soon-&gt; --- soon-
order&gt;: --- order:
&lt;$1 --- $1
2nd,3rd&amp;4th --- 2nd,3rd and 4th
&gt;1000% --- 1000%
buy&amp;hold. --- buy and hold.


In [73]:
# Convert urls part 2
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}

for word in temp_vocab:
    url_check = False
    if 'file:' in word:
        url_check = True
    elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
        if 'Aww' not in word:
            for d_zone in url_extensions:
                if '.' + d_zone in word:
                    url_check = True
                    break
    elif ('/' in word) and ('.' in word):
        for d_zone in url_extensions:
            if '.' + d_zone + '/' in word:
                url_check = True
                break

    if url_check:
        temp_dict[word] =  place_hold(domain_search(word), URL_TAG)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 2:
Unknown words: 39129 | Known words: 7951
www.maverick-tech.con --- @URL[maverick-tech.con]
.www.rapidsnetwork.io --- @URL[rapidsnetwork.io]


In [74]:
# Normalize pictograms
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
        for pict in pictograms_to_emoji:
            if (pict in word) and (len(pict)>2):
                temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
            elif pict==word:
                temp_dict[word] = pictograms_to_emoji[pict]

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms:
Unknown words: 39128 | Known words: 7951
:-)! --- üòÅ!
:))) --- üòÅ)
:-) --- üòÅ
‚¨á@crypto_off --- ‚¨á@cryptüòÆff


In [75]:
# Isolate emoji
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = ''.join([c for c in global_chars_list if c in emoji_dict])
chars_dict = {ord(c):f' {c} ' for c in chars}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(texts, local_vocab)
if verbose: print(chars)

########## Step - Isolate emoji:
Unknown words: 36781 | Known words: 7975
üé±‚ÄºüëÑ‚öôüèøü¶âü¶Ö‚òëüíäüõéüë¨ü•∞üçÆ„ÄΩüòç‚ô•ü§ñ‚òπüõçüê£‚èØ‚ôÇü§ûüíµüïµ‚úãüìåü§¢üè≠üì©üí™ü™¶ü•∏üêçüí≤‚Ñ¢üçπüèÇ‚ö°üí°ü§®ü§åü§ßü•ïüò°‚ùì‚õÖüòüüí¥‚ô¶ü¶µ‚¨á‚ùóü§çüö¶üö£üçîüêªüåô‚ùåüèÉüëå‚õΩüíáüçíüèá‚ôéüòéüí£üëäüöòü•≤‚ñ´‚ù£üç∑üíïüöÑüìêü§†ü¶àüôÄüîõüåñüíòüòÅüéâüîºüí∑üç∏üòÜüÖ±üòÖüòß‚öú‚û°üîóüôäüòíü™Öüüßüü†üß°üç´‚ú≥ü§∏üëèüõë‚åõüìûüöÇüóΩüë©üåºüü•üéáüé©üì¶üí∂üêÑüï∫üî•üò¢üòöü•≥‚õµüñêüëâüèóüê¢üòûüíâüìØüêùüé∞üîπüôåü§©‚ò¢üôÉ‚ùÑüó≥üå¥üí§üß¢ü§∑ü§°üò≠‚ôÄüåä‚§µüò®üö´üôàüçª‚è≤ü¶ßüìπüê∏‚öΩüêæüë≠ü©∏üå±üè¥üòù‚õîü¶¢ü¶∫üê¨üé≤üì∏üåáüò§üõí‚è¨üåøüëÅüåëüòºüîíüïØ‚¨úüë∏üèºüé®üçåüåÉ‚è±üê∞üó£üêºüí±üìóüöëüü©üÖ∞üí∞üíÉüêãü§¥üåêüì∫‚õ¥üòîüé¢üçøüë§üí≥üîØüíåüî∂ü§¶üí®ü•±üå≤üêà‚ûñüç≥ü§Ø‚≠êüÉèüèπüëæüë∫üòØ‚§¥üòôüéüüç¶üí•üü¢ü§§üòãü§≥ü•Åüåïüè°ü¶Æü´Çüçäüî´ü

In [76]:
# Duplicated dots, question marks and exclamations
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
        if (Counter(word)['.']>1):
            new_word = re.sub('\.\.+', ' . . . ', new_word)
        if (Counter(word)['!']>1):
            new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
        if (Counter(word)['?']>1):
            new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
        if (Counter(word)[',']>1):
            new_word = re.sub('\,\,+', ' , , , ', new_word)
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(texts, local_vocab);

########## Step - Duplicated Chars:
Unknown words: 34752 | Known words: 8029


In [77]:
# Remove underscore for spam words
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
        temp_dict[word] = re.sub('_', '', word)
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove underscore:
Unknown words: 34738 | Known words: 8029
_____________ --- 
__________ --- 
________ --- 
#_ --- #
#a__ --- #a
\_()_/ --- \()/
#___ --- #
_____________________ --- 
____ --- 
________________________ --- 


In [78]:
# Isolate spam chars repetition
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
        temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(1)])
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Spam chars repetition:
Unknown words: 34729 | Known words: 8029
**** ---  * 
*** ---  * 
)))) ---  ) 
::::::::::::::::::::::::::: ---  : 
$$$$$$$$$$$$ ---  $ 
$$$$ ---  $ 
***** ---  * 
$$$ ---  $ 
$$$$$ ---  $ 


In [79]:
# Normalize pictograms part 2
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
        for pict in pictograms_to_emoji:
            if pict==word:
                temp_dict[word] = pictograms_to_emoji[pict]
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms part 2:
Unknown words: 34724 | Known words: 8029
=) --- üòÅ
:) --- üòÅ
;) --- üòú
:] --- üòÅ
:( --- üò°


In [80]:
# Isolate brakets and quotes
# Global
chars = '()[]{}<>"'
chars_dict = {ord(c):f' {c} ' for c in chars}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(texts, local_vocab)
if verbose: print_dict(chars_dict)

########## Step - Brackets and quotes:
Unknown words: 33135 | Known words: 8088
40 ---  ( 
41 ---  ) 
91 ---  [ 
93 ---  ] 
123 ---  { 
125 ---  } 
60 ---  < 
62 ---  > 
34 ---  " 


In [81]:
# Break short words
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)<=20]

temp_dict = {}
for word in temp_vocab:
    if '/' in word and not word.startswith('u/') and not word.startswith('r/'):
        temp_dict[word] = re.sub('/', ' / ', word)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Break long words:
Unknown words: 32745 | Known words: 8106
24/7 --- 24 / 7
¬£1.46/ --- ¬£1.46 / 
fil/usdt --- fil / usdt
coinomics/tokenomics --- coinomics / tokenomics
$0.00/tx --- $0.00 / tx
android/apple --- android / apple
health/inner --- health / inner
btc/usdt --- btc / usdt
2/9/21 --- 2 / 9 / 21
/#eth ---  / #eth


In [82]:
# Break long words
def break_long_words(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_vocab = [k for k in temp_vocab if len(k)>20]

    temp_dict = {}
    for word in temp_vocab:
        if '_' in word:
            temp_dict[word] = re.sub('_', ' ', word)
        elif '/' in word and not word.startswith('u/') and not word.startswith('r/'):
            temp_dict[word] = re.sub('/', ' / ', word)
        elif len(' '.join(word.split('-')).split())>2:
            temp_dict[word] = re.sub('-', ' ', word)
        for s in ',.:;':
            if s in word and not re.compile('[+#@$/,.:;-]').sub('', word).isnumeric():
                temp_dict[word] = word.replace(s, f' {s} ')

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

for i in range(3):
    texts = texts.pipe(break_long_words)

########## Step - Break long words:
Unknown words: 32747 | Known words: 8110
week/month/year/decade --- week / month / year / decade
crypto-dinner-futures --- crypto dinner futures
/jonathan/gabriel/ozo ---  / jonathan / gabriel / ozo
cryptosmartnow@gmail.com --- cryptosmartnow@gmail . com
0.078-0.085-0.099-0.105-0.12 --- 0.078 0.085 0.099 0.105 0.12
jnjamor2020@gmail.com --- jnjamor2020@gmail . com
chat@cryptoquestion.tech --- chat@cryptoquestion . tech
#quality_over_quantity --- #quality over quantity
every-once-in-a-while, --- every-once-in-a-while , 
#the_bull_run_has_just_started. --- #the_bull_run_has_just_started . 
########## Step - Break long words:
Unknown words: 32745 | Known words: 8110
#the_bull_run_has_just_started --- #the bull run has just started
casino-partner/stakeholder --- casino-partner / stakeholder
pullback/consolidation --- pullback / consolidation
every-once-in-a-while --- every once in a while
august/september/october --- august / september / october
########

In [83]:
# TODO: add number parsing before
# Diambiguate entities
# Split words on @,# and $ to clear up ambiguities between entitites
symbols = '@#$'
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('@' in k or '#' in k or '$' in k)]

temp_dict = {}
for word in temp_vocab:
    for symbol in symbols:
        if symbol not in word: continue
        left, *right = word.split(symbol)
        rightz = symbol.join(right)
        if len(left) > 0 and len(right[0]) > 0 and right[0].isalnum():
            temp_dict[word] = f'{left} {symbol}{rightz}'
        break

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Disambiguate entities:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Disambiguate entities:
Unknown words: 32635 | Known words: 8111
mex$100 --- mex $100
~$80 --- ~ $80
.#online --- . #online
.@paypal --- . @paypal
tight.#hodler --- tight. #hodler
+$42k --- + $42k
pumps@so --- pumps @so
328.71eur.#crypto --- 328.71eur. #crypto
bag!#altcoins --- bag! #altcoins
+$5k --- + $5k


In [84]:
def custom_synonyms(texts):
    temp_dict = {}
    for wfrom, wto in helper_custom_synonyms.items():
        temp_dict[wfrom] = wto
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom word synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_synonyms)

########## Step - Custom word synonyms:
Unknown words: 32603 | Known words: 8111
#btc --- #bitcoin
btc --- $btc
bitcoins --- $btc
bitcoin --- $btc
@bitcoin --- $btc
#crypto --- #cryptocurrency
#eth --- $eth
ethereum --- $eth
eth --- $eth
#bch --- $bch


In [85]:
# Remove/Convert usernames and hashtags
def extract_entities(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(word) > 2) and (word[1:len(word)-1].replace('\'s', '').replace('_', '').isalnum()):
            new_word = word.replace('\'s', '')
            if not re.compile('[#@$/,.:;]').sub('', new_word).isnumeric():
                new_word = re.compile('[,.:;]').sub('', new_word)
                if word.startswith('@'):
                    temp_dict[word] = place_hold(new_word[1:], USER_TAG)
                elif word.startswith('#'):
                    temp_dict[word] = place_hold(new_word[1:], HASH_TAG)
                elif word.startswith('u/'):
                    temp_dict[word] = place_hold(new_word[2:], USER_TAG)
                elif word.startswith('r/'):
                    temp_dict[word] = place_hold(new_word[2:], HASH_TAG)
                elif word.startswith('$') and word[1:].isalpha():
                    temp_dict[word] = place_hold(new_word[1:], CURRENCY_TAG)
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(extract_entities)

########## Step - UserName and Hashtag:
Unknown words: 32236 | Known words: 8111
#first --- @HTAG[first]
#chiliz --- @HTAG[chiliz]
#vouchers --- @HTAG[vouchers]
@stockheadau! --- @USR[stockheadau!]
#cnnindonesia --- @HTAG[cnnindonesia]
#awaamkojeenaydoniazi --- @HTAG[awaamkojeenaydoniazi]
#cross --- @HTAG[cross]
@btcclicks --- @USR[btcclicks]
#tradingeducation --- @HTAG[tradingeducation]
#freeaungsansuukyi --- @HTAG[freeaungsansuukyi]


In [86]:
# Hashtag and currency union
def hashtag_currency_union(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = set([k for k in temp_vocab if not check_replace(k)])
    temp_dict = {}
    for w in temp_vocab:
        if w.startswith(CURRENCY_TAG):
            if w.replace(CURRENCY_TAG, HASH_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, HASH_TAG)] = w
            if w.replace(CURRENCY_TAG, USER_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, USER_TAG)] = w
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Hashtag and currency union:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(hashtag_currency_union)

########## Step - Hashtag and currency union:
Unknown words: 31634 | Known words: 8111
@HTAG[trx] --- @CURR[trx]
@HTAG[xcur] --- @CURR[xcur]
@HTAG[uamy] --- @CURR[uamy]
@HTAG[up] --- @CURR[up]
@HTAG[unl] --- @CURR[unl]
@HTAG[dec] --- @CURR[dec]
@USR[dec] --- @CURR[dec]
@HTAG[orn] --- @CURR[orn]
@HTAG[zen] --- @CURR[zen]
@HTAG[stmx] --- @CURR[stmx]


In [87]:
# Remove ending underscore (or add quotation marks???)
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[len(word)-1]=='_':
        for i in range(len(word),0,-1):
            if word[i-1]!='_':
                new_word = word[:i]
                temp_dict[word] = new_word
                break
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove ending underscore:
Unknown words: 31633 | Known words: 8111
usdt_ --- usdt
'fu__ --- 'fu


In [88]:
# Remove starting underscore
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[0]=='_':
        for i in range(len(word)):
            if word[i]!='_':
                new_word = word[i:]
                temp_dict[word] = new_word
                break
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove starting underscore:
Unknown words: 31633 | Known words: 8111


In [89]:
# End word punctuations
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word),0,-1):
        if word[i-1].isnumeric() and re.compile('[$¬£%‚Ç¨]').match(word[i]):
            break

        if word[i-1].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - End word punctuations:
Unknown words: 23495 | Known words: 8586
wife. --- wife .
days, --- days ,
20. --- 20 .
corrupt, --- corrupt ,
death! --- death !
credits: --- credits :
$avax: --- $avax :
usdt, --- usdt ,
what. --- what .
volando. --- volando .


In [90]:
scale_mapping = {
    'b': 1000000000,
    'bn': 1000000000,
    'bln': 1000000000,
    'billion': 1000000000,
    'm': 1000000,
    'mn': 1000000,
    'mln': 1000000,
    'million': 1000000,
    'k': 1000,
    'thousand': 1000,
    '-': -1,
}

translate = {
    '$': 'dollar', '¬£': 'pound','%': 'percent', '‚Ç¨': 'euro'
}

translate_suffix = {
    'x': 'times'
}

translate_prefix = {
    '~': 'around',
    '+-': 'around',
    '@': 'at',
    '=': 'equals',
    '*#': 'ranked',
    '#': 'ranked',
}

def serialize_numbers(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    re_inb = re.compile('[.,\'"`]')
    re_num = re.compile('^(~|\+-|@|=|#|\*#)?[-@+*^#:]?[$¬£%‚Ç¨]?(([.:]?[0-9])+)[$¬£%‚Ç¨]?')
    re_fix = re.compile('^[$¬£%‚Ç¨][-+][0-9]')
    for word in temp_vocab:
        prefilter = re_inb.sub('', word).replace(',', '.')
        if re_fix.search(prefilter):
            prefilter = prefilter[1] + prefilter[0] + prefilter[2:]
        result = re_num.search(prefilter)

        if result and result.pos == 0:
            # Process combined numbers / ranges in next iteration
            if '-' in word and not word.startswith('-') and not word.startswith('+-'):
                temp_dict[word] = ' '.join(word.split('-'))
                continue

            main_part = prefilter[:result.end()]
            prefix = ''
            for prefix_key, prefix_name in translate_prefix.items():
                if main_part.startswith(prefix_key):
                    prefix = prefix_name
                    main_part = main_part.replace(prefix_key, '', 1)
                    break

            main = re.compile('^[~@+*^#:]').sub('',main_part)
            currency = re.compile('[$¬£%‚Ç¨]').search(main)
            currency = main[currency.start():currency.end()] if currency else None
            main = re.compile('[$¬£%‚Ç¨]').sub('', main)
            suffix = prefilter[result.end():]

            multiplier = 1
            if re.compile('\.[0-9]{1,2}$').search(main): # decimal
                multiplier *= 0.01 if main[-1].isnumeric() else 0.1
            if '-' in main: # Neg numbers
                multiplier *= -1
                main = main.replace('-', '')
            # Textual scale
            if suffix in scale_mapping:
                multiplier *= scale_mapping[suffix]
                suffix = ''
            if suffix in translate_suffix:
                suffix = translate_suffix[suffix]

            number = round(float(main.replace('.', '').replace(':', '')) * multiplier, 2)
            # print(f'{number}  /  {currency}  /  {suffix}  /  {word}')
            # noinspection PyTypeChecker
            temp_dict[word] = ' '.join(filter(len,[
                prefix,
                place_hold(str(number), NUMBER_TAG),
                translate[currency] if currency else '',
                suffix
            ]))

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Serialize numbers:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts


# Clean up numbers
for i in range(4):
    texts = texts.pipe(serialize_numbers)

########## Step - Serialize numbers:
Unknown words: 21408 | Known words: 8606
$0.65 --- @NUM[65.0] dollar
$0.032 --- @NUM[32.0] dollar
10-15k --- 10 15k
10-k --- 10 k
$584m --- @NUM[584000000.0] dollar
404,101.38 --- @NUM[40410138.0]
~$40.8k --- around @NUM[408000.0] dollar
38727.83 --- @NUM[3872783.0]
280k --- @NUM[280000.0]
4.462665btc --- @NUM[4462665.0] btc
########## Step - Serialize numbers:
Unknown words: 21215 | Known words: 8606
10x --- @NUM[10.0] times
0.45800 --- @NUM[45800.0]
3.76 --- @NUM[376.0]
$3 --- @NUM[3.0] dollar
$46 --- @NUM[46.0] dollar
35xxx --- @NUM[35.0] xxx
1k --- @NUM[1000.0]
50k --- @NUM[50000.0]
~3 --- around @NUM[3.0]
25% --- @NUM[25.0] percent
########## Step - Serialize numbers:
Unknown words: 21213 | Known words: 8606
300$5001000$2000 --- @NUM[300.0] dollar 5001000$2000
^24 --- @NUM[24.0]
78$ --- @NUM[78.0] dollar
########## Step - Serialize numbers:
Unknown words: 21213 | Known words: 8606
5001000$2000 --- @NUM[5001000.0] dollar 2000


In [35]:
# Extract entities again
texts = texts\
    .pipe(custom_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom word synonyms:
Unknown words: 21200 | Known words: 8606
#btc --- #bitcoin
btc --- $btc
bitcoins --- $btc
bitcoin --- $btc
@bitcoin --- $btc
#crypto --- #cryptocurrency
#eth --- $eth
ethereum --- $eth
eth --- $eth
#bch --- $bch
########## Step - UserName and Hashtag:
Unknown words: 21037 | Known words: 8606
$usdt --- @CURR[usdt]
$usd --- @CURR[usd]
$aapl --- @CURR[aapl]
$hai --- @CURR[hai]
$bch --- @CURR[bch]
$strax --- @CURR[strax]
$riot --- @CURR[riot]
#cryptocurrencies --- @HTAG[cryptocurrencies]
$comp --- @CURR[comp]
$arbkf --- @CURR[arbkf]
########## Step - Hashtag and currency union:
Unknown words: 21029 | Known words: 8606
@HTAG[elt] --- @CURR[elt]
@HTAG[crypto] --- @CURR[crypto]
@HTAG[batman] --- @CURR[batman]
@HTAG[tezos] --- @CURR[tezos]
@USR[tezos] --- @CURR[tezos]
@HTAG[fma] --- @CURR[fma]
@HTAG[defi] --- @CURR[defi]
@HTAG[bitcoin] --- @CURR[bitcoin]


In [36]:
# Start word punctuations
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum() and k[0] not in ['@', '#', '$'])]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word)):
        if word[i].isalnum() or word[i] in ['#', '@', '$']:
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
# texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Start word punctuations:
Unknown words: 21029 | Known words: 8606
'science --- ' science
*telegram --- * telegram
.could --- . could
*police --- * police
-it --- - it
*q --- * q
.you --- . you
\4241491.0 --- \ 4241491.0
~the --- ~ the
¬£5 --- ¬£ 5


In [37]:
# Extract entities again and numbers
texts = texts.pipe(serialize_numbers)
texts = texts\
    .pipe(custom_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 21029 | Known words: 8606
########## Step - Custom word synonyms:
Unknown words: 21029 | Known words: 8606
#btc --- #bitcoin
btc --- $btc
bitcoins --- $btc
bitcoin --- $btc
@bitcoin --- $btc
#crypto --- #cryptocurrency
#eth --- $eth
ethereum --- $eth
eth --- $eth
#bch --- $bch
########## Step - UserName and Hashtag:
Unknown words: 21029 | Known words: 8606
########## Step - Hashtag and currency union:
Unknown words: 21029 | Known words: 8606


In [38]:
# Find and replace acronims
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (Counter(word)['.']>1) and (check_replace(word)):
        if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
            temp_dict[word] = place_hold('url ' + domain_search(word))
        else:
            if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Find and replace acronims:
Unknown words: 21029 | Known words: 8606
g.o.a.t --- word_placeholder[goat]
p.o.d --- word_placeholder[pod]
f.i.a.t --- word_placeholder[fiat]


In [39]:
# Apply spellchecker for contractions
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ("'" in k)]
temp_dict = {}
for word in temp_vocab:
    if word in helper_contractions:
        temp_dict[word] = helper_contractions[word] # place_hold(helper_contractions[word])
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Contractions:'); check_vocab(texts, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Contractions:
Unknown words: 20967 | Known words: 8606
i'd --- i would
this's --- this is
he's --- he is
shouldn't --- should not
ya'll --- you will
can't --- cannot
when's --- when is
who's --- who is
you've --- you have
they're --- they are


In [40]:
# Remove 's (DO WE NEED TO REMOVE IT???)
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {k:k[:-2] for k in temp_vocab if (check_replace(k)) and (k.lower()[-2:]=="'s")}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove "s:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove "s:
Unknown words: 20745 | Known words: 8617
c's --- c
sucker's --- sucker
satoshi's --- satoshi
#ether's --- #ether
greeneum's --- greeneum
case's --- case
@microstrategy's --- @microstrategy
robinhood's --- robinhood
quantum's --- quantum
bridgewater's --- bridgewater


In [41]:
# Convert backslash
# Global
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]
temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(texts, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Convert backslash:
Unknown words: 20745 | Known words: 8617
\4241491.0 ---  / 4241491.0
\4301056.0 ---  / 4301056.0
\5058389.0 ---  / 5058389.0
\4299147.0 ---  / 4299147.0
\4233436.0 ---  / 4233436.0
\4238285.0 ---  / 4238285.0
\4240291.0 ---  / 4240291.0


In [42]:
# Extract entities again and numbers
texts = texts.pipe(serialize_numbers)
texts = texts\
    .pipe(custom_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 20745 | Known words: 8617
5058389.0 --- @NUM[50583890.0]
4301056.0 --- @NUM[43010560.0]
4240291.0 --- @NUM[42402910.0]
4238285.0 --- @NUM[42382850.0]
4233436.0 --- @NUM[42334360.0]
4299147.0 --- @NUM[42991470.0]
4241491.0 --- @NUM[42414910.0]
########## Step - Custom word synonyms:
Unknown words: 20738 | Known words: 8617
#btc --- #bitcoin
btc --- $btc
bitcoins --- $btc
bitcoin --- $btc
@bitcoin --- $btc
#crypto --- #cryptocurrency
#eth --- $eth
ethereum --- $eth
eth --- $eth
#bch --- $bch
########## Step - UserName and Hashtag:
Unknown words: 20702 | Known words: 8617
#telcoin --- @HTAG[telcoin]
#cryptocurrencies --- @HTAG[cryptocurrencies]
#nyzo --- @HTAG[nyzo]
@iohk_charles --- @USR[iohk_charles]
@thedaomaker --- @USR[thedaomaker]
#cryptocurrency --- @HTAG[cryptocurrency]
@petermccormack --- @USR[petermccormack]
@cointelegraph --- @USR[cointelegraph]
$doge --- @CURR[doge]
#silver --- @HTAG[silver]
########## Step - Hashtag and curr

In [43]:
# Try remove duplicated chars (not sure about this!!!!!). TODO check fist against vocab?
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
temp_vocab_dup = []

for word in temp_vocab:
    if not word.isalpha():
        continue
    temp_vocab_dup.append(''.join(ch for ch, _ in itertools.groupby(word)))
temp_vocab_dup = set(temp_vocab_dup)
temp_vocab_dup = temp_vocab_dup.difference(temp_vocab_dup.difference(set(local_vocab)))

for word in temp_vocab:
    new_word = ''.join(ch for ch, _ in itertools.groupby(word))
    if new_word in temp_vocab_dup:
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if (k != v) and (v in local_vocab)}

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Dup chars (with vocab check):'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Dup chars (with vocab check):
Unknown words: 20436 | Known words: 8654
niine --- nine
ayyyyyeeeee --- aye
thousaaaaaand --- thousand
canvass --- canvas
ooh --- oh
bounceeeee --- bounce
brrr --- br
yeahhh --- yeah
richhh --- rich
untill --- until


In [44]:
# Extract entities again and numbers
texts = texts.pipe(serialize_numbers)
texts = texts\
    .pipe(custom_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 20436 | Known words: 8654
########## Step - Custom word synonyms:
Unknown words: 20436 | Known words: 8654
#btc --- #bitcoin
btc --- $btc
bitcoins --- $btc
bitcoin --- $btc
@bitcoin --- $btc
#crypto --- #cryptocurrency
#eth --- $eth
ethereum --- $eth
eth --- $eth
#bch --- $bch
########## Step - UserName and Hashtag:
Unknown words: 20436 | Known words: 8654
########## Step - Hashtag and currency union:
Unknown words: 20436 | Known words: 8654


In [45]:
# Isolate numbers
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if re.compile('[a-zA-Z]').sub('', word) == word:
        if re.compile('[0-9]').sub('', word) != word:
            temp_dict[word] = word

global_chars_list = list(set([c for line in temp_dict for c in line]))
chars = ''.join([c for c in global_chars_list if not c.isdigit()])
chars_dict = {ord(c):f' {c} ' for c in chars}
temp_dict = {k:place_hold(k) for k in temp_dict}

#texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate numbers:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Isolate numbers:
Unknown words: 20436 | Known words: 8654
:-6.11 --- word_placeholder[:-6.11]
*_100% --- word_placeholder[*_100%]


In [46]:
# Join dashes
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('\-\-+', '-', word)
temp_dict = {k: v for k, v in temp_dict.items() if k != v}

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Join dashes:
Unknown words: 20430 | Known words: 8654
clockwork--up --- clockwork-up
--designed --- -designed
outshined--cryptocurrency --- outshined-cryptocurrency
----- --- -
------------- --- -
#crypto!--where --- #crypto!-where
--- --- -
-- --- -
aa--tag --- aa-tag
------------------------------------------ --- -


In [47]:
# Try join word (Sloooow)
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (Counter(k)['-']>1)]

temp_dict = {}
for word in temp_vocab:
    new_word = ''.join(['' if c in '-' else c for c in word])
    if (new_word in local_vocab) and (len(new_word)>3):
        temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Try Split word:
Unknown words: 20430 | Known words: 8654


In [48]:
# Try Split word
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
        chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
        temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Try Split word:
Unknown words: 19331 | Known words: 8753
price:0.0000000019 --- price : 0 . 0000000019
'science ---  ' science
solidity-based --- solidity - based
geo-location --- geo - location
##btc ---  #  # btc
üß° ---  üß° 
üöÇ ---  üöÇ 
üñê ---  üñê 
target:47716.95 --- target : 47716 . 95
üíå ---  üíå 


In [49]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word

temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    new_word = convert_leet(word)
    if (new_word!=word):
        if (len(word)>2) and (new_word in local_vocab):
            temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - L33T (with vocab check):
Unknown words: 19327 | Known words: 8756
t13 --- tie
or3 --- ore
sh1t --- shit
fa1 --- fai


In [50]:
# Extract entities again and numbers
texts = texts.pipe(serialize_numbers)
texts = texts\
    .pipe(custom_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 19298 | Known words: 8757
588 --- @NUM[588.0]
43000 --- @NUM[43000.0]
2511 --- @NUM[2511.0]
012736 --- @NUM[12736.0]
47716 --- @NUM[47716.0]
2421 --- @NUM[2421.0]
2272 --- @NUM[2272.0]
078 --- @NUM[78.0]
07059741519 --- @NUM[7059741519.0]
047 --- @NUM[47.0]
########## Step - Custom word synonyms:
Unknown words: 19294 | Known words: 8757
#btc --- #bitcoin
btc --- $btc
bitcoins --- $btc
bitcoin --- $btc
@bitcoin --- $btc
#crypto --- #cryptocurrency
#eth --- $eth
ethereum --- $eth
eth --- $eth
#bch --- $bch
########## Step - UserName and Hashtag:
Unknown words: 19283 | Known words: 8757
@bitstamp --- @USR[bitstamp]
#cryptocurrencies --- @HTAG[cryptocurrencies]
$doge --- @CURR[doge]
$trx --- @CURR[trx]
@binance --- @USR[binance]
$btc --- @CURR[btc]
#altcoins --- @HTAG[altcoins]
$eth --- @CURR[eth]
#hodl --- @HTAG[hodl]
#crypto --- @HTAG[crypto]
########## Step - Hashtag and currency union:
Unknown words: 19282 | Known words: 8757
@HTAG[cr

In [51]:
# Remove placeholders
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (not check_replace(k) and k.startswith(WPLACEHOLDER))]
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('___', ' ', word[17:-1])
texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
texts = texts.apply(lambda x: ' '.join([i for i in x.split()]))
if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(texts, local_vocab)

########## Step - Open Holded words:
Unknown words: 19279 | Known words: 8759


In [52]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Multiple form:
Unknown words: 18977 | Known words: 8832
feedbacks --- feedback
coincides --- coincide
declarations --- declaration
panics --- panic
repays --- repay
showdowns --- showdown
informations --- information
evolves --- evolve
harvests --- harvest
anyways --- anyway


In [53]:
# Extract entities again and numbers
texts = texts.pipe(serialize_numbers)
texts = texts\
    .pipe(custom_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 18977 | Known words: 8832
########## Step - Custom word synonyms:
Unknown words: 18977 | Known words: 8832
#btc --- #bitcoin
btc --- $btc
bitcoins --- $btc
bitcoin --- $btc
@bitcoin --- $btc
#crypto --- #cryptocurrency
#eth --- $eth
ethereum --- $eth
eth --- $eth
#bch --- $bch
########## Step - UserName and Hashtag:
Unknown words: 18977 | Known words: 8832
########## Step - Hashtag and currency union:
Unknown words: 18977 | Known words: 8832


In [54]:
# Cut away non english tweets
model = fasttext.load_model('../../data/kaggle/lid.176.ftz')

def langcheck(item, min_confidence=0.2):
    text = ' '.join([w for w in item.split() if not w.startswith('@')])
    if len(text) < 3:
        return True
    results = dict(zip(*model.predict(text, k=2)))
    return results.get('__label__en', 0) > min_confidence

mask = texts.parallel_map(langcheck)
if verbose: print(f'Deleted: {1 - sum(mask)/len(texts)}')
texts = texts[mask]
data = data[mask]
if verbose: print('#' * 10, 'Step - Language datection:'); check_vocab(texts, local_vocab);



Deleted: 0.03805000000000003
########## Step - Language datection:
Unknown words: 17403 | Known words: 8681


In [55]:
data['text'] = texts
data

Unnamed: 0,_id,text
0,1360142875330232324,when the top u . s . central banker gets photobombed by @CURR[btc] . üëâ üëÄ @CURR[bitcoin] @CURR[bitcoin] @HTAG[cryptocurrency] @HTAG[cryptocurrency] @HTAG[ethereum] @HTAG[ripple] @CURR[link] @HTAG[c...
1,1360140112861003776,best am arriving with exciting features @CURR[bsc] @USR[binance] @CURR[bitcoin] @HTAG[binancesmartchain] @CURR[defi] @HTAG[definews] @HTAG[stafi] @CURR[cake] @HTAG[pancakeswap] @HTAG[paraswap] @HT...
2,1360137307047694337,"to keep its ultra bullish run intact , @CURR[egld] bulls need to keep @CURR[egld] / @CURR[usdt] daily above @NUM[148.0] dollar . reclaiming @NUM[174.0] dollar would be superb . break @NUM[148.0] d..."
4,1360132401142366210,next coin that goes @NUM[100.0] percent . . . buckle up . . . @CURR[xtz] @CURR[xtz] @CURR[tezos] look @ my calls from last 2 weeks @CURR[iota] @CURR[coti] tezos will move hard incoming days . @CUR...
5,1360131434158170113,its gonna be huge ! üöÄ üòç üëë @HTAG[fetch_ai] üëë @CURR[xrp] @HTAG[vechain] @HTAG[chainlink] @HTAG[cardano] @HTAG[algorand] @HTAG[altcoins] @HTAG[artificialintelligence] @HTAG[blockchain]
...,...,...
19995,1357792968455946242,cash is trash @CURR[bitcoin]
19996,1357792933982928896,global central bank efforts to limit u . s . dollars decline raises specter of currency war @CURR[bitcoin]
19997,1357792930359107588,"what if @CURR[bitcoin] is a social experiment ? well , money was ."
19998,1357792864005095424,@CURR[bitcoin] btw that was pre close ny - cme friday dump . pl are closing positions b4 weekend .


### TODO:
* numbers
