In [1]:
import pathlib
import numpy as np
import pandas as pd
#For displaying complete rows info
from utils.datasets import load_helper_file
pd.options.display.max_colwidth=500
import nltk
from nltk.corpus import stopwords
from collections import Counter

In [4]:
# Make sure the NLTK stopword corpus is available locally; the recorded output
# shows the download is skipped when the package is already up to date.
nltk.download('stopwords')
# English stopword list, used below to filter tokens while counting frequencies.
all_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/egordm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Count how often each whitespace-separated token occurs across all parquet
# chunks, skipping English stopwords.
counter = nltk.FreqDist()
# `all_stopwords` is a list, so `w not in all_stopwords` scans it for every
# token; a set turns each check into a hash lookup with the same result.
stopword_set = set(all_stopwords)
files = pathlib.Path("../../data/bitcoin_twitter_labeled/").glob("part_*.parquet")
for chunk, file in enumerate(files):
    print(f'Processing chunk: {chunk}')
    # Only the `text` column is read, so the other columns are not loaded.
    data = pd.read_parquet(file, columns=['text'])
    # Iterate the column directly: `DataFrame.iterrows()` builds a Series per
    # row, which is needless overhead when a single column is used.
    for text in data['text']:
        counter.update(w for w in text.split() if w not in stopword_set)

Processing chunk: 0
Processing chunk: 1
Processing chunk: 2
Processing chunk: 3
Processing chunk: 4
Processing chunk: 5
Processing chunk: 6
Processing chunk: 7
Processing chunk: 8
Processing chunk: 9
Processing chunk: 10
Processing chunk: 11
Processing chunk: 12


In [6]:
# Inspect the 50 most frequent raw tokens (before any cleaning is applied).
counter.most_common(50)

[('.', 2470299),
 ('#bitcoin', 2023088),
 ('#btc', 1175659),
 (',', 986565),
 (':', 829146),
 ('-', 790412),
 ('!', 660153),
 ('#crypto', 543418),
 ('bitcoin', 534323),
 ('$', 532188),
 ('#cryptocurrency', 514838),
 ('#eth', 403194),
 ('#blockchain', 393863),
 ('#ethereum', 332338),
 ('#', 290881),
 ('?', 289975),
 ('(', 242684),
 ('%', 231298),
 ('"', 230656),
 (')', 228900),
 ('/', 217360),
 ('#altcoins', 196779),
 ('0', 196000),
 ('1', 184940),
 ('#xrp', 172261),
 ('#ltc', 160163),
 (';', 158103),
 ('btc', 151414),
 ('price', 150865),
 ('new', 136892),
 ('crypto', 136638),
 ("'", 130968),
 ('&', 126308),
 ('amp', 125962),
 ('get', 124667),
 ('via', 121782),
 ('#litecoin', 121723),
 ('free', 117143),
 ('#ico', 111569),
 ('buy', 100579),
 ('2', 95117),
 ('#altcoin', 91862),
 ('#dogecoin', 90125),
 ('|', 87441),
 ('us', 87023),
 ('market', 86074),
 ('#fintech', 85534),
 ('blockchain', 82585),
 ('+', 82252),
 ('3', 80082)]

In [7]:
# Work on a copy: the cells below mutate `cleaned_counter`, while `counter`
# keeps the raw counts.
cleaned_counter = counter.copy()

In [8]:
# Maps an alias token to the canonical token its count is folded into.
custom_synonyms = {
 "#btc": '#bitcoin',
 "btc": '#bitcoin',
 "bitcoins": '#bitcoin',
 "bitcoin": '#bitcoin',
 "@bitcoin": '#bitcoin',
 "#crypto": '#cryptocurrency',
 "#eth": '#ethereum',
 "ethereum": '#ethereum',
 "eth": '#ethereum',
 "#bch": '#bitcoincash',
 "bch": '#bitcoincash',
 "hodl": '#hold',
 "bitcoincash": '#bitcoincash',
 "#ltc": '#litecoin',
 "litecoin": '#litecoin',
 "#doge": '#dogecoin',
 "doge": '#dogecoin',
 "airdrop": '#airdrop',
 "cryptocurrencies": '#cryptocurrencies',
 "crypto": '#crypto',
 "xrp": '#xrp',
 "altcoin": '#altcoins',
 "altcoins": '#altcoins',
 "#trx": '#tron',
 "trx": '#tron',
 "fb": '#facebook',
}

# NOTE(review): entries are applied in dict order, so "#crypto" is folded into
# '#cryptocurrency' and removed before "crypto" is folded into '#crypto'; the
# plain "crypto" count therefore ends up under '#crypto', not
# '#cryptocurrency'. Confirm whether that is intended.
for alias, canonical in custom_synonyms.items():
    # Merge and remove only aliases that were actually counted. Popping an
    # absent key raises KeyError, which would abort the cell for any alias
    # missing from the corpus and on every re-run of this cell.
    if alias in cleaned_counter:
        cleaned_counter[canonical] += cleaned_counter[alias]
        cleaned_counter.pop(alias)

In [9]:
# Characters whose single-character tokens are removed from the counts.
SYMBOLS = '{}()[].,:;?#!+-*/&|<>@%"\'=~$1234567890'
# Iterating the string yields one character at a time, so only tokens that are
# exactly one of these characters are dropped; longer tokens such as '10' stay.
for symbol in SYMBOLS:
    cleaned_counter.pop(symbol, None)

In [10]:
# Top 100 tokens after synonym merging and single-character symbol removal.
cleaned_counter.most_common(100)

[('#bitcoin', 3884484),
 ('#cryptocurrency', 1058256),
 ('#ethereum', 797136),
 ('#blockchain', 393863),
 ('#litecoin', 281886),
 ('#altcoins', 196779),
 ('#xrp', 172261),
 ('price', 150865),
 ('new', 136892),
 ('crypto', 136638),
 ('#dogecoin', 132817),
 ('amp', 125962),
 ('get', 124667),
 ('via', 121782),
 ('free', 117143),
 ('#ico', 111569),
 ('#bitcoincash', 104133),
 ('buy', 100579),
 ('#altcoin', 91862),
 ('us', 87023),
 ('market', 86074),
 ('#fintech', 85534),
 ('blockchain', 82585),
 ('#trading', 75810),
 ('#ripple', 75657),
 ('time', 73849),
 ('10', 72676),
 ('join', 71169),
 ('one', 69867),
 ('#cryptocurrencies', 69273),
 ('like', 67861),
 ('#news', 67193),
 ('#etc', 65900),
 ('exchange', 65465),
 ('today', 65404),
 ('#trx', 64429),
 ('trading', 63899),
 ('money', 63424),
 ('cryptocurrency', 62098),
 ('usd', 61108),
 ('000', 60605),
 ('news', 57204),
 ('first', 56884),
 ('token', 56761),
 ('#eos', 56459),
 ('#dash', 56429),
 ('#money', 55143),
 ('mining', 54979),
 ('wallet', 

In [3]:
# Stored as a set so the `in` checks in the following cells are hash lookups.
# NOTE(review): presumably the helper returns the BERT uncased vocabulary
# tokens as an iterable of strings -- confirm against `load_helper_file`.
# NOTE(review): this cell's execution count (3) is lower than the cells above
# it; verify the notebook still runs top to bottom on a fresh kernel.
bert_uncased_vocabulary = set(load_helper_file('helper_bert_uncased_vocabulary'))

In [11]:
# Tokens from the tweets that already exist in the BERT vocabulary.
used_words = {token for token in cleaned_counter if token in bert_uncased_vocabulary}
# Drop those tokens so `cleaned_counter` keeps only out-of-vocabulary tokens.
# Note: this mutates `cleaned_counter`, so re-running the cell yields an empty
# `used_words`.
for token in used_words:
    cleaned_counter.pop(token)

In [12]:
# Number of in-vocabulary tokens found, then number of tokens left over.
print(len(used_words), len(cleaned_counter), sep='\n')

21704
728687


In [23]:
# Most frequent tokens that are not in the BERT vocabulary (top 1000).
cleaned_counter.most_common(1000)

[('#bitcoin', 3884484),
 ('#cryptocurrency', 1058256),
 ('#ethereum', 797136),
 ('#blockchain', 393863),
 ('#litecoin', 281886),
 ('#altcoins', 196779),
 ('#xrp', 172261),
 ('crypto', 136638),
 ('#dogecoin', 132817),
 ('#ico', 111569),
 ('#bitcoincash', 104133),
 ('#altcoin', 91862),
 ('#fintech', 85534),
 ('blockchain', 82585),
 ('#trading', 75810),
 ('#ripple', 75657),
 ('#cryptocurrencies', 69273),
 ('#news', 67193),
 ('#etc', 65900),
 ('#trx', 64429),
 ('cryptocurrency', 62098),
 ('#eos', 56459),
 ('#dash', 56429),
 ('#money', 55143),
 ('#airdrop', 52101),
 ('#cryptonews', 51699),
 ('#binance', 49606),
 ('#xlm', 47056),
 ('#ada', 45868),
 ('#neo', 45398),
 ('bitcoins', 36650),
 ('#xmr', 34374),
 ('#bitcoinnews', 33107),
 ('#bitcoin,', 32356),
 ('#investing', 32248),
 ('#btcusd', 31384),
 ('#mining', 30445),
 ('#xvg', 29520),
 ('#bnb', 28977),
 ('#forex', 28731),
 ('#tron', 27652),
 ('#tokensale', 27622),
 ('airdrop', 26928),
 ('#cryptotrading', 26760),
 ('#exchange', 26621),
 ('#co

In [22]:
# Vocabulary entries that never occurred in the tweets, ignoring entries that
# start or end with '#'.
unused_words = {
    w
    for w in bert_uncased_vocabulary
    if not (w.startswith('#') or w.endswith('#')) and w not in used_words
}
print(len(unused_words))

2993


Exchanges
* coinbase
* binance
* bitstamp
* bitpay

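Hashtags should not end with a punctuation symbol: e.g. `#bitcoin,` is currently counted separately from `#bitcoin`, so trailing symbols should be stripped before counting.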

Currency symbol normalization

country/time hashtag expansion