In [25]:
import json
import re
import pickle
from collections import defaultdict

In [26]:
# Build the set of characters treated as emojis: every non-whitespace
# character found in data/stat.txt becomes one entry.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding.
# Whitespace is dropped everywhere (not only at the ends) so that separators
# between emojis cannot end up as categories of their own.
# NOTE(review): the layout of data/stat.txt is not visible here - confirm it
# contains only emoji characters (plus optional whitespace).
with open('data/stat.txt', encoding='utf-8') as fin:
    emojis = {ch for ch in fin.read() if not ch.isspace()}


In [27]:
# All patterns are raw strings so that regex escapes (\s, \w, \., \b) reach the
# regex engine unchanged instead of being parsed as (invalid) string escapes.

# Integer or decimal number with optional leading minus. The fractional part
# needs at least one digit, so a sentence-ending period ("... 4.") is kept.
re_num = re.compile(r'-?[0-9]+(?:\.[0-9]+)?')
# Hashtag or @mention: '#'/'@' followed by anything up to whitespace or ! ? .
re_tag_user = re.compile(r'[#@][^\s!?.]+')
# URL-like token: optional scheme, one or more "segment/" parts, then a tail.
re_url = re.compile(r'(?:https?:\/\/)?(?:[\w\.]+\/+)+(?:\.?[\w]{2,})+')
# Any character outside word chars and . ! ? , ' -- or a retweet marker.
# \b keeps "RT" from matching the tail of ordinary words such as "START ".
re_invalid = re.compile(r"[^\w.!?,']|\bRT[: ]")
# Runs of whitespace, collapsed to a single space.
re_spaces = re.compile(r'\s+')


def preprocess(text):
    """Normalize a tweet's text for use as a training sample.

    Steps, in order: drop hashtags and @mentions, replace numbers with ``N``,
    replace URL-like tokens with ``URL``, blank out retweet markers and every
    character other than word characters and ``. ! ? , '``, collapse
    whitespace, then strip and lowercase (so the placeholders end up as
    ``n`` and ``url``).

    Args:
        text: Raw tweet text.

    Returns:
        The normalized, lowercased string; may be empty.
    """
    text = re_tag_user.sub('', text)
    # Numbers are replaced before URLs; digits inside a URL become 'N', which
    # is still a word character, so the URL pattern continues to match.
    text = re_num.sub('N', text)
    text = re_url.sub('URL', text)
    text = re_invalid.sub(' ', text)
    text = re_spaces.sub(' ', text)
    return text.strip().lower()

preprocess('RT: aaa #bbb_ccc i\'m ddd #e 1234.58 # https://t.co/orz/fjiewa gg @user!')

"aaa i'm ddd n url gg !"

In [28]:
# Upper bound on the number of distinct normalized texts kept per emoji.
MAX_SAMPLES = 500000

# samples: emoji character -> set of distinct normalized texts containing it.
samples = defaultdict(set)
# Number of emojis whose set has reached MAX_SAMPLES. Tracking this
# incrementally avoids re-scanning every bucket for each processed line.
full_count = 0

# Each non-blank line of the input is parsed as a JSON object with a 'text' key.
with open('data/extracted.list', encoding='utf-8') as fin:
    for line in fin:
        if not line.strip():
            continue
        text = json.loads(line)['text']
        # Tracked emoji characters that occur in this text.
        present = emojis.intersection(text)
        if not present:
            continue
        normalized = preprocess(text)
        if not normalized:
            continue
        for ch in present:
            bucket = samples[ch]
            if len(bucket) >= MAX_SAMPLES or normalized in bucket:
                continue
            bucket.add(normalized)
            if len(bucket) == MAX_SAMPLES:
                full_count += 1
        # Stop early only once *every* tracked emoji is full. Comparing against
        # the emojis seen so far could end the scan before other tracked emojis
        # have been encountered at all.
        if full_count == len(emojis):
            break

# picked data: map<emoji, set<text>>
#     where len(data[*]) <= MAX_SAMPLES (fewer when the input runs out first)
with open('data/dataset.pickle', 'wb') as fout:
    pickle.dump(samples, fout)

In [22]:
# Preview the normalized text of the records in the first 200 lines.
with open('data/extracted.list', encoding='utf-8') as fin:
    # zip with range caps the preview at 200 lines and ends cleanly at EOF
    # (readline() would return '' there and json.loads('') raises).
    for _, line in zip(range(200), fin):
        # Blank lines are skipped, matching the sample-collection loop.
        if not line.strip():
            continue
        t = json.loads(line)
        print(preprocess(t['text']))

happy new year!! url
like.... url
finally n !!!!! happy new year from mountain standard time !!!!!!!!!!
when you say you're just gonna take a sip of wine and suddenly you're on your nth glass
king of popsplit url url
happy new years!!!
happy new years url
happy new year
friends.... on a new day of n i wnts to say, open your blind eyes sickular hindu really they r in trouble https
lionel richie all night long url
orbitcounter
i want your attention, but i won't beg for it
henlo n
amp i are so mean but we don't give a fuck
she hid under the bed to test her relationship. what her boyfriend did had her floored url
can't figure out if it's the end or beginning
happy new years from the savage url url
instagram cookiemonstern with url url
happy new years. be safe amp stay blessed
highly blessed to see another year
if i said i was at church, don't ask me what my religion is!
wishing our beautiful little wombat, wattle, a very happy nst birthday url
oiya, happy new year n pic url
hope n gives me

In [None]:
# Number of samples collected for this emoji. .get() is used because indexing
# the defaultdict would insert an empty entry when the emoji was never seen.
len(samples.get('😂', ()))

In [23]:
# Preview the raw (un-normalized) text of the records in the first 200 lines.
with open('data/extracted.list', encoding='utf-8') as fin:
    # zip with range caps the preview at 200 lines and ends cleanly at EOF
    # (readline() would return '' there and json.loads('') raises).
    for _, line in zip(range(200), fin):
        # Blank lines are skipped, matching the sample-collection loop.
        if not line.strip():
            continue
        t = json.loads(line)
        print(t['text'])


RT @AllyBrooke: HAPPY NEW YEAR!! 🎆🎈 🎉🥂 https://t.co/Nk5xeQAO5t
RT @Whoadieeupnext: 💁🏽🙄 like.... https://t.co/t81GzcvfZ7
FINALLY 2017 !!!!! Happy New Year from Mountain Standard Time !!!!!!!!!! 🎉🎉🎉
When you say you're just gonna take a sip of wine and suddenly you're on your 4th glass 🍷
KING OF POPSPLIT 👑
https://t.co/HdbX7luOUb
#agario #FUNNY #TURKEY #europe #game #trolling #POKEMONGO #NEW… https://t.co/VNEx4CXcFM
HAPPY NEW YEARS!!! 💞✨
RT @seIfcritics: Happy New Years ✨💕 https://t.co/K0o0hACXdj
RT @BrandonGottFans: Happy New Year🌎❤️
RT @IntolerantMano2: Friends....
On a new day of 2017 I wnts to say,

Open Your Blind Eyes Sickular Hindu
Really they r in trouble😏 https:/…
RT @mrblackcat1069: Lionel Richie 
All Night Long 🔥🔥🔥
#Hello2017
#Inspiration 
#Motivation 
#MrBlackCat1069 
#NewDay… https://t.co/tbrNxL9Y…
orbitCounter++

🎉
RT @TantrumDealo: i want your attention, but I won't beg for it 💁
Henlo 2015😎
RT @clariecazares18: @Robin_nicole71 &amp; I are so mean but we don't give a fuck 😂