In [2]:
import os
import re
import json
import pandas as pd
import numpy as np
import asyncio
from functools import partial
import twitterscraper as tws
from IPython.core.debugger import set_trace
from tqdm import tqdm_notebook
# from konlpy.tag import Okt
from multiprocessing import Pool, Process

In [3]:
def _or(kw):
    """Turn a comma-separated keyword string into a '#a OR #b OR ...' hashtag query."""
    hashtags = ['#' + part.strip() for part in kw.split(',')]
    return ' OR '.join(hashtags)


# Keyword sheet restricted to the columns used below, indexed by item.
df = (
    pd.read_excel('keywords.xlsx', skiprows=1)
    [['item', 'keyword_eng', 'keyword_kr', 'intersection']]
    .set_index('item')
)

# English and Korean keywords are merged into one comma-separated string per item,
# then converted into an OR-joined hashtag query.
df['keyword'] = df['keyword_eng'] + ', ' + df['keyword_kr']
df['qry'] = df['keyword'].apply(_or)

# Items flagged in the `intersection` column are additionally AND-ed with the
# combined queries of these three items.
intersect_with = ['ootd', 'category', 'fashion']
screener = ' OR '.join(df.loc[intersect_with, 'qry'])
df.loc[df['intersection'] == True, 'qry'] = '(' + df['qry'] + ') AND (' + screener + ')'

# item -> query string
qry = dict(df['qry'])

In [4]:
# Display the item -> query-string mapping.
qry

{'ootd': '#dailylook OR #outfit OR #selfie OR #bestofstreetwear OR #beststreetoutfit OR #ootd OR #outfitoftheday OR #데일리룩 OR #오오티디',
 'category': '#designer OR #streetwear OR #luxury OR #casual OR #sportswear OR #디자이너 OR #길거리패션 OR #럭셔리 OR #캐주얼 OR #캐쥬얼 OR #스포츠웨어 OR #스포츠패션',
 'fashion': '#fashion OR #brand OR #fashionbrand OR #shoe OR #shoes OR #clothes OR #wear OR #apparel OR #패션 OR #브랜드 OR #패션브랜드 OR #옷 OR #의류 OR #슈즈 OR #신발',
 '8seconds': '#8seconds OR #에잇세컨즈',
 'adidas': '#adidas OR #아디다스',
 'adidasoriginal': '#adidasoriginal OR #아디다스오리지널스 OR #아디다스오리지널 OR #아디다스오리지날스 OR #아디다스오리지날',
 'asics': '#asics OR #아식스',
 'balensiaga': '#balensiaga OR #발렌시아가',
 'bally': '#bally OR #발리',
 'beanpole': '#beanpole OR #빈폴',
 'benetton': '#benetton OR #베네통',
 'blackyak': '#blackyak OR #블랙야크',
 'buckaroo': '#buckaroo OR #버커루',
 'burberry': '#burberry OR #버버리',
 'calvinklein': '#calvinklein OR #케빈클라인 OR #켈빈클라인 OR #캘빈클라인',
 'canadagoose': '#canadagoose OR #캐나다구스',
 'chanel': '#chanel OR #샤넬',
 'coach': '#co

In [39]:
def _period(start=None, end=None):
    if start is None: start = '2010-01-01'
    if end is None: end = pd.Timestamp.today()
    
    return pd.Timestamp(start).date(), pd.Timestamp(end).date()

In [40]:
def scrap(qry_dict, what=['id','text'], lang=None, start=None, end=None, download_to='twitter_data'):
    """Run one twitterscraper query per item and pickle each result as a DataFrame.

    Parameters
    ----------
    qry_dict : dict
        Maps an item name to the query string passed to ``tws.query_tweets``.
    what : list of str
        Attribute names kept from every returned tweet object (read-only here).
    lang : str or None
        Forwarded to ``tws.query_tweets`` as ``lang``.
    start, end : date-like or None
        Scraping window; ``None`` means 2010-01-01 / today respectively.
    download_to : str
        Output directory, created if missing. Each item is written to
        ``<download_to>/<item> (<start>-<end>).pkl`` with dates as YYYY.MM.DD.

    Returns
    -------
    None. A failing item does not abort the others; its name and the error
    are printed instead.
    """
    # This helper must be defined before its first use: a nested ``def`` makes
    # ``_period`` a local name of ``scrap``, so calling it above its definition
    # raises UnboundLocalError even though a module-level ``_period`` exists.
    def _period(start=None, end=None):
        if start is None: start = '2010-01-01'
        if end is None: end = pd.Timestamp.today()

        return pd.Timestamp(start).date(), pd.Timestamp(end).date()


    start, end = _period(start=start, end=end)
    filename = download_to + '/{item} (' + str(start).replace('-','.') + '-' + str(end).replace('-','.') + ').pkl'

    os.makedirs(download_to, exist_ok=True)


    def _pretty(queried):
        # Keep only the requested attributes of every tweet object.
        return [{k:v for k,v in qrd.__dict__.items() if k in what} for qrd in queried]


    async def _scrap(item, q):
        try:
            # query_tweets is a blocking call, so it is pushed to the loop's default executor.
            q_tw_partial = partial(tws.query_tweets, lang=lang, begindate=start, enddate=end)
            queried = await loop.run_in_executor(None, q_tw_partial, q)
            res = _pretty(queried)

            df = pd.DataFrame(res)
            # Tag every column with the item name as the outer column level.
            df.columns = pd.MultiIndex.from_arrays([[item]*len(df.columns), df.columns])
            df.to_pickle(filename.format(item=item))

        except Exception as exc:
            # Best effort: report the failing item together with the reason and carry on.
            print(item, repr(exc))


    async def main():
        fts = [asyncio.ensure_future(_scrap(item, q)) for item, q in qry_dict.items()]
        await asyncio.gather(*fts)


    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        # Close the loop even when the run is interrupted or fails.
        loop.close()

In [41]:
# Subset of the item queries used for a trial scraping run.
qry_test = {item: qry[item] for item in ('aldo', 'allsaints', 'crocs', 'freitag', 'salomon')}

In [42]:
%%time
# Timed trial run: scrape the test queries, passing lang='ko' through to scrap().
scrap(qry_test, lang='ko')

INFO: queries: ['#aldo OR #알도 since:2010-01-01 until:2010-06-16', '#aldo OR #알도 since:2010-06-16 until:2010-11-29', '#aldo OR #알도 since:2010-11-29 until:2011-05-14', '#aldo OR #알도 since:2011-05-14 until:2011-10-27', '#aldo OR #알도 since:2011-10-27 until:2012-04-11', '#aldo OR #알도 since:2012-04-11 until:2012-09-24', '#aldo OR #알도 since:2012-09-24 until:2013-03-09', '#aldo OR #알도 since:2013-03-09 until:2013-08-22', '#aldo OR #알도 since:2013-08-22 until:2014-02-04', '#aldo OR #알도 since:2014-02-04 until:2014-07-21', '#aldo OR #알도 since:2014-07-21 until:2015-01-03', '#aldo OR #알도 since:2015-01-03 until:2015-06-18', '#aldo OR #알도 since:2015-06-18 until:2015-12-01', '#aldo OR #알도 since:2015-12-01 until:2016-05-15', '#aldo OR #알도 since:2016-05-15 until:2016-10-29', '#aldo OR #알도 since:2016-10-29 until:2017-04-13', '#aldo OR #알도 since:2017-04-13 until:2017-09-26', '#aldo OR #알도 since:2017-09-26 until:2018-03-11', '#aldo OR #알도 since:2018-03-11 until:2018-08-24', '#aldo OR #알도 since:2018-08-24 unt

Wall time: 1min 59s


In [3]:
# Load one scraped pickle and drop the outer column level to view the plain columns.
pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.06).pkl').droplevel(0, axis=1)

Unnamed: 0,id,text
0,394879338367418369,새롭게 론칭하는 슈즈 컬렉션 below the knee #올세인츠 pic.twitt...
1,385702682800824320,WORN IN LONDON. 최고급 캐시미어 니트와 레더 스커트를 통해 올세인츠가 ...
2,382463170624778242,고쳐야 저녁에 반짝반짝 불이 들어오지.. 아저씨들 고고 #올세인츠 pic.twitt...
3,377359278979293184,#올세인츠 #NYFW S14 프리젠테이션pic.twitter.com/ikJL5wDqq4
4,376804666035961856,발등에 피어싱이 장식된 시크한 로퍼! 올세인츠에서 가을부터 선보이는 슈즈 컬렉션 ...
5,376804230742691840,올세인츠의 프리젠테이션장에는 입고 싶은 룩 가득! 내년에 우리나라에서도 정식런칭된다...
6,375914648026873856,#올세인츠 가을 신상품 이 코트 정말 괜찮은듯 pic.twitter.com/RzLf...
7,714423832585449472,Outfit Over The World.\n: #allsaints 청자켓 #fog ...
8,704816282122612736,#dailylook #daily #neilbarrett #frenchbulldog ...
9,703247033507840000,Outfit Over The World.\n: #allsaints 셔츠 #johne...


In [2]:
def preproc(text, url=True, mention=True, hashtag=True, remove=True):
    """Lower-case a tweet and extract its URLs, hashtags and mentions.

    Line breaks and the retweet marker ``' rt '`` are replaced by a single
    space before anything is extracted. Replacing them with an empty string
    would glue the neighbouring words together (``'#allsaints\\ncoat'`` would
    yield the hashtag ``allsaintscoat``).

    Parameters
    ----------
    text : str
        Raw tweet text.
    url, mention, hashtag : bool
        Whether to include the 'urls', 'mention' and 'hashtags' lists in the result.
    remove : bool
        If true, URLs, hashtags, mentions and emoji are stripped from the returned text.

    Returns
    -------
    dict
        Always contains 'text' (whitespace-stripped); contains 'urls',
        'hashtags' and 'mention' when the matching flag is set. Hashtags and
        mentions are returned without their leading '#' / '@'.
    """
    LINEBREAK = r'\n'  # regex for a newline; the raw string r'\n' would not match with str.replace
    RT = ' rt '
    EMOJI = r'[\U00010000-\U0010ffff]'

    # \u3131-\u3163 and \uac00-\ud7a3 are the Hangul ranges, excluded so Korean text is never part of a URL.
    URL = r'(?:https?:\/\/)?(?:www[.])?[^ :\u3131-\u3163\uac00-\ud7a3]+[.][a-z]{2,6}\b(?:[^ \u3131-\u3163\uac00-\ud7a3]*)'
    HASHTAG = r'(#[^ #@]+)'
    MENTION = r'(@[^ #@]+)'

    SEPARATORS = '|'.join((LINEBREAK, RT))
    PTNS = '|'.join((LINEBREAK, RT, URL, HASHTAG, MENTION, EMOJI))

    out = {}
    # Use a space, not '', so words on either side of the separator stay apart.
    text = re.sub(SEPARATORS, ' ', text.lower())

    if url:
        found_urls = re.findall(URL, text)
        # Non-breaking spaces can end up inside a matched URL; drop them.
        out['urls'] = [found.replace('\xa0', '') for found in found_urls]

    if hashtag:
        out['hashtags'] = [tag[1:] for tag in re.findall(HASHTAG, text)]

    if mention:
        out['mention'] = [name[1:] for name in re.findall(MENTION, text)]

    if remove:
        text = re.sub(PTNS, '', text)

    out['text'] = text.strip()
    return out

In [3]:
# Spot-check preproc on the tweet at position 210 of the allsaints data.
preproc(pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.06).pkl').droplevel(0,axis=1).text.iloc[210], remove=True)

{'urls': ['pic.twitter.com/02uefuhs6n'],
 'hashtags': ['올세인츠', 'allsaints'],
 'mention': [],
 'text': 'vic_to_ri_a2: 음..괜찮다~ㅎ'}

In [5]:
def tokenize(file):
    """Tokenize every tweet stored in one scraped pickle file.

    The pickle is read, its outer column level dropped, and each ``text``
    entry is run through ``preproc``. The cleaned text plus its hashtags are
    POS-tagged with ``Okt`` (``norm=True, stem=True``); tokens tagged
    'Punctuation' or 'Josa' are discarded.

    Returns a DataFrame with the original ``id`` column and a ``tokenized``
    column holding one token list per tweet.

    NOTE(review): ``Okt`` is not imported in the visible cells (the konlpy
    import is commented out) — confirm it is in scope before running.
    """
    excluded_tags = ['Punctuation', 'Josa']

    df = pd.read_pickle(file).droplevel(0, axis=1)
    tagger = Okt()

    def _tokens(doc):
        # Hashtags are appended to the cleaned text so they are tokenized too.
        parts = preproc(doc)
        sentence = parts['text'] + ' ' + ' '.join(parts['hashtags'])
        tagged = tagger.pos(sentence, norm=True, stem=True)
        return [pair[0] for pair in tagged if pair[1] not in excluded_tags]

    tokenized = [_tokens(doc) for doc in tqdm_notebook(df.text.iloc[:])]

    out = df[['id']].copy()
    out['tokenized'] = tokenized
    return out

In [15]:
%%time
# Baseline timing: tokenize the five files one after another in this process.
# The returned frames are not stored.
tokenize('twitter_data/allsaints (2010.01.01-2019.02.07).pkl')
tokenize('twitter_data/aldo (2010.01.01-2019.02.07).pkl')
tokenize('twitter_data/crocs (2010.01.01-2019.02.07).pkl')
tokenize('twitter_data/freitag (2010.01.01-2019.02.07).pkl')
tokenize('twitter_data/salomon (2010.01.01-2019.02.07).pkl')

HBox(children=(IntProgress(value=0, max=335), HTML(value='')))




HBox(children=(IntProgress(value=0, max=170), HTML(value='')))




HBox(children=(IntProgress(value=0, max=891), HTML(value='')))




HBox(children=(IntProgress(value=0, max=403), HTML(value='')))




HBox(children=(IntProgress(value=0, max=109), HTML(value='')))


Wall time: 20.9 s


In [20]:
%%time
proc1 = Process(target=tokenize, args=('twitter_data/allsaints (2010.01.01-2019.02.06).pkl',))
proc1.start()
proc1.join()

proc2 = Process(target=tokenize, args=('twitter_data/aldo (2010.01.01-2019.02.06).pkl',))
proc2.start()
proc2.join()

Wall time: 1.93 s


In [2]:
from concurrent.futures import ProcessPoolExecutor as Executor
from texcrapy.twitter import tokenize

In [24]:
%%time
# Submit both files to a process pool, then block until each result is available.
with Executor() as executor:
    future1 = executor.submit(tokenize, 'twitter_data/allsaints (2010.01.01-2019.02.06).pkl')
    future2 = executor.submit(tokenize, 'twitter_data/aldo (2010.01.01-2019.02.06).pkl')
    val1, val2 = future1.result(), future2.result()

Wall time: 26.1 s


In [9]:
%%time
if __name__ == '__main__':
    res = Pool(4).map(tokenize, fnames)

Wall time: 53.7 s


In [14]:
%%time
with Pool(4) as pool:
    out = []
    for fname in fnames:
        res = pool.apply_async(tokenize, (fname, ))
        out.append(res.get())

Wall time: 1min 49s


In [12]:
# Display the list of tokenized frames collected from the pool.
out

[                      id                                          tokenized
 0     424094692545150976  [sunny, 오늘, 뱅쿠버, 날씨, 짱, 따다, ♡♡, daily, dailylo...
 1     665816281694752768  [한, 여름밤, 꿈, e, ourtros, 31, seguiram, aos, 12....
 2     665072806250442753  [하이, 오르다, 해외, 구매, 대행, 쇼핑몰, aldo, 알도, 슈즈, 세, 일,...
 3     480341446751289345  [facebook, update, 140621, 가로수길, 오픈, 행사, 해, 저,...
 4     477203073966673920  [iamhanbonnie, 오빠, 선물, 해주다, 두번째, 신발, 나다, 하나, 지...
 5     476051667125682176  [이쁘다, 사이즈, 없다, aldo, 알도, 알도, aldo, 알도, 매장, 강남역...
 6     473398446322249728  [칼퇴, 택시, 크다, 록스, 알도, 다, 오다, 보다, 찾다, 신발, 모두, 품절...
 7     458017584474882048     [lamberti, my, italian, cousin, 도랏, ㅋㅋㅋ, aldo]
 8     444365352588173312  [알도, vs, 마스, 알도, 스탠딩, 전략, 동영상, 알도, vs, 마스, 알도,...
 9     444264867030380544                        [우, 오다, 강남역, 강남역, 알도, aldo]
 10    369020028038811649  [blog, post, 2013-08, 18, 04, h, korea, topic,...
 11    369003690138423296  [blog, post, 2013-08, 18, 03, h, korea, topic,...

In [10]:
srcdir = 'twitter_data'
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# file order, and keep only the pickle files that tokenize can read.
fnames = sorted(srcdir + '/' + fname for fname in os.listdir(srcdir) if fname.endswith('.pkl'))

In [5]:
# Display the list of input file paths.
fnames

['twitter_data/aldo (2010.01.01-2019.02.07).pkl',
 'twitter_data/allsaints (2010.01.01-2019.02.07).pkl',
 'twitter_data/crocs (2010.01.01-2019.02.07).pkl',
 'twitter_data/freitag (2010.01.01-2019.02.07).pkl',
 'twitter_data/salomon (2010.01.01-2019.02.07).pkl']

In [74]:
tagger = Okt()

def tokenize2(docs):
    """Tokenize a chunk of tweet texts using the module-level ``tagger``.

    Each text goes through ``preproc``; the cleaned text plus its hashtags are
    POS-tagged (``norm=True, stem=True``) and tokens tagged 'Punctuation' or
    'Josa' are dropped.

    Parameters
    ----------
    docs : pandas.Series or iterable of str
        Texts to tokenize. When a Series (e.g. a slice of ``df.text``) is
        passed, its index selects the matching ``id`` rows from the global
        ``df``; any other iterable must cover all of ``df``.

    Returns
    -------
    DataFrame with ``id`` and ``tokenized`` columns, one row per input text.

    NOTE(review): relies on a global ``df`` with ``id`` and ``text`` columns
    that is not created in the visible cells — confirm which frame is meant,
    and that its index labels are unique.
    """
    tokenized = []

    for doc in docs:
        preprocessed = preproc(doc)
        text, hashtags = preprocessed['text'], preprocessed['hashtags']
        text += ' ' + ' '.join(hashtags)
        _tokenized = [t[0] for t in tagger.pos(text, norm=True, stem=True) if t[1] not in ['Punctuation', 'Josa']]
        tokenized.append(_tokenized)

    # Take ids only for the rows in this chunk. Copying the whole frame makes
    # the assignment below fail with a length mismatch for any partial chunk.
    if isinstance(docs, pd.Series):
        out = df.loc[docs.index, ['id']].copy()
    else:
        out = df[['id']].copy()

    out['tokenized'] = tokenized
    return out

In [82]:
# Seven consecutive slices of df.text: six with a span of 50, then everything from 300 on.
_cuts = [None, 50, 100, 150, 200, 250, 300, None]
dfs = tuple(df.text[lo:hi] for lo, hi in zip(_cuts[:-1], _cuts[1:]))

In [None]:
# The context manager shuts the workers down once map() has returned, so no
# pool processes are left behind in the kernel.
with Pool(processes=4) as pool:
    tokenized_chunks = pool.map(tokenize2, dfs)

tokenized_chunks

In [1]:
import pandas as pd
import json
# Load the token dictionary from JSON.
# NOTE(review): no encoding is given, so open() uses the platform default —
# confirm this matches how the file was written.
with open('./model/tokens_dict.json') as f:
    tokens_dict = json.load(f)

In [70]:
# Write the token dictionary to a second file. ensure_ascii=False keeps
# non-ASCII characters unescaped; the 'UTF-8-sig' codec prepends a BOM.
with open('./model/tokens_dict2.json', 'w', encoding='UTF-8-sig') as f:
    json.dump(tokens_dict, f, ensure_ascii=False)