In [73]:
import asyncio
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
import twitterscraper as tws
from IPython.core.debugger import set_trace
from konlpy.tag import Okt
from tqdm import tqdm_notebook

In [2]:
# Keyword table: one row per item, indexed by the item name.
df = pd.read_excel('keywords.xlsx')[['item', 'keyword_eng', 'keyword_kr', 'intersection']].set_index('item')


def _or(kw):
    """Turn a comma-separated keyword string into an ' OR '-joined hashtag query."""
    return ' OR '.join('#' + token.strip() for token in kw.split(','))


# English and Korean keywords are merged into one comma-separated string per item.
df['keyword'] = df['keyword_eng'] + ', ' + df['keyword_kr']
df['qry'] = df['keyword'].apply(_or)

# Items flagged in the `intersection` column must additionally match at least
# one hashtag of the three generic items listed here.
intersect_with = ['ootd', 'category', 'fashion']
screener = ' OR '.join(df.loc[intersect_with, 'qry'])
df.loc[df['intersection'].eq(True), 'qry'] = '(' + df['qry'] + ') AND (' + screener + ')'

# Plain dict: item name -> search query.
qry = df['qry'].to_dict()

In [3]:
# Display the item -> search-query dict derived from the keyword table.
qry

{'ootd': '#dailylook OR #outfit OR #selfie OR #bestofstreetwear OR #beststreetoutfit OR #ootd OR #outfitoftheday OR #데일리룩 OR #오오티디',
 'category': '#designer OR #streetwear OR #luxury OR #casual OR #sportswear OR #디자이너 OR #길거리패션 OR #럭셔리 OR #캐주얼 OR #캐쥬얼 OR #스포츠웨어 OR #스포츠패션',
 'fashion': '#fashion OR #brand OR #fashionbrand OR #shoe OR #shoes OR #clothes OR #wear OR #apparel OR #패션 OR #브랜드 OR #패션브랜드 OR #옷 OR #의류 OR #슈즈 OR #신발',
 '8seconds': '#8seconds OR #에잇세컨즈',
 'adidas': '#adidas OR #아디다스',
 'adidasoriginal': '#adidasoriginal OR #아디다스오리지널스 OR #아디다스오리지널 OR #아디다스오리지날스 OR #아디다스오리지날',
 'asics': '#asics OR #아식스',
 'balensiaga': '#balensiaga OR #발렌시아가',
 'bally': '#bally OR #발리',
 'beanpole': '#beanpole OR #빈폴',
 'benetton': '#benetton OR #베네통',
 'blackyak': '#blackyak OR #블랙야크',
 'buckaroo': '#buckaroo OR #버커루',
 'burberry': '#burberry OR #버버리',
 'calvinklein': '#calvinklein OR #케빈클라인 OR #켈빈클라인 OR #캘빈클라인',
 'canadagoose': '#canadagoose OR #캐나다구스',
 'chanel': '#chanel OR #샤넬',
 'coach': '#co

In [6]:
def _period(start=None, end=None):
    if start is None: start = '2010-01-01'
    if end is None: end = pd.Timestamp.today()
    
    return pd.Timestamp(start).date(), pd.Timestamp(end).date()

In [7]:
def scrap(qry_dict, what=['id','text'], lang=None, start=None, end=None, download_to='twitter_data'):
    """Scrape tweets for every query and pickle one DataFrame per item.

    Parameters
    ----------
    qry_dict : dict
        Maps an item name to the query string handed to ``tws.query_tweets``.
    what : container of str
        Attribute names of each returned tweet object to keep as columns.
        The default list is only read, never modified.
    lang : str, optional
        Forwarded as ``lang`` to ``tws.query_tweets``.
    start, end : optional
        Period bounds; normalised by ``_period`` and forwarded as
        ``begindate`` / ``enddate``.
    download_to : str
        Directory that receives the pickles; created when missing.

    Returns
    -------
    None
        Each item is written to ``<download_to>/<item> (<start>-<end>).pkl``
        with both dates formatted as ``YYYY.MM.DD``.

    Raises
    ------
    Exception
        Whatever a query or a pickle write raised. The error surfaces only
        after every submitted query has finished, so the other items are
        still saved.
    """
    start, end = _period(start=start, end=end)
    span = str(start).replace('-', '.') + '-' + str(end).replace('-', '.')

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(download_to, exist_ok=True)

    query_tweets = partial(tws.query_tweets, lang=lang, begindate=start, enddate=end)

    def _scrap(item, q):
        # Runs in a worker thread: query, keep the requested attributes, save.
        tweets = query_tweets(q)
        rows = [{k: v for k, v in vars(tweet).items() if k in what} for tweet in tweets]
        pd.DataFrame(rows).to_pickle('{}/{} ({}).pkl'.format(download_to, item, span))

    # The queries are blocking calls, so they are run on worker threads
    # directly. The previous version wrapped the same threads in a private
    # asyncio loop, which replaced the thread's current event loop with one
    # that was closed afterwards, never closed it when a query failed, and
    # cannot be driven with run_until_complete() while the host environment
    # is already running a loop.
    with ThreadPoolExecutor() as pool:
        pending = [pool.submit(_scrap, item, q) for item, q in qry_dict.items()]
        for finished in as_completed(pending):
            finished.result()  # re-raise a worker's exception, if any

In [7]:
# Two-item subset of the queries, used below for a trial run of scrap().
qry_test = {'aldo':qry['aldo'], 'allsaints':qry['allsaints']}

In [8]:
%%time
# Trial run on the two test queries with lang='ko'; start/end are left at
# their defaults, so the period is resolved by _period().
scrap(qry_test, lang='ko')

INFO: queries: ['#allsaints OR #올세인츠 since:2010-01-01 until:2010-06-15', '#allsaints OR #올세인츠 since:2010-06-15 until:2010-11-28', '#allsaints OR #올세인츠 since:2010-11-28 until:2011-05-13', '#allsaints OR #올세인츠 since:2011-05-13 until:2011-10-26', '#allsaints OR #올세인츠 since:2011-10-26 until:2012-04-09', '#allsaints OR #올세인츠 since:2012-04-09 until:2012-09-22', '#allsaints OR #올세인츠 since:2012-09-22 until:2013-03-07', '#allsaints OR #올세인츠 since:2013-03-07 until:2013-08-20', '#allsaints OR #올세인츠 since:2013-08-20 until:2014-02-02', '#allsaints OR #올세인츠 since:2014-02-02 until:2014-07-18', '#allsaints OR #올세인츠 since:2014-07-18 until:2014-12-31', '#allsaints OR #올세인츠 since:2014-12-31 until:2015-06-15', '#allsaints OR #올세인츠 since:2015-06-15 until:2015-11-28', '#allsaints OR #올세인츠 since:2015-11-28 until:2016-05-12', '#allsaints OR #올세인츠 since:2016-05-12 until:2016-10-25', '#allsaints OR #올세인츠 since:2016-10-25 until:2017-04-09', '#allsaints OR #올세인츠 since:2017-04-09 until:2017-09-22', '#allsaints OR 

Wall time: 54.3 s


In [4]:
def preproc(text, url=True, mention=True, hashtag=True, remove=True):
    """Lower-case a tweet and extract its URLs, hashtags and mentions.

    Parameters
    ----------
    text : str
        Raw tweet text.
    url, mention, hashtag : bool
        Whether to report the URLs / mentions / hashtags found in the text.
    remove : bool
        Whether to strip URLs, hashtags, mentions and emoji from the
        returned text. This does not depend on the three flags above.

    Returns
    -------
    dict
        ``'text'`` is always present (lower-cased, stripped). ``'urls'``,
        ``'hashtags'`` and ``'mention'`` are lists that are present only when
        the matching flag is true; hashtags and mentions are returned without
        their leading ``#`` / ``@``.
    """
    LINEBREAK = r'[\r\n]+'  # a regex is required here: str.replace would look for a literal backslash + n
    RT = ' rt '
    EMOJI = r'[\U00010000-\U0010ffff]'

    # The ranges \u3131-\u3163 and \uac00-\ud7a3 are Hangul characters; they are
    # excluded so that Korean text glued to a link is not taken as part of it.
    URL = r'(?:https?:\/\/)?(?:www[.])?[^ :\u3131-\u3163\uac00-\ud7a3]+[.][a-z]{2,6}\b(?:[^ \u3131-\u3163\uac00-\ud7a3]*)'
    HASHTAG = r'(#[^ #@]+)'
    MENTION = r'(@[^ #@]+)'

    # Line breaks and ' rt ' are handled by the normalisation step below, so
    # the removal pattern only needs the remaining four token kinds.
    PTNS = '|'.join((URL, HASHTAG, MENTION, EMOJI))

    out = {}
    # Substitute a space, not an empty string: deleting the separator fused the
    # neighbouring words (e.g. '#tag\nword' became the single hashtag 'tagword',
    # and a link followed by a line break swallowed the next word).
    text = re.sub('|'.join((LINEBREAK, RT)), ' ', text.lower())

    if url:
        found = re.findall(URL, text)
        out['urls'] = [link.replace('\xa0', '') for link in found]

    if hashtag:
        out['hashtags'] = [tag[1:] for tag in re.findall(HASHTAG, text)]

    if mention:
        out['mention'] = [name[1:] for name in re.findall(MENTION, text)]

    if remove:
        text = re.sub(PTNS, '', text)

    out['text'] = text.strip()
    return out

In [5]:
# Spot-check preproc() on one stored tweet (position 210 of the allsaints pickle).
# NOTE(review): the dates in the file name are presumably the ones scrap() resolved
# when it was run, so this path changes with the scraping date - confirm.
preproc(pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.02).pkl').text.iloc[210], remove=True)

{'urls': ['http://instagram.com/p/u2l5sqxtlq/'],
 'hashtags': ['november',
  'allsaints',
  'autumn',
  'sky',
  'skyporn',
  'rooftop',
  'sunset',
  'skyscape',
  'vsco',
  'vscocam…'],
 'mention': [],
 'text': '11월. be good.'}

In [35]:
def tokenize(file):
    """Tokenize every tweet stored in a pickled DataFrame.

    Parameters
    ----------
    file : str
        Path of a pickle holding a DataFrame with ``id`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
        The ``id`` column plus a ``tokenized`` column holding one list of
        tokens per tweet.
    """
    frame = pd.read_pickle(file)
    okt = Okt()
    skipped_tags = ['Punctuation', 'Josa']

    def _tokens(doc):
        # The cleaned text and the extracted hashtags are tagged together;
        # tokens carrying a skipped tag are dropped.
        parts = preproc(doc)
        sentence = parts['text'] + ' ' + ' '.join(parts['hashtags'])
        tagged = okt.pos(sentence, norm=True, stem=True)
        return [pair[0] for pair in tagged if pair[1] not in skipped_tags]

    token_lists = [_tokens(doc) for doc in tqdm_notebook(frame.text.iloc[:])]

    out = frame[['id']].copy()
    out['tokenized'] = token_lists
    return out

In [70]:
%%time
# Time the single-process tokenizer on the stored allsaints tweets.
tokenize('twitter_data/allsaints (2010.01.01-2019.02.02).pkl')

HBox(children=(IntProgress(value=0, max=343), HTML(value='')))

Wall time: 2.08 s


Unnamed: 0,id,tokenized
0,208596651231817729,"[멋지다, 이, 와중, 작동, 하나, 싶다, sewing, machines, all..."
1,965872025007984640,"[패션, 업계, 상아탑, 갇히다, 듯, 계절, 맞다, 않다, 올세인츠]"
2,947456821966655488,"[올세인츠, 50%, 세, 일, 품목, 추가, 20%, 진행, 중, 올세인츠, 올세..."
3,929542860700991489,"[햄볶, 날, allsaints, 올세인츠]"
4,925961224528920577,"[올세인츠, 겨울, 홀리데이, 캠페인, come, together, 공개, 올세인츠]"
5,919105381975990272,"[pop, up, store, at, urbansource, official, 어,..."
6,919096932068237312,"[pop, up, store, at, urbansource, official, 어,..."
7,667176714015936512,"[cosmo, promotion, 쇼핑, 전, 를, 떠올리다, 코스모, 인스타, 진..."
8,663269064832217093,"[november, lookbook, 와이드, 레더, 피코트, 및, 11월, 룩북,..."
9,663268404418080768,"[november, lookbook, 조이, 데님, 시, 얼링, 재킷, 및, 11월..."


In [74]:
# Module-level tagger shared by tokenize2().
tagger = Okt()

def tokenize2(docs):
    """Tokenize a batch of tweet texts and pair the tokens with tweet ids.

    Parameters
    ----------
    docs : pandas.Series or iterable of str
        Tweet texts. When a Series is given (for example a slice of
        ``df.text``), its index selects the matching rows of the module-level
        ``df``; any other iterable must contain one text per row of ``df``.

    Returns
    -------
    pandas.DataFrame
        ``id`` and ``tokenized`` columns, one row per input text.

    Notes
    -----
    The ids are read from the module-level ``df``, which must be the tweet
    frame (``id`` and ``text`` columns) at call time.
    """
    tokenized = []

    for doc in docs:
        preprocessed = preproc(doc)
        text, hashtags = preprocessed['text'], preprocessed['hashtags']
        text += ' ' + ' '.join(hashtags)
        _tokenized = [t[0] for t in tagger.pos(text, norm=True, stem=True) if t[1] not in ['Punctuation', 'Josa']]
        tokenized.append(_tokenized)

    # Take only the rows that belong to this batch. Copying every row of `df`
    # made the assignment below fail with a length mismatch whenever `docs`
    # was a slice rather than the whole text column.
    if isinstance(docs, pd.Series):
        out = df.loc[docs.index, ['id']].copy()
    else:
        out = df[['id']].copy()

    out['tokenized'] = tokenized
    return out

In [82]:
# Split the text column into seven hard-coded chunks (six slices of 50, then the
# remainder) that are handed to the worker pool via pool.map(tokenize2, dfs).
# `df` must be the tweet frame (the one with a `text` column) when this cell runs.
dfs = df.text[:50], df.text[50:100], df.text[100:150], df.text[150:200], df.text[200:250], df.text[250:300], df.text[300:]

In [None]:
# Tokenize the chunks in four worker processes. The context manager tears the
# pool down once map() has returned, so no worker processes are left behind in
# the kernel (the pool was previously never closed).
# NOTE(review): worker processes must be able to import the mapped function; with
# the "spawn" start method a function defined in a notebook cell may not be
# reachable from the workers - confirm on the target platform.
with Pool(processes=4) as pool:
    chunk_results = pool.map(tokenize2, dfs)
chunk_results

In [61]:
tagger = Okt()
def tokenize_vec(doc):
    """Tokenize one tweet and return its tokens as a comma-separated string.

    The cleaned text and the hashtags extracted by ``preproc`` are tagged
    together with the module-level ``tagger``; tokens tagged 'Punctuation' or
    'Josa' are left out.
    """
    parts = preproc(doc)
    sentence = parts['text'] + ' ' + ' '.join(parts['hashtags'])

    kept = []
    for pair in tagger.pos(sentence, norm=True, stem=True):
        if pair[1] in ['Punctuation', 'Josa']:
            continue
        kept.append(pair[0])
    return ','.join(kept)

In [57]:
# Load the stored allsaints tweets. This rebinds `df`, the same name the keyword
# cell uses for the table read from keywords.xlsx, so that cell must be re-run
# before the queries can be rebuilt.
df = pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.02).pkl')
# Try the single-document tokenizer on the tweet with index label 0.
tokenize_vec(df.text[0])

'멋지다'

In [62]:
# Wrap the single-document tokenizer so it can be applied to a whole array.
# np.vectorize is documented as a convenience wrapper that is essentially a
# Python-level loop, so this is not expected to make tokenization faster.
foo = np.vectorize(tokenize_vec)

In [72]:
%%time
# Apply the vectorized tokenizer to every tweet text; the result is an array of
# comma-separated token strings, one per tweet.
foo(df.text.values)

Wall time: 4.33 s


array(['멋지다,이,와중,작동,하나,싶다,sewing,machines,allsaints,london,windowdisplay',
       '패션,업계,상아탑,갇히다,듯,계절,맞다,않다,올세인츠',
       '올세인츠,50%,세,일,품목,추가,20%,진행,중,올세인츠,올세인츠,세,일,프렌켓,올세인츠,가죽,재킷,올세인츠,가방,올세인츠,가죽,자켓,올세인츠,무스,탕',
       '햄볶,날,allsaints,올세인츠', '올세인츠,겨울,홀리데이,캠페인,come,together,공개,올세인츠',
       'pop,up,store,at,urbansource,official,어,반,소스,allsaints',
       'pop,up,store,at,urbansource,official,어,반,소스,allsaints',
       'cosmo,promotion,쇼핑,전,를,떠올리다,코스모,인스타,진행,중인,아우,터,룩,투표,이벤트,주목,이스트,런던,감성,담다,올세인츠,2015,아우,터,시크,청,크다,올세인츠',
       'november,lookbook,와이드,레더,피코트,및,11월,룩북,지금,바로,확인,해보다,allsaints',
       'november,lookbook,조이,데님,시,얼링,재킷,및,11월,룩북,지금,바로,확인,해보다,allsaints',
       'i,m,just,curious,how,this,candle,change,its,color,정말,놀랍다,candle,allsaints,november',
       'october,lookbook,새롭다,출시,되다,여성,신제품,들,지금,바로,확인,해보다,allsaints',
       '올,세인트,디피,되다,멋있다,가죽,의자,allsaints,leftroad,레프트,로드',
       'savvy,shopper,11,타다,event,선글라스,자다,어울리다,그녀,은,역시,savvy,shopper,신예지,씨,이야기,여주,프리미엄,아울렛,만난,가죽,자켓,allsaints,11,번