In [5]:
import os
import re
import pandas as pd
import asyncio
from functools import partial
import twitterscraper as tws
from IPython.core.debugger import set_trace

In [2]:
df = pd.read_excel('keywords.xlsx')[['item','keyword_eng','keyword_kr','intersection']].set_index('item')
_or = lambda kw: ' OR '.join(['#' + k.strip() for k in kw.split(',')])

df['keyword'] = df.keyword_eng + ', ' + df.keyword_kr
df['qry'] = df.keyword.apply(_or)

intersect_with = ['ootd', 'category', 'fashion']
screener = ' OR '.join(df.loc[intersect_with, 'qry'])
df.loc[df.intersection==True, 'qry'] = '(' + df.qry + ') AND (' + screener + ')'

qry = dict(df.qry)

In [3]:
qry

{'ootd': '#dailylook OR #outfit OR #selfie OR #bestofstreetwear OR #beststreetoutfit OR #ootd OR #outfitoftheday OR #데일리룩 OR #오오티디',
 'category': '#designer OR #streetwear OR #luxury OR #casual OR #sportswear OR #디자이너 OR #길거리패션 OR #럭셔리 OR #캐주얼 OR #캐쥬얼 OR #스포츠웨어 OR #스포츠패션',
 'fashion': '#fashion OR #brand OR #fashionbrand OR #shoe OR #shoes OR #clothes OR #wear OR #apparel OR #패션 OR #브랜드 OR #패션브랜드 OR #옷 OR #의류 OR #슈즈 OR #신발',
 '8seconds': '#8seconds OR #에잇세컨즈',
 'adidas': '#adidas OR #아디다스',
 'adidasoriginal': '#adidasoriginal OR #아디다스오리지널스 OR #아디다스오리지널 OR #아디다스오리지날스 OR #아디다스오리지날',
 'asics': '#asics OR #아식스',
 'balensiaga': '#balensiaga OR #발렌시아가',
 'bally': '#bally OR #발리',
 'beanpole': '#beanpole OR #빈폴',
 'benetton': '#benetton OR #베네통',
 'blackyak': '#blackyak OR #블랙야크',
 'buckaroo': '#buckaroo OR #버커루',
 'burberry': '#burberry OR #버버리',
 'calvinklein': '#calvinklein OR #케빈클라인 OR #켈빈클라인 OR #캘빈클라인',
 'canadagoose': '#canadagoose OR #캐나다구스',
 'chanel': '#chanel OR #샤넬',
 'coach': '#co

In [6]:
def _period(start=None, end=None):
    if start is None: start = '2010-01-01'
    if end is None: end = pd.Timestamp.today()
    
    return pd.Timestamp(start).date(), pd.Timestamp(end).date()

In [7]:
def scrap(qry_dict, what=['id','text'], lang=None, start=None, end=None, download_to='twitter_data'):
    start, end = _period(start=start, end=end)
    filename = download_to + '/{item} (' + str(start).replace('-','.') + '-' + str(end).replace('-','.') + ').pkl'
    
    if not os.path.exists(download_to):
        os.makedirs(download_to)    
    
    
    def _pretty(queried):
        return [{k:v for k,v in qrd.__dict__.items() if k in what} for qrd in queried]
    
    
    async def _scrap(item, q):
        q_tw_partial = partial(tws.query_tweets, lang=lang, begindate=start, enddate=end)
        queried = await loop.run_in_executor(None, q_tw_partial, q)
        res = _pretty(queried)
        
        df = pd.DataFrame(res)
        df.to_pickle(filename.format(item=item))
    
    
    async def main():
        fts = [asyncio.ensure_future(_scrap(item, q)) for item, q in qry_dict.items()]
        await asyncio.gather(*fts)
    
    
    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()

In [7]:
qry_test = {'aldo':qry['aldo'], 'allsaints':qry['allsaints']}

In [8]:
%%time
scrap(qry_test, lang='ko')

INFO: queries: ['#allsaints OR #올세인츠 since:2010-01-01 until:2010-06-15', '#allsaints OR #올세인츠 since:2010-06-15 until:2010-11-28', '#allsaints OR #올세인츠 since:2010-11-28 until:2011-05-13', '#allsaints OR #올세인츠 since:2011-05-13 until:2011-10-26', '#allsaints OR #올세인츠 since:2011-10-26 until:2012-04-09', '#allsaints OR #올세인츠 since:2012-04-09 until:2012-09-22', '#allsaints OR #올세인츠 since:2012-09-22 until:2013-03-07', '#allsaints OR #올세인츠 since:2013-03-07 until:2013-08-20', '#allsaints OR #올세인츠 since:2013-08-20 until:2014-02-02', '#allsaints OR #올세인츠 since:2014-02-02 until:2014-07-18', '#allsaints OR #올세인츠 since:2014-07-18 until:2014-12-31', '#allsaints OR #올세인츠 since:2014-12-31 until:2015-06-15', '#allsaints OR #올세인츠 since:2015-06-15 until:2015-11-28', '#allsaints OR #올세인츠 since:2015-11-28 until:2016-05-12', '#allsaints OR #올세인츠 since:2016-05-12 until:2016-10-25', '#allsaints OR #올세인츠 since:2016-10-25 until:2017-04-09', '#allsaints OR #올세인츠 since:2017-04-09 until:2017-09-22', '#allsaints OR 

Wall time: 54.3 s


In [15]:
pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.02).pkl').text.iloc[4]

"올세인츠, 겨울 홀리데이 캠페인 'Come Together' 공개  http://www.fashionn.com/board/read_new.php?table=1006&number=22448&sns_number=1098515\xa0… #올세인츠 pic.twitter.com/J4gUbehMN1"

In [17]:
pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.02).pkl').text.iloc[0]

'멋지다. 이 와중에 작동은 하나? 싶고.. RT @foreverkimchi: Sewing machines #allsaints #london #windowdisplay http://instagr.am/p/LVnuYWtXjh/\xa0'

In [84]:
pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.02).pkl').text.iloc[7]

'<COSMO PROMOTION> #아우터 쇼핑 전 #시크 #청크 를 떠올렸다면, \n코스모 인스타에서 진행 중인 #올세인츠 아우터 룩 투표 이벤트에 주목! \n이스트 런던의 감성을 담은 올세인츠의 2015... http://fb.me/5shv0dqlm\xa0'

In [85]:
re.sub(r'\n', '', pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.02).pkl').text.iloc[7])

'<COSMO PROMOTION> #아우터 쇼핑 전 #시크 #청크 를 떠올렸다면, 코스모 인스타에서 진행 중인 #올세인츠 아우터 룩 투표 이벤트에 주목! 이스트 런던의 감성을 담은 올세인츠의 2015... http://fb.me/5shv0dqlm\xa0'

In [306]:
def preproc(text, url=True, mention=True, hashtag=True, remove=True):
    LINEBREAK = r'\n' # str.replace에서는 r'\n'으로 검색이 안된다
    RT = ' rt '
    #URL = r'(?P<url>(https?://)?(www[.])?[^ \u3131-\u3163\uac00-\ud7a3]+[.][a-z]{2,6}\b([^ \u3131-\u3163\uac00-\ud7a3]*))'
    URL = r'(?:https?:\/\/)?(?:www[.])?[^ :\u3131-\u3163\uac00-\ud7a3]+[.][a-z]{2,6}\b(?:[^ \u3131-\u3163\uac00-\ud7a3]*)'
    # \u3131-\u3163\uac00-\ud7a3 는 한글을 의미함
    HASHTAG = r'(#[^ #@]+)'
    MENTION = r'(@[^ #@]+)' 
    
    PTNS = '|'.join((LINEBREAK, RT, URL, HASHTAG, MENTION))
    
    text = text.lower()
    out = {}
    
    #def _remove(text, *ptns):
    #    return re.sub('|'.join(ptns), '', text)#.strip()
        
    #def _remove_url(text, *urls):
    #    for url in urls: text = text.replace(url, '')
    #    return text
    
    text = re.sub('|'.join((LINEBREAK,RT)), '', text)
    
    if url:
        urls = re.findall(URL, text) #[m.groupdict()['url'] for m in re.finditer(URL, text)]
        urls = [url.replace('\xa0', '') for url in urls]
        out['urls'] = urls
        
    if hashtag:
        hashtags = re.findall(HASHTAG, text)
        out['hashtags'] = [h[1:] for h in hashtags]
        
    if mention:
        mentions = re.findall(MENTION, text)
        out['mention'] = [m[1:] for m in mentions]
        
    if remove:    
        text = re.sub(PTNS, '', text)
        
    out['text'] = text.strip()
    return out

In [337]:
preproc(pd.read_pickle('twitter_data/allsaints (2010.01.01-2019.02.02).pkl').text.iloc[210], remove=True)

{'urls': ['http://instagram.com/p/u2l5sqxtlq/'],
 'hashtags': ['november',
  'allsaints',
  'autumn',
  'sky',
  'skyporn',
  'rooftop',
  'sunset',
  'skyscape',
  'vsco',
  'vscocam…'],
 'mention': [],
 'text': '11월. be good.'}