In [1]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests
import urllib3
from requests.auth import HTTPBasicAuth

# Requests to the cluster are made with `verify=False`; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# NOTE(review): this ignores the whole UserWarning category, which is broader
# than urllib3's own warnings -- confirm that is intended.
urllib3.disable_warnings(UserWarning)

plt.rcParams.update({'font.size': 15})
plt.figure(figsize=(20, 20))

# Use the fully qualified option name: the bare 'precision' key is ambiguous
# (or removed) in current pandas releases and raises OptionError there.
pd.set_option('display.precision', 0)
# Render every float as a thousands-separated integer, e.g. 1,234,567.
pd.options.display.float_format = '{:,.0f}'.format

In [2]:
class CorpusSizeUtils(object):
    """Collect per-index document counts from Elasticsearch and report them.

    Index names are parsed as ``crawler-<site>-<category...>-<year>``. Counts
    are grouped by site, category and year and can be rendered as HTML tables
    or horizontal bar charts.
    """

    # Tail of a `_cat/indices` row: "crawler-<name>   <docs.count>".
    _ROW_PATTERN = re.compile(r'crawler-(.+?)\s+(\d+)$')

    # Fragments rewritten before parsing so that a hyphen inside a category
    # name is not treated as a field separator.
    _ALIAS = {
        'sports-reply': 'sports_reply'
    }

    def __init__(self):
        pass

    @staticmethod
    def get_index_size(site='https://nlp.ncsoft.com:9200', auth=None,
                       timeout=30, session=None):
        """Fetch the document count of every crawler index.

        Args:
            site: Base URL of the Elasticsearch cluster.
            auth: Optional ``(user, password)`` pair for HTTP basic auth. When
                omitted, the ``ES_USER`` / ``ES_PASSWORD`` environment
                variables are used, falling back to the legacy built-in
                account.
            timeout: Seconds to wait for the HTTP response.
            session: Optional object exposing a ``requests``-style ``get``
                method (for example a ``requests.Session``). The ``requests``
                module itself is used when omitted.

        Returns:
            dict: ``{'total': int, <site>: {<category>: {<year>: int}},
            'etc': {'<site>-<category>': int}}``. ``'etc'`` is only present
            when at least one index name carries no trailing year.

        Raises:
            An HTTP error from ``raise_for_status`` when the cluster answers
            with an error status.
        """
        if auth is None:
            # NOTE(review): the literal fallback keeps existing notebooks
            # running; rotate this account and drop the fallback once the
            # environment variables are set everywhere.
            auth = (
                os.environ.get('ES_USER', 'elastic'),
                os.environ.get('ES_PASSWORD', 'nlplab'),
            )

        url = '{site}/_cat/indices?v&s=index&h=index,docs.count'.format(
            site=site.rstrip('/'))

        http = requests if session is None else session
        resp = http.get(
            url=url,
            auth=auth,
            verify=False,
            timeout=timeout,
        )
        # Fail loudly here instead of trying to parse an error body as rows.
        resp.raise_for_status()

        return CorpusSizeUtils._parse_cat_indices(resp.text)

    @staticmethod
    def _parse_cat_indices(text):
        """Turn the text table of `_cat/indices?v` into the nested count dict."""
        index = {'total': 0}

        # The first row is the column header produced by the `v` flag.
        for raw in text.split('\n')[1:]:
            line = raw.rstrip()

            # Skip blank rows and dot-prefixed indices.
            if not line or line.startswith('.'):
                continue

            for src, dst in CorpusSizeUtils._ALIAS.items():
                line = line.replace(src, dst)

            # Split "<name> <count>"; rows that are not crawler indices are
            # ignored instead of aborting the whole report.
            match = CorpusSizeUtils._ROW_PATTERN.search(line)
            if match is None:
                continue

            i_name = line[:match.start()] + match.group(1)
            count = int(match.group(2))

            if count == 0:
                continue

            # Index name layout: site - category - year
            parts = i_name.split('-')

            # A trailing all-digit token is the year; keep at least one token
            # so the site name is never empty.
            year = ''
            if len(parts) > 1 and parts[-1].isdecimal():
                year = parts.pop()

            site_name = parts[0]
            category = '_'.join(parts[1:])

            index['total'] += count

            # The site/category entry is created even for year-less indices;
            # it then stays empty and appears as an all-zero column later on.
            by_year = index.setdefault(site_name, {}).setdefault(category, {})

            if year != '':
                by_year[year] = by_year.get(year, 0) + count
            else:
                etc_key = '-'.join([site_name, category])
                etc = index.setdefault('etc', {})
                etc[etc_key] = etc.get(etc_key, 0) + count

        return index

    @staticmethod
    def add_sum_column(data):
        """Return a copy of `data` with a '합계' (sum) column and row appended.

        Args:
            data: Anything accepted by ``pd.DataFrame``; missing cells are
                treated as 0.

        Returns:
            pd.DataFrame: New frame whose index is named ``'year'``. The input
            object is left untouched.
        """
        df = pd.DataFrame(data).fillna(0)

        df['합계'] = df.sum(axis=1)
        df.loc['합계'] = df.sum(axis=0)

        # rename_axis builds a new index instead of renaming one that may be
        # shared with the caller's frame.
        return df.rename_axis('year')

    def get_index_size_df(self):
        """Fetch the index sizes and convert each group into a DataFrame.

        Returns:
            dict: ``'total'`` maps to the overall count; ``'etc'`` maps to a
            single-column (``'count'``) float frame; every other key maps to a
            float frame of categories by year, sorted by year.
        """
        index_size = self.get_index_size()

        frames = {'total': index_size['total']}
        for key, value in index_size.items():
            if key == 'total':
                continue

            if key == 'etc':
                frames[key] = pd.DataFrame(
                    {'count': value}).fillna(0).astype('float')
                continue

            frame = pd.DataFrame(value).fillna(0).astype('float')
            frames[key] = frame.rename_axis('year').sort_index()

        return frames

    def get_report(self, size_df):
        """Display the overall total and one HTML table per group.

        Args:
            size_df: Mapping as returned by ``get_index_size_df``.
        """
        from IPython.display import display, HTML

        html = '<h1>전체 수량: {total:,.0f}</h1>'.format(total=size_df['total'])
        display(HTML(html))

        for k in size_df:
            if k == 'total':
                continue

            df = self.add_sum_column(size_df[k])

            html = '<style>.dataframe td { text-align: right; } .dataframe th { text-align: center; }</style>'
            html += '<h1>{title}: {total:,.0f}</h1>'.format(
                title=k,
                # Bottom-right cell = grand total of the group.
                total=df.at['합계', '합계'],
            )
            html += df.to_html(index_names=False)
            display(HTML(html))

        return

    def get_report2(self, size_df):
        """Draw one horizontal bar chart per group.

        Args:
            size_df: Mapping as returned by ``get_index_size_df``.
        """
        for k in size_df:
            if k == 'total':
                continue

            size_df[k].plot(kind='barh', figsize=(15, 10))

        return

In [3]:
# Query the cluster once; `utils` and `size_df` are reused by the cells below.
utils = CorpusSizeUtils()
size_df = utils.get_index_size_df()

# Show the group keys that were found and the overall document count.
size_df.keys(), '{:,}'.format(size_df['total'])

(dict_keys(['total', 'bbs', 'daum', 'jisikman', 'nate', 'naver', 'etc']),
 '180,706,471')

In [4]:
# Write the 'naver' table, with its sum row and column added and then
# transposed, to naver.xlsx in the working directory.
utils.add_sum_column(size_df['naver']).T.to_excel('naver.xlsx')

In [5]:
# Render the overall total followed by one HTML table per group.
utils.get_report(size_df=size_df)

Unnamed: 0,inven,lineagem,mlbpark_bullpen,mlbpark_kbo,mlbpark_mlb,ruliweb,합계
2000,0,0,0,0,0,2,2
2003,0,0,0,0,0,10260,10260
2004,0,0,0,0,0,24385,24385
2005,0,0,0,0,0,17605,17605
2006,0,0,0,0,0,13311,13311
2007,0,0,0,0,0,10474,10474
2008,0,0,0,0,0,12139,12139
2009,0,0,0,0,0,9172,9172
2010,0,0,0,0,0,8953,8953
2011,0,0,0,103250,0,5679,108929


Unnamed: 0,culture,economy,international,it,opinion,politics,society,sports,합계
2001,385,0,10842,350,0,4705,0,0,16282
2002,3343,26282,31897,9903,0,14360,16787,0,102572
2003,1551,48715,49197,22321,0,23507,69363,0,214654
2004,23917,53853,42164,20034,0,32794,224841,0,397603
2005,41096,57435,48635,37207,0,53125,452793,0,690291
2006,44075,57170,53478,45037,0,54233,724673,0,978666
2007,52446,428927,59860,45828,0,55350,613518,0,1255929
2008,51968,625506,59276,47121,0,56266,669768,0,1509905
2009,51967,840584,59142,50501,0,57451,832287,0,1891932
2010,48722,845377,57702,52975,0,58469,729388,0,1792633


Unnamed: 0,Unnamed: 1,합계
2007,473479,473479
2008,3940787,3940787
2009,4372347,4372347
2010,4153137,4153137
2011,2893529,2893529
2012,1172001,1172001
2013,741273,741273
2014,1192179,1192179
2015,761653,761653
2016,143234,143234


Unnamed: 0,economy,entertainment,international,it,politics,society,sports,합계
2004,0,0,0,0,34589,0,0,34589
2005,298506,43168,61844,47643,37631,225993,26108,740893
2006,387091,59262,88359,68991,43561,285968,37058,970290
2007,470275,151262,75044,89216,74992,493558,73841,1428188
2008,557529,173423,95018,118891,103464,635651,90524,1774500
2009,823297,233038,128252,82539,107824,717095,130461,2222506
2010,1024136,362637,134752,118429,122867,776296,127555,2666672
2011,1285242,515059,180285,139724,168302,917503,172418,3378533
2012,1460373,576967,157940,162287,146091,1165287,231097,3900042
2013,1517049,675518,190841,200299,185061,1516499,291540,4576807


Unnamed: 0,economy,international,international_error,it,kin_detail,living,opinion,politics,society,sports,sports_reply,terms_detail,tv,tv_error,합계
1997,29,25,0,50,0,48,0,0,0,0,0,0,0,0,152
1998,28725,12392,0,3526,0,7345,711,24480,33903,0,0,0,0,0,111082
1999,34039,15824,0,4242,0,8867,562,28806,64972,0,0,0,0,0,157312
2000,132507,29147,0,11717,0,9724,18696,36357,81462,0,0,0,0,0,319610
2001,169841,46820,0,15343,0,17639,26994,44832,102387,0,0,0,0,0,423856
2002,176123,55857,0,58657,0,13760,35243,62881,118225,0,0,0,0,0,520746
2003,580971,86765,0,110666,0,40489,49044,105687,164981,0,0,0,34259,0,1172862
2004,727553,89894,0,112423,0,107851,41435,157740,191538,24853,0,0,69980,0,1523267
2005,696190,108077,0,140354,0,135910,52752,165361,197313,42880,0,0,99288,0,1638125
2006,733966,128364,0,142605,0,182411,50430,167803,196038,63938,0,0,141083,0,1806638


Unnamed: 0,count,합계
naver-international_error,2,2
naver-kin_detail,3342508,3342508
naver-terms_detail,1363306,1363306
naver-tv_error,4,4
합계,4705820,4705820
