In [1]:
import os

# data 필요!
tickers = []
for file in os.listdir("data"):
    ticker, suffix = file.split(".")
    size = os.path.getsize(f'data/{file}')
    if ticker and suffix == "csv":
        tickers.append((ticker, size/1024/1024))
tickers.sort()
tickers[:10]

[('000020', 0.7889432907104492),
 ('000030', 1.8329582214355469),
 ('000040', 0.32431793212890625),
 ('000050', 0.169891357421875),
 ('000060', 1.3198652267456055),
 ('000070', 0.25897789001464844),
 ('000080', 1.709299087524414),
 ('000100', 1.685617446899414),
 ('000120', 1.8949947357177734),
 ('000140', 0.13136768341064453)]

In [2]:
from typing import List, Tuple

def evaluate(data: List[Tuple[str, int]], chunk_size=3):
    s = len(data) // chunk_size
    sizes = []
    for i in range(chunk_size):
        sizes.append(sum(size for _, size in data[s * i: s * (i + 1)]))
    avg = sum(sizes)/chunk_size
    mse = sum((size - avg) ** 2 for size in sizes)/ chunk_size
    return sizes, mse

## Rowkey Design

data의 용량을 고르게 분산

## default

In [33]:
sizes, mse = evaluate(tickers)
print(sizes)
print(f'mse: {mse:.2f}')

[397.2962188720703, 305.57104206085205, 266.82598304748535]
mse: 2993.02


## reverse ticker

In [34]:
reverse_ticker = sorted([(ticker[::-1], size) for ticker, size in tickers])
sizes, mse = evaluate(reverse_ticker)
print(sizes)
print(f'mse: {mse:.2f}')

[319.2034454345703, 329.9751272201538, 320.5146713256836]
mse: 23.03


## Input Order

처음에 기존의 큰 데이터를 넣어줄때만 유의미!

region server의 부하를 고르게 분산


In [35]:

def overheads(data: list):
    regions = [[]]
    max_size = 500
    jobs = deque()

    while data:
        row = data.popleft()
        for idx in range(len(regions)):
            region = regions[idx]
            if not region:
                region.append(row)
                region.sort()
                jobs.append(idx)
                break
            if row < region[-1]:
                region.append(row)
                region.sort()
                jobs.append(idx)
                if len(region) == max_size:
                    new_region = []
                    while len(region) > max_size // 2:
                        new_region.append(region.pop())
                    regions.append(new_region[::-1])
                break
        else:
            region = regions[-1]
            region.append(row)
            region.sort()
            jobs.append(len(regions) - 1)
            if len(region) == max_size:
                new_region = []
                while len(region) > max_size // 2:
                    new_region.append(region.pop())
                regions.append(new_region[::-1])
                
    idx = 0
    repeat = 0
    repeats = []
    while jobs:
        job = jobs.popleft()
        if idx == job:
            repeat += 1
            continue
        else:
            repeats.append(repeat)
            repeat = 1
            idx = job
    return repeats

## default

In [36]:
reverse_ticker = sorted([(ticker[::-1], size) for ticker, size in tickers])
overheads(deque(reverse_ticker))

[500, 250, 250, 250, 250, 250, 250]

## random

In [37]:
import random

copied = reverse_ticker[::]
random.shuffle(copied)

In [38]:
from collections import Counter

Counter(overheads(deque(copied)))

Counter({500: 1,
         2: 209,
         4: 36,
         3: 95,
         1: 639,
         5: 12,
         8: 1,
         10: 2,
         7: 3,
         6: 6,
         16: 1,
         9: 1})

## hbase


### key design

#### reverse_code:date:hashed_title
    예시) 005930, 2018-07-13, 어쩌구저쩌구 -> b'03950020180713151cc02d351112665700a298daf6b4fe'
    
    특정 종목의 특정 날짜 범위의 모든 뉴스를 가져올 때 유리

In [39]:
from hashlib import md5

md5('어쩌구저쩌구'.encode()).hexdigest()

'151cc02d351112665700a298daf6b4fe'

In [95]:
# hbase.rpc.timeout                   60 sec
# hbase.hregion.memstore.flush.size   128mb
# hbase.hregion.max.filesize          10gb

In [200]:
256*1024*1024

268435456

In [6]:
import happybase

connection = happybase.Connection('localhost', port=9090)
connection.open()


### make table and get handler

In [62]:
# 테이블 생성
table_name = 'tickers'
if bytes(table_name, "utf-8") not in connection.tables():
    connection.create_table(table_name, {'cf': dict()})
connection.tables()

# 'my_table'에 대한 핸들을 얻음
table = connection.table(table_name)

### upload data

In [10]:
from hashlib import md5
import os, csv, random
from tqdm import tqdm
from time import time

tickers = []
for file in os.listdir("data"):
    ticker, suffix = file.split(".")
    if ticker and suffix == "csv":
        tickers.append(ticker)
random.shuffle(tickers)

s = time()
with table.batch(batch_size=1000) as b:
    for ticker in tqdm(tickers):
        with open(f'data/{ticker}.csv', 'r') as csv_file:
            reader = csv.reader(csv_file)
            for i, row in enumerate(reader):
                if i:
                    source, symbol, link, date, title, content = row
                    rowkey = f'{symbol[::-1]}:{date[:10].replace("-","")}:{md5(title.encode()).hexdigest()}'
                    b.put(rowkey, {'cf:title': title, 'cf:content': content})
print(time() - s, 'sec') # 45 sec

100%|███████████████████████████████████████| 2160/2160 [00:52<00:00, 41.24it/s]

52.4082407951355 sec





### make table and get handler for processed data

In [63]:
# 테이블 생성
table_name = 'tickers_processed'
if bytes(table_name, "utf-8") not in connection.tables():
    connection.create_table(table_name, {'cf': dict()})
connection.tables()

# 'my_table'에 대한 핸들을 얻음
table = connection.table(table_name)

In [12]:
from hashlib import md5
import os, csv, random
from tqdm import tqdm
from time import time

tickers = []
for file in os.listdir("data"):
    ticker, suffix = file.split(".")
    if ticker and suffix == "csv":
        tickers.append(ticker)
random.shuffle(tickers)

s = time()
with table.batch(batch_size=1000) as b:
    for ticker in tqdm(tickers):
        with open(f'data/{ticker}.csv', 'r') as csv_file:
            reader = csv.reader(csv_file)
            for i, row in enumerate(reader):
                if i:
                    source, symbol, link, date, title, content = row
                    title, content = preprocess_words(title), preprocess_words(content)
                    rowkey = f'{symbol[::-1]}:{date[:10].replace("-","")}:{md5(title.encode()).hexdigest()}'
                    b.put(rowkey, {'cf:title': title, 'cf:content': content})
print(time() - s, 'sec') # 45 sec

100%|███████████████████████████████████████| 2160/2160 [19:42<00:00,  1.83it/s]

1182.1367540359497 sec





### read data

In [64]:
connection = happybase.Connection('localhost', port=9090)
connection.open()

table_name = 'tickers' # 'tickers_processed'
table = connection.table(table_name)
n = 10
for key, data in table.scan(row_start='039500:2018', row_stop='039500:2019', columns=['cf:title', 'cf:content']):
    #print(key, data)  # key와 data를 출력
    print(data[b'cf:title'].decode())
    #print(data[b'cf:content'].decode())
    n -= 1
    if not n:
        break

### delete row

In [None]:
table.delete(b'row_key')

### delete table

In [60]:
# 테이블을 비활성화합니다.
connection.disable_table('tickers')

# 테이블을 삭제합니다.
connection.delete_table('tickers')

### check for airflow

In [53]:
t1, t2 = connection.table('tickers'), connection.table('tickers_processed')
for key in keys:
    t1.delete(key)
    t2.delete(key)

In [14]:
connection = happybase.Connection('localhost', port=9090)
connection.open()

["005930", "035420"]

keys = []
table_name = 'tickers' # 'tickers_processed'
ticker = '035420'
table = connection.table(table_name)
n = 0
for key, data in table.scan(row_start=f'{ticker[::-1]}:2023', row_stop=f'{ticker[::-1]}:20231', columns=['cf:title', 'cf:content']):
    #print(key, data)  # key와 data를 출력
    print(key)
    print(data[b'cf:title'].decode())
    # print(data[b'cf:content'].decode())
    n += 1
    print()
    keys.append(key)
print(n, 'articles')
print('-' * 100)

n = 0
table_name = 'tickers_processed' # 'tickers_processed'
table = connection.table(table_name)
for key, data in table.scan(row_start=f'{ticker[::-1]}:2023', row_stop=f'{ticker[::-1]}:20231', columns=['cf:title', 'cf:content']):
    #print(key, data)  # key와 data를 출력
    print(key)
    print(data[b'cf:title'].decode())
    #print(data[b'cf:content'].decode())
    n += 1
    print()
    keys.append(key)
print(n, 'articles')

b'024530:20230620:2d7b7eeaaee56a55e57b3c1d3aab3b59'
네이버 AI 챗봇 서비스명 '큐:'…내달 중 공개

b'024530:20230620:a54142d8eed28429bc49c174e8f0f014'
[미리보는 이데일리 신문] “AI인력 스카우트 멈춰”…네이버, SKT에 경고장

b'024530:20230620:eae8024798aec003571f14a437593a10'
"AI 스카우트 그만!" 네이버, SKT 상대로 내용증명

b'024530:20230620:f71dfcde83b0d27bbe1c6a81661f672f'
신세계發 멤버십 전쟁…쿠팡·네이버, 강력한 도전자를 만나다

b'024530:20230622:25e500d3ec1a3db0bbbae7518b5b785f'
"AI 편향적 발언 차단 지원" 네이버, 학습 데이터셋 전면 공개

b'024530:20230622:62438cd2d1d2e9bca7bf35edb36f2b04'
“ ‘큐:’ 확정한 NAVER, 3Q 두자릿수 이익 성장률 회복…성장 주도주 귀환 가능” [투자360]

b'024530:20230622:84c7b63a4d4f5e8d0e4eb8100c9f5825'
[단독] 서치GPT 말고 더 있다…반격 준비하는 네이버

b'024530:20230622:a83bdab3e469e234fbceee7fae4a1359'
세계가 주목할 때 드라이브…네이버웹툰, 국제 행사서 존재감 ‘뿜뿜’

b'024530:20230622:b05349d1310ab3a5eb760f07f4659106'
“AI로 주가조작 전방위 포착”…네이버 만나는 금감원

b'024530:20230623:55f594da25d72ce453be374b86082d68'
"심리 상담받고 싶다면 네이버 인물정보로 찾아보세요"

b'024530:20230623:858ec281719ab1c5bddafd98b0c199e7'
오션뷰 숙소 가고픈데…네이버 검색으로 쉽게 찾는다

b'024530:20230623:863d1a2a96