In [78]:
%load_ext line_profiler
import os, re
import sys
import glob
import datetime as dt
from pathlib import Path
from dateutil.parser import parse
from subprocess import run

from numexpr import evaluate as ne_eval
import numpy as np
import pandas as pd
import pickle as pkl
import logging

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [11]:
def set_logger(level=logging.INFO, name=__name__):
    logger = logging.getLogger(__class__.__qualname__) or logging.getLogger(name)
    logger.setLevel(level)
    handler = logging.FileHandler('log_file.log')
    formatter = logging.Formatter(
        '%(asctime)s : %(name)s  : %(funcName)s : %(levelname)s : %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger


In [12]:
# PATH = os.path.abspath(r"../data/10X/cleaned/1998/QTR2/") # on home
PATH = os.path.abspath(r"C:\Users\wonhyeong\workings\data\10X\cleaned") # on office
# PATH = os.path.abspath(r"/Users/wonhyeong/workings/data/10X/cleaned")  # on mac

In [118]:
class spider:
    # import modules
    import os
    import datetime as dt
    from pathlib import Path
    from dateutil.parser import parse

    import numpy as np
    import pandas as pd
    from numexpr import evaluate as ne_eval

    def __init__(self, data_path, index_path = 'index.pkl'):
        # self.index.index = ['acc']
        # self.index.cols = ['cik', 'date', 'type', 'name', 'ticker', 'exchange', 'path']
        self.dir = Path(data_path)
        self.index_path = self.dir / index_path
        self.index = pd.read_pickle(self.index_path) if (self.index_path).exists() else None
        self.columns = ['acc', 'cik', 'date', 'type', 'name', 'ticker', 'exchange', 'path']
        return

    def search(self, keywords=None, output='acc'):
        """"""
        # assert is_string keywords.isstr < 2, "list input is not supported currently"
        assert output in self.columns, f"output must be one of {self.columns}"
        if not keywords:
            return os.listdir(self.dir)
        # search by keyword
        key, modifier = self._search_preprocess(str(keywords[0]))
        assert modifier or key, "There's something wrong with your input"
        matrix = []
        if modifier:
            matrix.append([self._get_from_modifier(mod, value)
                          for mod, value in modifier.items()])
        if key:
            matrix.append([self._get_from_key(key, value)
                          for key, value in key.items()])
        idx = np.multiply.reduce(matrix)
        # to boolean array
        idx = np.array([True if i else False for i in idx[0]])
        result = getattr(self.index, output)[idx]
        return result

    def company(self, after=0, before=99999999, std_freq=None, get_period=True, get_count=False):
        """
        get company list
        :param after: after this date
        :param before: before this date
        :param std_freq: standard frequency
        :param get_period: get period in tuple
        :param get_count: get count in tuple
        :return: list(tuple or str)
        """
        after, before = self._date_preprocess(after, before)
        co = self.search(f'after:{after} before:{before}', output='cik').value_counts()
        assert co is not None, "There's no company matching in query"

        zipped = [co.index]
        # get count of filings
        if get_count:
            zipped.append(co.values)
        # get period
        if get_period:
            zipped.append(list(np.full(len(co), '-'.join([str(after), str(before)]))))
        if len(zipped) == 1:
            return zipped[0]
            
        return list(zip(zipped))

    def _search_preprocess(self, keyword):
        # params : string for search
        # return : list of keywords + tags
        key_tags = ['cik', 'date', 'name', 'symbol', 'acc', 'form']
        modifier_tags = ['after', 'before', 'year', 'qtr']
        keyword_list = keyword.split(' ')
        # unknown 인식 및 처리 과정 필요
        unknown = list(filter(lambda x: ':' not in x, keyword_list))
        tag = list(filter(lambda x: ':' in x, keyword_list))
        tag = dict(tuple(x.split(':')) for x in tag) if tag else {}
        key = {k: v for k, v in tag.items() if k in key_tags}
        modifier = {k: v for k, v in tag.items() if k in modifier_tags}
        # 현재 dateparser 적용 안되는 오류
        for k, val in modifier.items():
            val = self._date_preprocess(val)
        modifier = {k: v for k, v in modifier.items() if v}
        return key, modifier

    def _date_preprocess(self, date):
        if len(date) == 4:
            if date.startswith(('19', '20')):
                # 연도만 입력된 경우
                return date+'0101'
            if date <= '1231':
                # 월일만 입력된 경우, 분기조건 수정 필요
                year = str(dt.now().year)
                return year + date
        # try 내에서 return 사용 가능?
        try:
            result = parse(date).strftime('%Y%m%d')
        except:
            return None
        return result

    def _get_from_key(self, key, value):
        col = self.columns[key]
        idx = ne_eval(f'(col == {value})')
        return idx

    def _get_from_modifier(self, modifier, value):
        date = self.columns["date"]
        value = str(value)
        after = "10000000"
        before = "99999999"
        if modifier == 'after':
            after = value
        elif modifier == 'before':
            before = value
        elif modifier == 'year':
            after = value[:4] + '0101'
            before = value[:4] + '1231'
        elif modifier == 'qtr':
            after = value
            before = value[:4] + str(int(value[4:6])+3) + value[6:]
        idx = ne_eval(f'({int(after)} < date) & (date < {int(before)})')
        return idx

    def _date_to_path(self, date):
        date = str(date)
        year = date[:4]
        # month to quarter
        month = int(date[4:6])
        quarter = (month-1) // 3 + 1
        quarter = ''.join(['QTR', str(quarter)])
        path = self.dir / year / quarter
        return path
        
    def _get_index(self):
        """ 전부 수정 필요 """
        f = open(self.dir / 'log.txt', 'r')
        log = [l.strip() for l in f.readlines()]
        log = (l for l in log if l)

        df = pd.DataFrame(log, columns=['path'])
        df['path'] = df['path'].apply(lambda x: x.replace(str(self.dir), ''))
        df['path'] = df['path'].apply(lambda x: x[1:])

        df = df[df['path'].contains(os.sep)]
        df.reset_index(drop=True, inplace=True)

        df['acc'] = df['path'].apply(lambda x: x.split('_')[5].split('.')[0])
        df.set_index('acc', inplace=True)

        self.index = df
        return 

    def _get_dir_index_faster(self):
        """
        lot more faster than os.walk
        """
        # check path is pathlib.Path
        index_path = Path(os.getcwd()) / 'index.txt'
        options = f'/MIR /FP /NC /NS /NDL /NJH /NJS /LOG:index.txt /L'
        cmd = f'robocopy {str(self.path)} NULL {options}'
        # why this is too slow????
        run(['powershell', '-c', cmd])

        f = open(index_path, 'r')
        index = [line.strip() for line in f.readlines()]

        return index

    def _get_dir_tree(self):
        tree = pd.DataFrame(columns=["name", "path", "depth", "files"])
        for root, dirs, files in os.walk(self.dir):
            name = os.path.basename(root)
            this = root.replace(self.dir, "")
            depth = int(this.count(os.sep))
            count = len(files)
            # 숨김 폴더거나, 그 자식 노드일 경우 스킵
            if any(f.startswith(".") for f in this.split(os.sep)):
                print("skip:", this)
                continue
            tree.loc[len(tree)] = [name, this, depth, count]
        return tree
    
    def show_tree(self):
        result = ''
        for row in self.tree.itertuples():
            list = ['\t'*row.depth, row.name, f"\tfiles: {row.count}\n"]
            result.append(''.join(list))
        return result

In [119]:
files = spider(PATH)
files.search('cik:0001337930')

AssertionError: list input is not supported currently

In [85]:
def get_index(dir):
    f = open(dir / 'log.txt', 'r')
    log = [l.strip() for l in f.readlines()]
    log = (l for l in log if l)

    df = pd.DataFrame(log, columns=['path'])
    df['path'] = df['path'].apply(lambda x: x.replace(str(dir), ''))
    df['path'] = df['path'].apply(lambda x: x[1:])

    df = df[df['path'].str.contains(os.sep, regex=False)]
    df.reset_index(drop=True, inplace=True)

    df['acc'] = df['path'].apply(lambda x: x.split('_')[5].split('.')[0])
    df.set_index('acc', inplace=True)
    return df

def get_summary(dir):
    return pd.read_csv(dir / 'summaries.csv')[['CIK', 'FILING_DATE', 'ACC_NUM', 'FORM_TYPE', 'CoName']]

def get_ticker(dir):
    ticker = pd.read_csv(dir / 'cik.csv')
    ticker.drop(ticker.columns[0], axis=1, inplace=True)
    # limit exchange to NYSE or NASDAQ
    # due to duplicated cik are common in OTC, we need to limit exchange first
    ticker.drop(ticker.index[(ticker['exchange'] != 'NYSE') & (
        ticker['exchange'] != 'Nasdaq')], inplace=True)
    # drop all duplicates
    ticker.drop_duplicates(subset='cik', keep='first', inplace=True)
    ticker.drop_duplicates(subset='name', keep='first', inplace=True)
    ticker.drop_duplicates(subset='ticker', keep='first', inplace=True)
    # column cik to index
    ticker.set_index('cik', inplace=True)
    return ticker

def findticker(cik):
    try:
        result = ticker.at[cik, 'ticker']
    except:
        return
    return result

def findexchange(cik):
    try:
        result = ticker.at[cik, 'exchange']
    except:
        return
    return result

dir = Path(PATH)
index = get_index(dir)
summary = get_summary(dir)
ticker = get_ticker(dir)

summary['TICKER'] = summary['CIK'].map(findticker)
summary['EXCHANGE'] = summary['CIK'].map(findexchange)


summary['path'] = summary['FILING_DATE'].apply(str) + '_' + summary['FORM_TYPE'].apply(str) + '_' + 'edgar_data' + '_' + summary['CIK'].apply(str) + '_' + summary['ACC_NUM'].apply(str)+'.txt'
# change column name

summary.rename(columns={'FILING_DATE': 'date', 'FORM_TYPE': 'type', 'CoName': 'name', 'CIK': 'cik', 'TICKER': 'ticker', 'EXCHANGE':'exchange'}, inplace=True)
summary.rename(columns={'ACC_NUM': 'acc'}, inplace=True)

summary.set_index('acc', inplace=True)
summary.to_pickle(dir / 'summary.pkl')
a = pd.read_pickle(dir / 'summary.pkl')
a