특정 폴더의 파일 목록을 조회

In [25]:
import os,sys
files = os.listdir('.')
for f in files:
    if os.path.isfile(f):
        print(f, '\t\t',os.path.abspath(f))
    else:
        print("%s/" % f, '\t\t', os.path.abspath(f))

.git/ 		 D:\study\python\python-sample\.git
.gitignore 		 D:\study\python\python-sample\.gitignore
.idea/ 		 D:\study\python\python-sample\.idea
.ipynb_checkpoints/ 		 D:\study\python\python-sample\.ipynb_checkpoints
LICENSE 		 D:\study\python\python-sample\LICENSE
pandas/ 		 D:\study\python\python-sample\pandas
README.md 		 D:\study\python\python-sample\README.md
snippets.ipynb 		 D:\study\python\python-sample\snippets.ipynb


현재 디렉토리 조회

In [26]:
print('os.curdir =','\t', os.curdir)
print('os.getcwd() =','\t', os.getcwd())

os.curdir = 	 .
os.getcwd() = 	 D:\study\python\python-sample


In [53]:
import pandas as pd

def load_vocab(folder):
    cates = []
    vocab_fpaths = []
    for fname in os.listdir(folder):
        fpath = os.path.join(folder, fname)
        if os.path.isfile(fpath):
            _, ext = os.path.splitext(fpath)
            if '.vocab' == ext:
                fname_part = fname.replace('.vocab','')
                arr = fname_part.split('-')
                if len(arr) == 2:
                    vocab_fpaths.append(os.path.normpath(fpath))
                    cates.append(arr[1])
    
    word_set = set()
    for vocab_file in vocab_fpaths:
        print('load vocab file:', vocab_file)
        df = pd.read_csv(vocab_file, names=['word', 'freq'])
        word_set = word_set.union(df['word'].tolist())
        del df
    vocabulary_inv = list(word_set)
    vocabulary_inv.insert(0, '<PAD/>')
    vocabulary = {word: (index + 1) for index, word in enumerate(vocabulary_inv)}
    return cates, vocabulary, vocabulary_inv

cates, vocabulary, vocabulary_inv = load_vocab('x:/train/vocab')
print(cates)

        

load vocab file: x:\train\vocab\ko-news.vocab
load vocab file: x:\train\vocab\ko-porn.vocab
['news', 'porn']


In [55]:
import os

def _list_sentence_files(sentence_folder, file_size_filter=None):
    if file_size_filter != None:
        filter_txt = '-' + file_size_filter + '.sentence.txt'
    else:
        filter_txt = '.sentence.txt'
        
    fname_list = []
    for fname in os.listdir(sentence_folder):
        if fname.endswith(filter_txt):
            fname_list.append(fname)
    return filter_txt, fname_list

def load_data(sentence_folder, vocab_folder=None, file_size_filter=None):
    print(sentence_folder)
    filter_txt, fname_list = _list_sentence_files(sentence_folder, file_size_filter)
    labels = []
    
    # read cates
    # key: filename, value=label
    fname_label_dict = {}
    for fname in fname_list:
        fname_part = fname.replace(filter_txt, '')
        arr = fname_part.split('.')[0].split('-')
        if len(arr) == 2:
            lang = arr[0]
            label = arr[1]
            fname_label_dict[fname] = label

    labels = fname_label_dict.values()
    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))
    
    x_raw = []
    y_raw = []
    # read sentences from file
    for fname in fname_label_dict.keys():
        x_vals = []
        label = fname_label_dict[fname]
        print('reading sentences from: ', fname)
        with open(os.path.join(sentence_folder, fname), 'r', encoding='UTF-8') as f:
            while True:
                line = f.readline()
                if line == '':
                    break
                x_vals.append(line.strip().split(' '))
                y_raw.append(label_dict[label])
        x_raw.extend(x_vals)
    print(x_raw)
    
    # load vocab
    cates, vocabulary, vocabulary_inv = load_vocab(vocab_folder)
    
    # pad sentences
    x_raw = pad_sentences(x_raw, vocabulary=vocabulary, forced_sequence_length=30)
    x = np.array([[vocabulary[word] for word in sentence] for sentence in x_raw])
    y = np.array(y_raw)
    return x, y, vocabulary, vocabulary_inv, labels
    
load_data('x:/sentence', vocab_folder='x:/train/vocab', file_size_filter='small')

x:/sentence
reading sentences from:  ko-news-small.sentence.txt
reading sentences from:  ko-porn-small.sentence.txt
[['중국과', '세계', '사상의', '대화', '칭화대', '포럼', '주요', '발언'], ['커피콩', '한', '알', '안', '나는데', '이탈리아', '커피의', '비밀'], ['커피', '열매의', '주요', '산지는', '적도', '부근의', '북위', '남위', '사이에', '있는', '소위', '커피콩', '벨트', '에', '집중돼', '있다'], ['커피의', '원산지인', '아프리카', '에티오피아를', '비롯한', '케냐', '중남미의', '브라질과', '콜롬비아', '동남아에선', '베트남과', '인도네시아', '등이', '주요', '수출국이다'], ['커피콩은', '커피', '열매', '속에', '든', '씨앗의', '껍질을', '벗기고', '말린', '것이다'], ['딱딱한', '초록색', '생두를', '볶는', '작업이', '로스팅인데', '이', '과정을', '거치면', '카페에서', '볼', '수', '있는', '갈색', '원두가', '된다'], ['그렇다면', '커피콩', '한', '알', '나지', '않는', '이탈리아가', '스타벅스를', '탄생시킨', '커피의', '본산으로', '불리는', '까닭은', '무엇일까'], ['유럽에서', '가장', '긴', '커피', '역사와', '수많은', '지역', '카페들', '덕분이다'], ['당시', '이탈리아', '해상무역의', '거점이던', '베네치아를', '통해', '초', '오스만제국에서', '유럽으로', '커피가', '전파됐다'], ['커피를', '마시는', '공간인', '카페가', '유럽에서', '처음', '등장한', '곳도', '베네치아다'], ['다양한', '커피', '음료의', '기본이', '되는', '에스프레소는', '곱게', '간', '원두에', '뜨거

load vocab file: x:\train\vocab\ko-porn.vocab


NameError: name 'pad_sentences' is not defined