In [2]:
import json
import os
import datetime as dt 
from tqdm import tqdm
import pandas as pd 
import numpy as np
import time

In [3]:
# The category name is whatever precedes the first dot of the file name.
def f2cat(filename: str) -> str:
    """Return the part of ``filename`` that comes before its first ``.``.

    A name that contains no dot is returned unchanged; a name that starts
    with a dot yields an empty string.
    """
    stem, _dot, _rest = filename.partition('.')
    return stem

class Simplified():
    """Reader for a directory that holds one ``<category>.csv`` file per category."""

    def __init__(self, input_path = '/share/data/quickdraw-doodle-recognition/train_simplified/'):
        """Remember the directory that contains the per-category CSV files."""
        self.input_path = input_path

    def list_all_categories(self):
        """Return all category names in ``input_path``, sorted case-insensitively.

        Only ``*.csv`` entries are considered, because ``read_training_csv``
        loads ``<category>.csv``; other directory entries (checkpoint folders,
        hidden files, ...) would otherwise show up as categories that cannot
        be read.
        """
        files = os.listdir(self.input_path)
        return sorted(
            [f2cat(f) for f in files if f.endswith('.csv')],
            key = str.lower
        )

    def read_training_csv(self, category, nrows = None, usecols = None, drawing_transform = False):
        """Load ``<category>.csv`` from ``input_path`` into a DataFrame.

        Parameters
        ----------
        category : str
            File name without the ``.csv`` extension.
        nrows : int, optional
            Forwarded to ``pandas.read_csv``; ``None`` reads every row.
        usecols : optional
            Forwarded to ``pandas.read_csv`` to restrict the loaded columns.
        drawing_transform : bool
            When true, the ``drawing`` column is decoded with ``json.loads``.
            The column must then be part of the loaded columns.

        Returns
        -------
        pandas.DataFrame
            The loaded rows. ``timestamp`` is parsed as a date whenever that
            column is among the loaded columns.
        """
        path = os.path.join(self.input_path, category + '.csv')

        # Peek at the header only: asking read_csv to parse a date column that
        # `usecols` filtered out raises ValueError, so request date parsing
        # solely when 'timestamp' is really going to be loaded.
        loaded_columns = pd.read_csv(path, nrows = 0, usecols = usecols).columns
        parse_dates = ['timestamp'] if 'timestamp' in loaded_columns else False

        df = pd.read_csv(
            path,
            nrows = nrows, parse_dates = parse_dates, usecols = usecols
        )

        if drawing_transform:
            df['drawing'] = df['drawing'].apply(json.loads)
        return df

# Wall-clock start; the final cell subtracts it from the end time to report the run duration.
start = dt.datetime.now()

# Reader bound to the default training-data directory.
s = Simplified()
# Number of shard files: used below as the modulus for the shard index and as
# the number of train_k{k}.csv files to write and shuffle.
NVSVS = 100

# Category names, sorted case-insensitively; a category's position in this list
# becomes its integer label `y` in the sharding loop.
categories = s.list_all_categories()
print(len(categories))

340


In [4]:
# Distribute the rows of every category over NVSVS shard CSV files.
shard_dir = '/data/python/tensorflow/shuffle_data'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(shard_dir, exist_ok = True)

# Wrap the list itself rather than the enumerate() generator: tqdm can then read
# its length and show a real progress bar with percentage and ETA.
for y, cat in enumerate(tqdm(categories)):
    df = s.read_training_csv(cat)
    df['y'] = y  # integer label = position of the category in `categories`
    df['cv'] = (df.key_id // 10**7) % NVSVS  # shard index in [0, NVSVS)
    # key_id is only needed to derive `cv`; drop it once instead of once per shard.
    shard_rows = df.drop(['key_id'], axis = 1)
    for k in range(NVSVS):
        filename = os.path.join(shard_dir, 'train_k{}.csv'.format(k))
        chunk = shard_rows[shard_rows.cv == k]
        if y == 0:
            # First category: (re)create the file and write the header row.
            chunk.to_csv(filename, index = False)
        else:
            # Later categories: append rows below the existing header.
            chunk.to_csv(filename, mode = 'a', header = False, index = False)

340it [15:53,  2.80s/it]


In [5]:
# Shuffle the rows of every shard CSV and store the result gzip-compressed.
# The uncompressed shard files are kept: their removal is disabled below.
SHUFFLE_SEED = 2020  # fixed seed so that re-running the cell reproduces the same row order
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

gzip_dir = '/data/python/tensorflow/shuffle_data_gzip'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(gzip_dir, exist_ok = True)

last_shape = None  # shape of the most recently processed shard, if any
for k in tqdm(range(NVSVS)):
    filename = '/data/python/tensorflow/shuffle_data/train_k{}.csv'.format(k)
    if not os.path.exists(filename):
        continue
    df = pd.read_csv(filename)
    df = df.sample(frac = 1, random_state = shuffle_rng)  # random permutation of all rows
    df.to_csv(
        os.path.join(gzip_dir, 'train_k{}.csv.gz'.format(k)),
        compression = 'gzip', index = False
    )
    last_shape = df.shape
    # tqdm.write prints the message without corrupting the progress bar.
    tqdm.write('{} \t 第{}个数据压缩成功'.format(time.strftime('%Y-%m-%d %H:%M:%S'), k))
    # os.remove(filename)  # enable to delete the uncompressed shard after compression

# Report a shape only if this loop actually processed a shard; otherwise `df`
# would be a leftover from an earlier cell.
if last_shape is not None:
    print(last_shape)

end = dt.datetime.now()
# total_seconds() covers the whole interval; `.seconds` would drop full days.
print('lastest run {}.\nTotal time {}s'.format(end, int((end - start).total_seconds())))

1%|          | 1/100 [02:09<3:32:57, 129.07s/it]2020-09-17 09:20:03 	 第0个数据压缩成功
  2%|▏         | 2/100 [04:18<3:31:11, 129.30s/it]2020-09-17 09:22:13 	 第1个数据压缩成功
  3%|▎         | 3/100 [06:28<3:29:22, 129.51s/it]2020-09-17 09:24:23 	 第2个数据压缩成功
  4%|▍         | 4/100 [08:38<3:27:26, 129.66s/it]2020-09-17 09:26:33 	 第3个数据压缩成功
  5%|▌         | 5/100 [10:48<3:25:12, 129.61s/it]2020-09-17 09:28:42 	 第4个数据压缩成功
  6%|▌         | 6/100 [12:57<3:22:48, 129.46s/it]2020-09-17 09:30:51 	 第5个数据压缩成功
  7%|▋         | 7/100 [15:07<3:20:42, 129.49s/it]2020-09-17 09:33:01 	 第6个数据压缩成功
  8%|▊         | 8/100 [17:15<3:18:10, 129.24s/it]2020-09-17 09:35:09 	 第7个数据压缩成功
  9%|▉         | 9/100 [19:24<3:15:58, 129.22s/it]2020-09-17 09:37:19 	 第8个数据压缩成功
 10%|█         | 10/100 [21:33<3:13:43, 129.15s/it]2020-09-17 09:39:28 	 第9个数据压缩成功
 11%|█         | 11/100 [23:42<3:11:23, 129.02s/it]2020-09-17 09:41:36 	 第10个数据压缩成功
 12%|█▏        | 12/100 [25:51<3:08:59, 128.86s/it]2020-09-17 09:43:45 	 第11个数据压缩成功
 13%|█▎      