# Split Data

In [11]:
import numpy as np
import os
from tqdm import tqdm
import pandas as pd

In [None]:
# Load the group table. The cells below read its `group`, `file_name` and
# `num_data` columns.
data_group = pd.read_csv('data_group.csv')
# Bare expression: shows the table as the notebook cell output.
data_group

In [3]:
# Directory the source files are read from with pd.read_stata.
original_data_path = './data/original_data'
# Directory the split CSV chunks are written to.
new_data_path = './data/new_data'
# Maximum number of rows written to a single chunk file.
limit = 295000

In [None]:
# Split every source file of every group into CSV chunks of at most `limit`
# rows, checking along the way that no row is lost or duplicated.
groups = data_group.groupby("group")

for group_id, group in tqdm(groups):

    # Row count the group table declares for this group vs. rows actually written.
    sum_group_len = group.num_data.sum()
    split_data_len = 0

    # Walk the files of the group together with their declared row counts.
    for file_name, file_len in zip(group.file_name.values, group.num_data.values):
        split_file_len = 0
        need_key_number = int(np.ceil(file_len / limit))
        data = pd.read_stata(os.path.join(original_data_path, file_name))
        base_name = file_name.split('.')[0]

        # Chunks are named "<groupId>_<fileName>_<chunkIndex>.csv".
        for i in range(need_key_number):
            new_file_name = "{}_{}_{}.csv".format(group_id, base_name, i)
            # Positional slicing: yields rows [i*limit, (i+1)*limit) whatever
            # the index labels of `data` happen to be.
            sub_data = data.iloc[i * limit:(i + 1) * limit]
            split_file_len += len(sub_data)
            split_data_len += len(sub_data)
            sub_data.to_csv(os.path.join(new_data_path, new_file_name), index=False)

        assert split_file_len == file_len, (
            "{}: wrote {} rows but the group table declares {}".format(
                file_name, split_file_len, file_len))

    assert split_data_len == sum_group_len, (
        "group {}: wrote {} rows but the group table declares {}".format(
            group_id, split_data_len, sum_group_len))

## Merge Data

In [135]:
import pandas as pd
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [136]:
def get_file_list(path):
    """Return the names of the files located directly inside *path*.

    Sub-directories are not descended into. An empty list is returned when
    *path* does not exist or cannot be listed (``os.walk`` yields nothing in
    that case), so callers can always iterate over the result.
    """
    for _root, _dirs, files in os.walk(path):
        # Only the first triple (the top-level directory) is of interest.
        return files
    return []


def get_file_name(group_id, file_path='./data/transformed_data'):
    """Collect the chunk files of one group, organised per province.

    File names are expected to look like ``<group>_<province>_<index>.<ext>``.

    Args:
        group_id: Group identifier; compared as a string against the first
            ``_``-separated field of each file name.
        file_path: Directory to scan (top level only).

    Returns:
        A list with one entry per province, ordered by province name. Each
        entry is the list of that province's file names ordered by their
        numeric chunk index.

    Raises:
        IndexError: A matching file name has fewer than three ``_`` fields.
        ValueError: A matching file name has a non-numeric chunk index.
    """
    wanted_group = str(group_id)

    # Group matching names by province in a single pass. The group field is
    # compared for equality: a substring test would let group 1 also pick up
    # the files of groups 10, 11, ...
    by_province = {}
    for name in get_file_list(file_path):
        fields = name.split('_')
        if fields[0] != wanted_group:
            continue
        by_province.setdefault(fields[1], []).append(name)

    def chunk_index(name):
        # "3_foo_12.csv" -> 12; numeric so that chunk 10 sorts after chunk 2.
        return int(name.split('_')[2].split('.')[0])

    # Sorted province order keeps the result stable between calls.
    return [
        sorted(by_province[province], key=chunk_index)
        for province in sorted(by_province)
    ]

In [137]:
def _concat_csv_files(directory, file_names):
    """Read every CSV of *file_names* from *directory* and stack them in order."""
    frames = [pd.read_csv(os.path.join(directory, name)) for name in file_names]
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0,
    # and quadratic because each append copies everything accumulated so far).
    # Each chunk keeps its own row labels, exactly as append did.
    return pd.concat(frames)


def merge_data(group_id, transformed_path, error_path, merge_path, merge_error_path, group_path):
    """Merge the per-province chunk files of one group back into single CSVs.

    For every province of the group, the transformed chunks are concatenated
    in chunk order and written to ``<merge_path>/<province>.csv``; the error
    chunks are concatenated and written to
    ``<merge_error_path>/<province>_error.csv``. Both files are written with
    the DataFrame index as their first column.

    Args:
        group_id: Group whose files are merged.
        transformed_path: Directory holding the transformed chunk CSVs.
        error_path: Directory holding the error chunk CSVs.
        merge_path: Output directory for the merged transformed data.
        merge_error_path: Output directory for the merged error data.
        group_path: CSV with ``file_name`` and ``num_data`` columns giving
            the expected row count of each original ``.dta`` file.

    Raises:
        AssertionError: A merged province does not have the row count listed
            in the group table, or the number of rows with
            ``longitude == -1`` differs from the number of error rows.
        ValueError: The group table does not contain exactly one row for
            ``<province>.dta`` (raised by ``Series.item``).
    """
    transformed_list = get_file_name(group_id, transformed_path)
    error_list = get_file_name(group_id, error_path)
    group = pd.read_csv(group_path)

    # province name -> number of rows with longitude == -1
    # (presumably the marker of a failed transformation -- confirm).
    unresolved_rows = {}
    for province in transformed_list:
        # Every file of a province shares the same second name field.
        province_name = province[0].split("_")[1]
        new_data = _concat_csv_files(transformed_path, province)

        unresolved_rows[province_name] = int((new_data.longitude == -1).sum())
        expected_len = group[group.file_name == province_name + ".dta"].num_data.item()
        assert expected_len == len(new_data), (
            "{}: merged {} rows but the group table declares {}".format(
                province_name, len(new_data), expected_len))
        new_data.to_csv(os.path.join(merge_path, province_name + ".csv"))

    # province name -> number of rows in the merged error data.
    logged_errors = {}
    for province in error_list:
        province_name = province[0].split("_")[1]
        all_error_data = _concat_csv_files(error_path, province)

        logged_errors[province_name] = len(all_error_data)
        all_error_data.to_csv(os.path.join(merge_error_path, province_name + "_error.csv"))

    # Compared per province name, so the check does not depend on the order
    # in which the two directories happened to be listed.
    assert unresolved_rows == logged_errors, (
        "group {}: longitude == -1 counts {} do not match error counts {}".format(
            group_id, unresolved_rows, logged_errors))

In [138]:
# Directories and files handed to merge_data below.
# Transformed chunk CSVs to be merged.
transformed_path='./data/transformed_data'
# Error chunk CSVs to be merged.
error_path = './data/error_info'
# Output directory for the merged "<province>.csv" files.
merge_path = './data/merge_data'
# Output directory for the merged "<province>_error.csv" files.
merge_error_path = './data/merge_error'
# Group table (file_name / num_data columns) used to check merged row counts.
group_path = './data_group.csv'

In [139]:
# Merge the chunk files of group ids 1 through 11, one group per iteration.
for group_id in tqdm(range(1, 12)):
    merge_data(
        group_id=group_id,
        transformed_path=transformed_path,
        error_path=error_path,
        merge_path=merge_path,
        merge_error_path=merge_error_path,
        group_path=group_path,
    )

100%|██████████| 11/11 [02:13<00:00, 12.14s/it]
