In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import os

In [2]:
def filename(index):
    """Return the name of the raw export file numbered `index`."""
    export_name = f'export_file_{index}.csv'
    return export_name

# Column dtypes keyed by position in the `names` list passed to read_csv below.
# Position 1 ('Date') is absent because it is parsed as a datetime instead.
_dtypes = {0: np.int32,    # Shop_id
           2: np.int32,    # Check
           3: np.int32,    # Product_id
           4: str,         # Product_name
           5: np.float64,  # Quantity
           6: np.float64}  # Price


def get_chunks(filename, encoding='utf-8', decimal=b'.', sep=',',
               chunksize=20000, skiprows=(2,)):
    """Open a sales CSV and return an iterator over DataFrame chunks.

    The first non-skipped line of the file is treated as a header and is
    replaced by the fixed column names Shop_id, Date, Check, Product_id,
    Product_name, Quantity and Price.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    encoding : str
        Text encoding of the file.
    decimal : str or bytes
        Single-character decimal separator. A bytes value (as used by the
        existing callers) is decoded to str before it is handed to pandas.
    sep : str
        Field separator.
    chunksize : int
        Number of rows per yielded chunk.
    skiprows : list-like, int or callable
        Forwarded to ``pandas.read_csv``. The default skips file line 2
        (0-based), which is what this function has always done.
        NOTE(review): for files that were already cleaned this drops one
        real data row - confirm whether that is intended.

    Returns
    -------
    Iterator of pandas.DataFrame, as produced by ``pandas.read_csv`` when
    ``chunksize`` is given.
    """
    # pandas documents `decimal` as a 1-character str; normalise bytes input.
    if isinstance(decimal, bytes):
        decimal = decimal.decode('ascii')

    # `infer_datetime_format` is intentionally not passed: pandas deprecated
    # it in 2.0, where format inference became the default behaviour.
    return pd.read_csv(filename,
                       sep=sep,
                       encoding=encoding,
                       chunksize=chunksize,
                       skiprows=skiprows,
                       header=0,
                       dtype=_dtypes,
                       parse_dates=[1],
                       decimal=decimal,
                       names=['Shop_id',
                              'Date',
                              'Check',
                              'Product_id',
                              'Product_name',
                              'Quantity',
                              'Price'])
    

In [6]:
# Rewrites every raw export file as a cleaned copy named '<i>.csv' and then
# deletes the raw file. Product names are stripped of surrounding whitespace.
for i in range(1, 16):
    cleaned_path = f'{i}.csv'
    first_chunk = True
    for chunk in get_chunks(filename(i), encoding='cp1251', decimal=b',', sep=';'):
        # Assign the stripped column directly. DataFrame.update aligns on the
        # index, and a bare list gets a fresh 0-based index, so it changes
        # nothing on a chunk whose index does not start at 0. `.str.strip()`
        # also leaves missing names untouched instead of raising.
        chunk['Product_name'] = chunk['Product_name'].str.strip()
        chunk.to_csv(cleaned_path,
                     # Overwrite on the first chunk so a stale or partial
                     # output from an earlier run is not appended to.
                     mode='w' if first_chunk else 'a',
                     columns=['Shop_id',
                              'Date',
                              'Check',
                              'Product_id',
                              'Product_name',
                              'Quantity',
                              'Price'],
                     float_format='%.3f',
                     index=False,
                     header=first_chunk)
        first_chunk = False
    # The raw export is removed only after all of its chunks were written.
    os.remove(filename(i))
    print('\r{} out of 15'.format(i), end='')

15 out of 15

In [4]:
# Creates 'pr_id_name.csv', mapping each Product_id to the first Product_name
# seen for it across all files.
# NOTE(review): this cell reads 'source/{i}.csv' while other cells in this
# notebook use '{i}.csv' in the working directory - confirm the intended path.
# NOTE(review): get_chunks skips file line 2 by default, so one data row per
# file is not considered here - confirm that is intended for cleaned files.
product_frames = []
for i in range(1, 16):
    for chunk in get_chunks(f'source/{i}.csv'):
        # De-duplicate per chunk first so only a small frame is kept in memory.
        product_frames.append(
            chunk[['Product_id', 'Product_name']].drop_duplicates(subset='Product_id'))
    print('\r{} out of 15'.format(i), end='')

# Collect in memory and write once: the file is overwritten on every run
# (no append to a stale file), and index=False avoids the unnamed index column.
name_id = pd.concat(product_frames, ignore_index=True)
name_id.drop_duplicates(subset='Product_id').to_csv('pr_id_name.csv', index=False)

In [3]:
# Splits the data into one CSV per calendar day under dates/, keeping only the
# Check and Product_id columns.
# NOTE(review): range(15, 16) processes only file 15 although the progress
# message says "out of 15" - confirm whether files 1..15 were intended.
# NOTE(review): outputs are opened in append mode, so running this cell again
# adds the same rows to the per-day files a second time.
os.makedirs('dates', exist_ok=True)  # to_csv does not create missing directories
for i in range(15, 16):
    for chunk in get_chunks(f'{i}.csv'):
        # Rows without a date cannot be mapped to a per-day file; fail loudly
        # rather than let groupby drop them silently.
        if chunk['Date'].isna().any():
            raise ValueError(f'{i}.csv contains rows with a missing Date')
        # One pass per chunk; sort=False keeps groups in order of first
        # appearance, matching the order in which rows were appended before.
        for date_, day_rows in chunk.groupby('Date', sort=False):
            out = f'dates/{date_:%Y-%m-%d}.csv'
            day_rows.to_csv(out,
                            index=False,
                            mode='a',
                            header=not os.path.isfile(out),
                            columns=['Check', 'Product_id'])
    print('\r{} out of 15'.format(i), end='')

15 out of 15