### Cleaning

In [1]:
from pathlib import Path
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder
import pickle
import bz2

In [2]:
data_path = Path('./data/').resolve()
!ls {data_path}

09.csv		     10.csv  products_20200901.txt  products_20201101.txt
09_prepared.pkl.bz2  11.csv  products_20201001.txt


In [3]:
!du -sh {data_path / '09.csv'}
!du -sh {data_path / '10.csv'}
!du -sh {data_path / '11.csv'}

3,8G	/data/Projects/inno_stats/data/09.csv
4,1G	/data/Projects/inno_stats/data/10.csv
4,2G	/data/Projects/inno_stats/data/11.csv


In [4]:
# Input file processed by this run of the notebook.
file_path = data_path.joinpath('09.csv')

In [5]:
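# One-off manual fix for the raw 10.csv, run from the data directory before loading it.
# It deletes line 274011 in place (presumably a malformed row; confirm before re-applying):
# sed -i '274011d' 10.csv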

In [6]:
# Load the ';'-separated CSV: `date` is parsed to datetime and both
# identifier columns are read as pandas categoricals.
# NOTE(review): `infer_datetime_format` is deprecated since pandas 2.0;
# drop it once the environment is upgraded.
id_dtypes = {'id_doc': 'category', 'id_card': 'category'}
df = pd.read_csv(
    file_path,
    sep=';',
    parse_dates=['date'],
    infer_datetime_format=True,
    dtype=id_dtypes,
)
df.head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271.0,1000,108,0
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271.0,1000,108,0
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271.0,1000,10795,0
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271.0,1000,108,0
4,2020-09-01 18:42:31,0706023F-6BEC-EA11-B444-005056A7539A,0,5732396,61,97.0,2000,88,0


In [7]:
# Same preview as the previous cell (first five rows of the raw frame).
df.head(n=5)

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271.0,1000,108,0
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271.0,1000,108,0
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271.0,1000,10795,0
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271.0,1000,108,0
4,2020-09-01 18:42:31,0706023F-6BEC-EA11-B444-005056A7539A,0,5732396,61,97.0,2000,88,0


In [8]:
# Inspect the column dtypes as loaded by read_csv.
df.dtypes

date        datetime64[ns]
id_doc            category
id_order             int64
id_card           category
id_tov               int64
id_kontr           float64
quantity            object
sum                 object
is_green             int64
dtype: object

In [9]:
# Sentinel written into `id_kontr` where the value is missing, so the column
# can be stored as a plain integer.
na_int_val = -9999

# `quantity` and `sum` are loaded as strings with a decimal comma; switch the
# separator to a point and convert to float.
# Not re-runnable: after conversion the columns no longer expose `.str`.
for num_col in ('quantity', 'sum'):
    df[num_col] = df[num_col].str.replace(',', '.').astype(float)

df['is_green'] = df['is_green'].astype(bool)
df['id_kontr'] = df['id_kontr'].fillna(na_int_val).astype(int)

In [11]:
# Re-check the dtypes after the numeric / boolean conversions.
df.dtypes

date        datetime64[ns]
id_doc            category
id_order             int64
id_card           category
id_tov               int64
id_kontr             int64
quantity           float64
sum                float64
is_green              bool
dtype: object

In [12]:
# Remove surrounding whitespace from both identifier columns.
# `.str.strip()` returns object dtype, so the categorical dtype set at load
# time is dropped here.
for id_col in ('id_card', 'id_doc'):
    df[id_col] = df[id_col].str.strip()

In [13]:
# Preview after type conversion and whitespace stripping.
# NOTE(review): the recorded output shows index 0-3 followed by 6, and there is
# no cell numbered In [10]. Presumably a row-dropping cell was executed and
# then removed; confirm and restore it so Restart & Run All reproduces this state.
df.head(n=5)

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271,1.0,108.0,False
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271,1.0,108.0,False
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271,1.0,107.95,False
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271,1.0,108.0,False
6,2020-09-01 19:50:31,1B34A539-74EC-EA11-B444-005056A7539A,0,4119786,61,97,1.0,56.0,False


In [14]:
# Map the string identifiers to dense integer codes, one encoder per column.
# The fitted encoders stay in scope so the codes can be mapped back.
lbe_card, lbe_doc = LabelEncoder(), LabelEncoder()

df['id_card_int'] = lbe_card.fit_transform(df['id_card'])
df['id_doc_int'] = lbe_doc.fit_transform(df['id_doc'])

df.head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
0,2020-09-01 17:01:25,0E05D308-5CEC-EA11-B444-005056A7539A,0,8594499,52,271,1.0,108.0,False,1087018,386563
1,2020-09-01 17:35:36,610205A1-61EC-EA11-B444-005056A7539A,0,1452388,52,271,1.0,108.0,False,199266,2676152
2,2020-09-01 19:19:21,8AF19602-70EC-EA11-B444-005056A7539A,0,3493538,52,271,1.0,107.95,False,444332,3832560
3,2020-09-01 21:53:34,EB6C71A3-84EC-EA11-B444-005056A7539A,0,2491281,52,271,1.0,108.0,False,325059,6489899
6,2020-09-01 19:50:31,1B34A539-74EC-EA11-B444-005056A7539A,0,4119786,61,97,1.0,56.0,False,533043,750155


In [15]:
# Final dtype check, including the two new integer code columns, before saving.
df.dtypes

date           datetime64[ns]
id_doc                 object
id_order                int64
id_card                object
id_tov                  int64
id_kontr                int64
quantity              float64
sum                   float64
is_green                 bool
id_card_int             int64
id_doc_int              int64
dtype: object

In [16]:
# Persist the cleaned frame next to the raw data as a bz2-compressed pickle.
# The output name is derived from the input file's stem ('09.csv' ->
# '09_prepared.pkl.bz2'), so pointing `file_path` at another month's CSV
# cannot overwrite a previously prepared month.
prepared_path = data_path / (file_path.stem + '_prepared.pkl.bz2')
with bz2.open(prepared_path, 'wb') as f:
    # Protocol 4 is kept as in the original so existing readers still work.
    pickle.dump(df, f, protocol=4)