### PREPROCESS TXTs WITH DATA AND PRODUCE PICKLES BY MONTHS

In [None]:
import pandas as pd
from datetime import datetime
import pickle
import bz2
from pathlib import Path

In [None]:
dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

In [None]:
data_path = Path('./data/').resolve()
filenumber = '11'

In [None]:
colunms = [s for s in 'date;id_doc;id_order;id_card;id_tov;id_kontr;quantity;sum;is_green'.split(';')]
colunms

In [None]:
df_chunks = pd.read_csv(
    filepath_or_buffer= data_path / (filenumber + '.txt'),
    header=0,
    sep=';',
    names = colunms,
    usecols = colunms,
    parse_dates=['date'],
    date_parser=dateparse,
    chunksize=1000000,
    # skiprows=274011
)

In [None]:
with bz2.open(data_path / 'card_lbe.pkl.bz2', 'rb') as f:
    card_mapper = pickle.load(f)
card_mapper.id_card = card_mapper.id_card.str.strip()
card_mapper.head()

In [None]:
with bz2.open(data_path / 'doc_lbe.pkl.bz2', 'rb') as f:
    doc_mapper = pickle.load(f)
doc_mapper.id_doc = doc_mapper.id_doc.str.strip()
doc_mapper.head()

In [None]:
def map_data(df, card_mapper, doc_mapper):
    df.id_card = df.id_card.str.strip()
    df.id_doc = df.id_doc.str.strip()

    df = pd.merge(df, card_mapper, on='id_card', how='left')
    df = pd.merge(df, doc_mapper, on='id_doc', how='left')

    del df['id_card']
    del df['id_doc']

    return df

def cleanup_data(df):
    na_int_val = -9999
    df['quantity'] = df['quantity'].str.replace(',', '.').astype(float)
    df['sum'] = df['sum'].str.replace(',', '.').astype(float)
    df['is_green'] = df['is_green'].astype('bool')
    df['id_kontr'] = df['id_kontr'].fillna(na_int_val).astype(int)

    print(df.dtypes)

    if df.isnull().values.any():
        print("ERROR: THERE ARE STILL NULL VALUES")
        print(df.loc[:, df.isnull().any()].columns)

    return df

In [None]:
for chunk in df_chunks:
    chunk = map_data(chunk, card_mapper, doc_mapper)
    chunk = cleanup_data(chunk)
    print(chunk.head())
    with bz2.open(data_path / (filenumber + '_myprepared.pkl.bz2'), 'ab') as f:
        pickle.dump(chunk, f, protocol=4)

### FIND BAD CARD IDS AND SAVE TO TXT

In [1]:
from pathlib import Path
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder
import pickle
import bz2
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data_path = Path('./data/').resolve()
!ls {data_path}

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
with bz2.open(data_path / '11_prepared.pkl.bz2', 'rb') as f:
    df = pickle.load(f)
df.head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
0,2020-11-01 15:29:01,2220AF19-3E1C-EB11-B444-005056A7539A,0,1826606,52,271,1.0,107.98,False,332351,3121751
1,2020-11-01 12:41:10,F102DC7D-261C-EB11-B444-005056A7539A,8293317,C560312,52,-9999,1.0,108.0,False,1875438,22047017
2,2020-11-01 13:17:09,4632D419-2C1C-EB11-B444-005056A7539A,0,3225041,61,379,2.0,112.0,False,563112,6419825
3,2020-11-01 12:31:31,706B9E66-251C-EB11-B444-005056A7539A,0,B700679,61,379,1.0,55.91,False,1841710,10284541
4,2020-11-01 13:06:15,71F3E090-2A1C-EB11-B444-005056A7539A,0,7024046,61,271,2.0,87.8,False,1304277,10424967


In [4]:
df.drop(['id_card', 'id_doc'], axis=1, inplace=True)
df.head()

Unnamed: 0,date,id_order,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
0,2020-11-01 15:29:01,0,52,271,1.0,107.98,False,332351,3121751
1,2020-11-01 12:41:10,8293317,52,-9999,1.0,108.0,False,1875438,22047017
2,2020-11-01 13:17:09,0,61,379,2.0,112.0,False,563112,6419825
3,2020-11-01 12:31:31,0,61,379,1.0,55.91,False,1841710,10284541
4,2020-11-01 13:06:15,0,61,271,2.0,87.8,False,1304277,10424967


In [5]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,date,id_order,id_tov,id_kontr,quantity,sum,is_green,id_card_int,id_doc_int
0,2020-11-23 16:20:14,0,18968,21847,1.0,149.0,False,276039,20875398
1,2020-11-09 21:07:55,8806483,22516,-9999,2.0,62.0,False,1284109,16188924
2,2020-11-15 19:30:37,0,19120,131,1.0,35.0,False,1358347,5985850
3,2020-11-21 18:50:30,0,27494,18670,1.0,93.0,True,1426130,370379
4,2020-11-06 11:53:41,8582923,34071,13997,1.0,135.0,False,535985,18268849


In [6]:
len(df)

46496621

In [7]:
bad_cards = set()

In [8]:
def find_bad_id_cards(chunk):
    bad_cards.update(chunk[chunk['sum'] < 0]['id_card_int'].unique())
    bad_cards.update(chunk[chunk['quantity'] < 0]['id_card_int'].unique())
    bad_cards.update(chunk[chunk['is_green'] < 0]['id_card_int'].unique())
    bad_cards.update(chunk[chunk['id_tov'] < 0]['id_card_int'].unique())
    bad_cards.update(chunk[chunk['id_order'] < 0]['id_card_int'].unique())
    bad_cards.update(chunk[(chunk['id_kontr'] < 0) & (chunk['id_kontr'] != -9999)]['id_card_int'].unique())
    
    card_unique_days = chunk.groupby(['id_card_int'])['day'].nunique()
    card_unique_days = card_unique_days.sort_values(ascending=False)
    card_unique_days = pd.DataFrame(card_unique_days)
    bad_days = card_unique_days[card_unique_days['day'] > 15].reset_index()
    bad_cards.update(bad_days['id_card_int'].unique())
    # print(bad_days['id_card_int'].unique())

    doc_card_day_sum_grouped = chunk.groupby(['id_card_int','day'])
    cards_sum_by_days = doc_card_day_sum_grouped.sum()
    cards_sum_by_days = cards_sum_by_days.reset_index()
    
    bad_cards.update(cards_sum_by_days[cards_sum_by_days['sum'] > 50000]['id_card_int'].unique())
    bad_cards.update(cards_sum_by_days[cards_sum_by_days['quantity'] > 1000]['id_card_int'].unique())

def write_bad_ids_to_file_smart(file_name):
    known_bad_ids = set()
    with open(data_path / file_name, 'r') as input_file:
        for item in input_file.readlines():
            known_bad_ids.add(int(item.strip()))

    known_bad_ids.update(bad_cards)

    with open(data_path / file_name, 'w') as output_file:
        for item in known_bad_ids:
            output_file.write(str(item) + "\n")


In [13]:
CHUNK_SIZE = 2000000
OUTPUT_TXT_NAME = 'bad_ids.txt'

In [10]:
for chunk_id in range((len(df) + CHUNK_SIZE - 1) // CHUNK_SIZE):
    chunk = df[chunk_id * CHUNK_SIZE : (chunk_id + 1) * CHUNK_SIZE]
    chunk['day'] = chunk['date'].dt.date
    find_bad_id_cards(chunk)
    write_bad_ids_to_file_smart(OUTPUT_TXT_NAME)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package s

### TRANSFORM TXT OUTPUT TO PICKLE

In [17]:
def copy_from_txt_to_pickle(file_name):
    known_bad_ids = set()
    with open(data_path / file_name, 'r') as input_file:
        for item in input_file.readlines():
            known_bad_ids.add(int(item.strip()))

    df_txt = pd.DataFrame({'id_card_int' : list(known_bad_ids)})
    print(df_txt)
    with bz2.open(data_path / 'hw1.1.pkl.bz2', 'wb') as f:
        pickle.dump(df_txt, f, protocol=4)

In [16]:
copy_from_txt_to_pickle(OUTPUT_TXT_NAME)

### SOME TRASH NEXT. I ANALYZED STATISTICS THERE, NOW IT IS BROKEN.

In [None]:

fig = plt.figure()
ax = fig.add_subplot(111)
n, bins, rectangles = ax.hist(card_unique_days, 50)
fig.canvas.draw()
plt.show()

In [None]:
pd.DataFrame(card_unique_days).boxplot()

In [None]:

good_cards = card_unique_days[card_unique_days['day'] <= 15]
good_cards = good_cards.reset_index()

In [None]:
chunk['id_card_int'].nunique() - len(good_cards)

In [None]:

bad_cards = bad_cards.reset_index()
100*(len(bad_cards) / len(chunk))

In [None]:
list_good_cards = set(good_cards['id_card_int'])
doc_card_day_sum_clean = chunk[chunk.id_card_int.isin(list_good_cards)]
len(doc_card_day_sum_clean)

In [None]:
len(doc_card_day_sum_clean['id_card_int'].unique())

In [None]:
doc_card_day_sum_clean_grouped = doc_card_day_sum_clean.groupby(['id_card_int','day'])
doc_card_day_sum_clean_grouped_sum = doc_card_day_sum_clean_grouped.sum()
doc_card_day_sum_clean_grouped_sum_index = doc_card_day_sum_clean_grouped_sum.reset_index()
doc_card_day_sum_clean_grouped_sum_index.max()

In [None]:
len(doc_card_day_sum_clean_grouped_sum_index)

In [None]:
len(doc_card_day_sum_clean)

In [None]:
doc_card_day_sum_clean_grouped_sum_index.dtypes

In [None]:
cards = doc_card_day_sum_clean_grouped_sum_index[doc_card_day_sum_clean_grouped_sum_index['id_card_int'] == 100]
cards.head()

In [None]:
cards = doc_card_day_sum_clean[(doc_card_day_sum_clean['id_card_int'] == 100) & (doc_card_day_sum_clean['date'] >= '2020-09-24')]
cards.head()

In [None]:
doc_card_day_sum_clean_grouped_sum_index.mean()

In [None]:
doc_card_day_sum_clean_grouped_sum_index.median()

In [None]:
print(
    np.percentile(doc_card_day_sum_clean_grouped_sum_index['sum'], q=75),
    np.percentile(doc_card_day_sum_clean_grouped_sum_index['sum'], q=95),
    np.percentile(doc_card_day_sum_clean_grouped_sum_index['sum'], q=99)
)

In [None]:
pd.DataFrame(doc_card_day_sum_clean['is_green']).boxplot()

In [None]:
doc_card_day_sum_clean_grouped_sum_index[doc_card_day_sum_clean_grouped_sum_index['sum'] > 5000]

In [None]:
bad_cards = doc_card_day_sum_clean_grouped_sum_index[doc_card_day_sum_clean_grouped_sum_index['sum'] > 10000]
bad_cards = list(bad_cards['id_card_int'].unique())
final_clean = doc_card_day_sum_clean_grouped_sum_index[~doc_card_day_sum_clean_grouped_sum_index.id_card_int.isin(bad_cards)]
fig = plt.figure()
ax = fig.add_subplot(111)
n, bins, rectangles = ax.hist(final_clean['sum'], 50)
fig.canvas.draw()
plt.show()

In [None]:
pd.DataFrame(final_clean['quantity']).boxplot()

In [None]:
final_clean.describe()

In [None]:
pd.DataFrame(final_clean['id_kontr']).boxplot()

In [None]:
bad_cards = final_clean[(final_clean['id_kontr'] < 0) & (final_clean['id_kontr'] != -9999)]
bad_cards = list(bad_cards['id_card_int'].unique())
len(bad_cards)
# bad_cards.head()

In [None]:
len(final_clean)

In [None]:
len(bad_cards)

In [None]:
bad_cards = final_clean[final_clean['quantity'] < 0]
bad_cards = list(bad_cards['id_card_int'].unique())
final_clean = final_clean[~final_clean.id_card_int.isin(bad_cards)]

In [None]:
bad_cards = final_clean[final_clean['sum'] < 0]
bad_cards = list(bad_cards['id_card_int'].unique())
final_clean = final_clean[~final_clean.id_card_int.isin(bad_cards)]

In [None]:
bad_cards = final_clean[(final_clean['id_kontr'] < 0) & (final_clean['id_kontr'] != -9999)]
bad_cards = list(bad_cards['id_card_int'].unique())
final_clean = final_clean[~final_clean.id_card_int.isin(bad_cards)]

In [None]:
bad_cards = final_clean[(final_clean['quantity'] > 20)]
bad_cards.head()

In [None]:
len(bad_cards)

In [None]:
quantities = final_clean.groupby(['id_card_int'])['quantity'].nunique()
quantities = quantities.sort_values(ascending=False)
fig = plt.figure()
ax = fig.add_subplot(111)
n, bins, rectangles = ax.hist(quantities, 50)
fig.canvas.draw()
plt.show()