# App processes cleaning

In [None]:
import sys

sys.path.append('..')

In [2]:
import os
import gc
import yaml
import json
import requests
import numpy as np
import pandas as pd
import warnings

from pymongo import MongoClient

from utils import get_files, memory_usage, \
correct_encoding, update_page_metadata

warnings.filterwarnings("ignore")

In [3]:
# Base directory: two levels up from the notebook's working directory.
BASE_PATH = '../../'

# Project directories resolved against the base directory.
CONFIG_DIR = os.path.join(BASE_PATH, 'config')
STORAGE_DIR = os.path.join(BASE_PATH, 'storage')
DATA_DIR = os.path.join(BASE_PATH, 'data')

# Load the environment settings. The context manager closes the file handle
# as soon as parsing finishes instead of leaking it for the kernel's lifetime.
# NOTE(review): FullLoader is kept for compatibility; this assumes env.yml is
# a trusted, locally maintained file - confirm before pointing it elsewhere.
with open(os.path.join(CONFIG_DIR, 'env.yml')) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [4]:
from urllib.parse import quote_plus

model = 'app_processes'
model_storage = os.path.join(STORAGE_DIR, model)

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(model_storage, exist_ok=True)

# Credentials must be percent-encoded before being embedded in a MongoDB URI;
# otherwise reserved characters such as '@', ':' or '/' break URI parsing.
# NOTE(review): assumes env.yml stores the raw (unescaped) credentials - confirm.
mongo_uri = 'mongodb://{}:{}@{}:{}'.format(
    quote_plus(str(config['MONGO_USERNAME'])),
    quote_plus(str(config['MONGO_PASSWORD'])),
    config['MONGO_HOST'],
    config['MONGO_PORT'],
)
client = MongoClient(mongo_uri)
metadata_db = client[config['MONGO_DATABASE']]

model_metadata = metadata_db[model]

# Reset the model's metadata collection to a single document with no pages.
model_metadata.delete_many({})
model_metadata.insert_one({'pages': []})

<pymongo.results.InsertOneResult at 0x7f7924fffdc0>

In [5]:
# Glob pattern for this model's CSV files; get_files is a project helper that
# resolves it (presumably to a list of matching paths - see utils).
csv_pattern = os.path.join(DATA_DIR, model, '*.csv')
csv_files = get_files(csv_pattern)

In [6]:
# Fail fast: the following cells index into and iterate over csv_files.
if not csv_files:
    missing_files_message = "Couldn't find any csv files! Please make sure the filepath exists"
    raise FileNotFoundError(missing_files_message)

FileNotFoundError: Couldn't find any csv files! Please make sure the filepath exists

In [None]:
# Peek at the first 100 rows of the first file to inspect columns and dtypes.
sample_path = csv_files[0]
df = pd.read_csv(sample_path, nrows=100, sep=';')
df.info()

In [None]:
# Summary statistics restricted to the numeric columns of the sample.
numeric_summary = df.describe(include=np.number)
numeric_summary

In [None]:
# Summary statistics for every non-numeric column of the sample.
non_numeric_summary = df.describe(exclude=np.number)
non_numeric_summary

In [None]:
def save_df_page(page, chunks, collection):
    """Persist the accumulated chunks as one parquet page and record its metadata.

    Parameters
    ----------
    page : int
        Index of the page being written; it is embedded in the file name.
    chunks : list of pandas.DataFrame
        Data frames stacked, in order, into a single page.
    collection : pymongo collection
        Metadata collection; its first document (empty filter) is updated.

    Returns
    -------
    int
        ``page + 1``, i.e. the index to use for the next page.
    """
    # Stack the chunks row-wise with a fresh index
    # (the cost of this concat still needs to be benchmarked).
    page_df = pd.concat(chunks, axis=0, ignore_index=True)

    # Add this page's row count to the running counter in the metadata document.
    row_count = page_df.shape[0]
    collection.update_one({}, {'$inc': {'nrows': row_count}})

    # Record the page metadata in mongo via the project helper.
    update_page_metadata(collection, page_df, exclude=[])

    # Write the page as a brotli-compressed parquet file in the model's storage dir.
    filename = '{}.{}.parquet'.format(model, page)
    page_df.to_parquet(os.path.join(model_storage, filename),
                       compression='brotli', index=False)

    return page + 1

In [None]:
# Target dtype for each column of the cleaned frame.
mappings = dict(
    sample_id='uint32',
    domain='category',
    application_label='category',
    is_system_app='bool',
)

# Store the dtype mapping in the model's metadata document.
model_metadata.update_one({}, {'$set': {'mappings': mappings}})

In [None]:
# Running totals accumulated across all input files.
total_rows, last_id = 0, 0
# total_memory_usage covers the whole run; memory_usage_split only the page in progress.
total_memory_usage = memory_usage_split = 0.0

# Paging state: index of the next page and the frames collected for it.
page, chunks = 0, []

# Columns read from every CSV file.
features = ['sample_id', 'name', 'application_label', 'is_system_app']

# Object-dtype columns among the features, detected on the sample frame loaded earlier.
sample_features = df[features]
string_columns = sample_features.select_dtypes(include='object').columns.to_list()

In [None]:
# Report how many input files are about to be processed.
n_csv_files = len(csv_files)
print('Total csv files: {}'.format(n_csv_files))

In [None]:
for filepath in csv_files:
    # Load only the columns of interest from this file.
    df = pd.read_csv(filepath, usecols=features, sep=';')

    # Discard every row that has at least one missing value.
    df = df.dropna()

    # Normalise the text columns: lowercase, then trim surrounding whitespace.
    for column in string_columns:
        df.loc[:, column] = [value.lower().strip() for value in df[column].tolist()]

    # The domain is the part of `name` that precedes the first ':'.
    df['domain'] = [name.partition(':')[0] for name in df['name'].values]

    # `name` is not kept in the output once the domain has been extracted.
    df = df.drop(columns=['name'])

    # Cast the remaining columns to the dtypes declared in `mappings`.
    df = df.astype(mappings)

    # Bookkeeping: rows kept so far and memory used overall / by the current page.
    total_rows += df.shape[0]
    page_memory_usage = memory_usage(df)
    total_memory_usage += page_memory_usage
    memory_usage_split += page_memory_usage

    chunks.append(df)

    # Flush the collected chunks as a page once the configured threshold is reached.
    if memory_usage_split >= config['MEMORY_USAGE_SPLIT']:
        print('Page {} created!'.format(page))
        page = save_df_page(page, chunks, model_metadata)

        # Drop the flushed frames and reclaim their memory before continuing.
        chunks.clear()
        del chunks
        gc.collect()

        # Start the next page from an empty state.
        chunks, memory_usage_split = [], 0.0

    print('{} ✔'.format(filepath))

In [None]:
# Flush whatever is left after the last file as the final page
# (these chunks never reached the memory threshold in the main loop).
if chunks:
    # Keep `page` in sync, as the main loop does, so it ends up equal to
    # the number of pages written.
    page = save_df_page(page, chunks, model_metadata)

    # Rebind to an empty list instead of deleting the name: re-running this
    # cell is then a harmless no-op rather than a NameError.
    chunks = []
    gc.collect()

In [None]:
# Store the final row count in the model's metadata document.
total_update = {'$set': {'total': total_rows}}
model_metadata.update_one({}, total_update)

In [None]:
# Display the final totals: (rows kept, accumulated memory usage).
(total_rows, total_memory_usage)

In [None]:
# Summary message posted to the Discord webhook once pre-processing is done.
content = '**{}** | task: data pre-processing, rows: {}, memory usage: {} MB'.format(
    model, total_rows, total_memory_usage)

# requests applies no timeout by default, so an unresponsive webhook would
# block the kernel indefinitely; bound the wait explicitly.
requests.post(config['discord_webhook_url'], data={'content': content}, timeout=10)