In [1]:
import os
import yaml
import requests
import numpy as np
import pandas as pd

from pymongo import MongoClient

from utils import get_csv_files, memory_usage

In [2]:
# Project-relative locations; the notebook is expected to run one level below
# the repository root.
BASE_PATH = '../'

CONFIG_DIR = os.path.join(BASE_PATH, 'config')
DATA_DIR = os.path.join(BASE_PATH, 'data')

# Read the environment configuration inside a context manager so the file
# handle is closed as soon as parsing finishes (the bare open(...) call used
# previously left the handle open until garbage collection).
# NOTE(review): FullLoader can construct more than plain scalars/maps/lists;
# if env.yml only holds plain data, yaml.safe_load would be the stricter choice.
with open(os.path.join(CONFIG_DIR, 'env.yml')) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [3]:
# Name of the raw-data sub-folder processed by this notebook.
model = 'devices'

# Glob pattern for every raw CSV of the selected model, resolved by the
# project helper get_csv_files.
raw_csv_glob = os.path.join(DATA_DIR, 'raw', model, '*.csv')
csv_files = get_csv_files(raw_csv_glob)

In [4]:
# Only the first matched file is loaded; fields are semicolon-separated.
first_csv_path = csv_files[0]
df = pd.read_csv(first_csv_path, delimiter=';')

# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            300000 non-null  int64 
 1   model         299999 non-null  object
 2   manufacturer  300000 non-null  object
 3   brand         300000 non-null  object
 4   os_version    300000 non-null  object
 5   is_root       300000 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 13.7+ MB


In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [6]:
# Names of the object-dtype columns; these are the ones normalized below.
object_frame = df.select_dtypes(include=['object'])
string_columns = list(object_frame.columns)

In [27]:
from multiprocessing import Pool


def string_op(series):
    """Return the values of ``series`` lower-cased and stripped, as a list.

    Each element is converted with ``.lower()`` followed by ``.strip()``, so
    every element must provide those methods (i.e. be a string).
    """
    values = series.tolist()
    return list(map(lambda text: text.lower().strip(), values))


def parallelize_dataframe(data, func, processes=4):
    """Apply ``func`` to every item of ``data`` using a pool of worker processes.

    Args:
        data: Iterable of work items; each item is handed to ``func`` on its own.
        func: Callable applied to each item. It is shipped to the worker
            processes, so it has to be picklable (e.g. a module-level function).
        processes: Number of worker processes to start. Defaults to 4, the
            value that used to be hard-coded.

    Returns:
        list: One result per input item, in input order.

    Raises:
        Whatever ``func`` raises in a worker is re-raised here by ``pool.map``.
    """
    pool = Pool(processes)
    try:
        return pool.map(func, data)
    finally:
        # Always release the workers, even when ``func`` fails, so a failing
        # run does not leave orphaned processes behind.
        pool.close()
        pool.join()

# One pandas Series per string column, in the same order as string_columns;
# each Series becomes one work item for the pool.
series = list(map(lambda name: df[name], string_columns))

In [32]:
%%timeit
result = parallelize_dataframe(series, string_op)
for i, column in enumerate(string_columns):
    df.loc[:, column] = result[i]

1.1 s ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
%%timeit
for column in string_columns:
    df.loc[:, column] = [x.lower().strip() for x in df[column].tolist()]

774 ms ± 2.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Target dtype per column, in the frame's column order: a compact unsigned id,
# categoricals for the string columns and a boolean flag for is_root.
mappings = {'id': 'uint32'}
mappings.update(
    dict.fromkeys(['model', 'manufacturer', 'brand', 'os_version'], 'category')
)
mappings['is_root'] = 'bool'

In [9]:
# Cast every column to its target dtype from `mappings`.
df = df.astype(dtype=mappings)

# Re-inspect dtypes and memory footprint after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299999 entries, 0 to 299999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   id            299999 non-null  uint32  
 1   model         299999 non-null  category
 2   manufacturer  299999 non-null  category
 3   brand         299999 non-null  category
 4   os_version    299999 non-null  category
 5   is_root       299999 non-null  bool    
dtypes: bool(1), category(4), uint32(1)
memory usage: 6.7 MB
