In [1]:
import gc
import os
import sys
from os.path import abspath
from time import time

import pandas as pd
from tqdm import tqdm

In [2]:
# Make the repository root (the parent of this notebook's working directory)
# importable so the project's `src` package can be used from here.
project_root = abspath('..')

# Guarded so that re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import only the helpers this notebook calls, instead of a star import,
# so it is clear where each name comes from.
from src.utils import makedir_check, reduce_mem_usage

In [3]:
# Author: Ayush Bihani
# Source: https://www.kaggle.com/code/hsuyab/fast-loading-high-compression-with-feather/notebook

In [4]:
# Recursively list every file under the raw-data directory as a joined path.
file_names = [
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk("../data/raw")
    for fname in fnames
]

In [5]:
# Keep the training shards: paths containing 'train_' but not 'dtypes'
# (which excludes the train_dtypes.csv schema file).
train_paths = [
    path
    for path in file_names
    if 'train_' in path and 'dtypes' not in path
]

train_paths

['../data/raw/train_9.csv',
 '../data/raw/train_4.csv',
 '../data/raw/train_5.csv',
 '../data/raw/train_0.csv',
 '../data/raw/train_1.csv',
 '../data/raw/train_2.csv',
 '../data/raw/train_3.csv',
 '../data/raw/train_6.csv',
 '../data/raw/train_8.csv',
 '../data/raw/train_7.csv']

In [6]:
# Convert every raw training CSV into a memory-reduced Feather file and record,
# per file, how long each format takes to load and how much memory it occupies.
# NOTE(review): makedir_check comes from src.utils. Judging by the recorded
# output it prints "The directory already exists!" and presumably returns True
# in that case, so this conversion only runs on the first pass - confirm.
if not makedir_check("../data/interim/feather_data"):

    time_read = {}
    size_file = {}

    for file_path in tqdm(train_paths):

        # per-file measurements
        times = {}
        sizes = {}

        # "…/train_0.csv" -> "train_0" -> "train_0_compressed.ftr".
        # os.path is used because the paths were built with os.path.join,
        # so the separator is not guaranteed to be "/".
        file_only = os.path.splitext(os.path.basename(file_path))[0]
        name_files_ftr = file_only + "_compressed.ftr"
        feather_path = "../data/interim/feather_data/" + name_files_ftr

        # Time only the CSV parse; the deep memory scan is done after the
        # clock is stopped so it does not inflate the read time.
        t1 = time()
        df = pd.read_csv(file_path)
        times["read_csv"] = time() - t1
        sizes["memory_csv"] = df.memory_usage(deep=True).sum() / (1024**2)

        # shrink with the project helper, then save to feather
        df = reduce_mem_usage(df)
        df.to_feather(feather_path)

        # Time only the Feather load, for the same reason as above.
        t1 = time()
        df = pd.read_feather(feather_path)
        times["read_feather"] = time() - t1
        sizes["memory_feather"] = df.memory_usage(deep=True).sum() / (1024**2)

        # store size and time for a particular file
        time_read[file_only] = times
        size_file[file_only] = sizes

    # One row per file. Columns are renamed by key rather than assigned
    # positionally: only two measurements exist per file, so a positional
    # list of three labels raises a length-mismatch ValueError.
    time_taken_read = (
        pd.DataFrame(time_read).T
        .round(3)  # seconds, 3 decimal places
        .rename(columns={
            "read_csv": "time_to_read_csv(seconds)",
            "read_feather": "time_to_read_feather(seconds)",
        })
    )

    size_of_files = (
        pd.DataFrame(size_file).T
        .round(2)  # MB, 2 decimal places
        .rename(columns={
            "memory_csv": "memory_csv(MB)",
            "memory_feather": "memory_feather(MB)",
        })
    )

    # A bare expression inside an `if` block is not rendered by the notebook,
    # so show the table explicitly (`display` is a builtin in Jupyter kernels).
    display(size_of_files)

The directory already exists!


In [7]:
# Column -> dtype mapping taken from the schema file shipped with the raw data.
dtypes_df = pd.read_csv("../data/raw/train_dtypes.csv")
dtypes = dict(zip(dtypes_df.column, dtypes_df.dtype))

# Load the ten compressed shards, cast each to the declared dtypes, and
# concatenate once. Concatenating inside the loop re-copies the accumulated
# frame on every iteration, and seeding it with an empty DataFrame is
# unnecessary.
df = pd.concat(
    [
        pd.read_feather(
            f"../data/interim/feather_data/train_{i}_compressed.ftr"
        ).astype(dtypes)
        for i in range(10)
    ]
)

In [8]:
# Pass the combined frame through the project's reduce_mem_usage helper and
# rebind `df` to its result; the helper reports memory before/after (see the
# output below).
# NOTE(review): presumably it downcasts column dtypes - confirm in src.utils.
df = reduce_mem_usage(df)

Memory usage of dataframe is 4508.17 MB
Memory usage after optimization is: 2587.65 MB
Decreased by 42.6%


In [9]:
# Force a full garbage-collection pass before writing the combined frame;
# the displayed value is the number of unreachable objects found.
# (`gc` is imported with the other modules in the first cell.)
gc.collect()

0

In [10]:
# Persist the full, memory-reduced training set as a single Feather file.
# reset_index(drop=True) discards the existing index and writes the frame
# with a fresh default one.
df.reset_index(drop=True).to_feather("../data/interim/train_compressed.ftr")