In [12]:
import gc
import os
import sys
from os.path import abspath
from time import time

import pandas as pd
from tqdm import tqdm

In [13]:
# Make the parent of the current working directory importable so the project's
# own `src` package can be resolved from this notebook.
project_root = abspath('..')

# Only insert the entry when it is missing: re-running this cell would otherwise
# keep stacking duplicate copies of the same path at the front of sys.path.
# (This guard replaces the former list-comprehension "check", whose result was
# discarded and therefore had no effect.)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import custom modules
# NOTE(review): the wildcard import hides where later names come from. The helpers
# makedir_check and reduce_mem_usage used further down are presumably defined in
# src.utils -- confirm, and consider importing them explicitly.
from src.utils import *

In [14]:
# Author: Ayush Bihani
# Source: https://www.kaggle.com/code/hsuyab/fast-loading-high-compression-with-feather/notebook

In [15]:
# Walk the raw-data folder recursively and record the joined path of every file found.
file_names = [
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk("../data/raw")
    for entry in entries
]

In [16]:
# Keep the paths that contain 'train_' but not 'dtypes' (which drops the
# train_dtypes file), ordered lexicographically.
train_paths = sorted(
    path
    for path in file_names
    if 'train_' in path and 'dtypes' not in path
)

In [17]:
# Disabled scratch work (nothing in this cell executes): it loads the first training CSV,
# inspects row counts per (game_num, event_id) group, and tries a 10% per-group sample to
# compare the event_time distribution before and after sampling.
# df_test = pd.read_csv(train_paths[0])
# df_test = reduce_mem_usage(df_test)
# 
# df_test.head()
# df_test.game_num.nunique()
# df_test.groupby(["game_num", "event_id"])[["event_time"]].count() # .describe()
# df_test[(df_test.game_num == 1) & (df_test.event_id == 1002)].event_time.abs().round(-1).value_counts().sort_index()
# 
# df_sampled = df_test.groupby(["game_num", "event_id"], group_keys=False).apply(lambda x: x.sample(frac=0.1))
# df_sampled.groupby(["game_num", "event_id"])[["event_time"]].count()
# df_sampled[(df_sampled.game_num == 1) & (df_sampled.event_id == 1002)].event_time.abs().round(-1).value_counts().sort_index()

In [18]:
# Fraction of rows kept from each (game_num, event_id) group; values >= 1 skip the sampling.
SAMPLE_FRAC = 0.01
# Seed passed to DataFrame.sample so the subsample is reproducible.
SEED = 777
# Folder that receives one compressed feather file per raw training CSV.
FEATHER_DIR = "../data/interim/feather_data"

# NOTE(review): makedir_check is not defined in this notebook (presumably it arrives via the
# wildcard import from src.utils). Judging by the recorded output it creates the folder and
# returns a falsy value when the folder was missing, so the conversion below only runs in
# that case -- confirm against the helper.
if not makedir_check(FEATHER_DIR):

    # Per-file measurements, keyed by the CSV file name without its extension.
    time_read = {}
    size_file = {}

    for file_path in tqdm(train_paths):

        times = {}
        sizes = {}

        # Derive the names with os.path instead of splitting on "/" and chaining
        # str.replace: this also works with other path separators and with names
        # that contain extra dots or the substring "csv".
        # e.g. ".../train_0.csv" -> file_only "train_0" -> "train_0_compressed.ftr"
        file_only = os.path.splitext(os.path.basename(file_path))[0]
        name_files_ftr = f"{file_only}_compressed.ftr"
        feather_path = os.path.join(FEATHER_DIR, name_files_ftr)

        # Time the CSV parse only; the stopwatch is stopped before the sampling so the
        # figure reported as "time to read csv" is not inflated by the groupby below.
        t1 = time()
        df = pd.read_csv(file_path)
        times["read_csv"] = time() - t1

        if SAMPLE_FRAC < 1:
            # Stratified subsample: draw the same fraction from every
            # (game_num, event_id) group, re-seeding each draw with SEED.
            df = (
                df
                .groupby(["game_num", "event_id"], group_keys=False)
                .apply(lambda x: x.sample(frac=SAMPLE_FRAC, random_state=SEED))
                .reset_index(drop=True)
            )

        # In-memory footprint (MB) of the (possibly sampled) frame before dtype reduction.
        sizes["memory_csv"] = df.memory_usage(deep=True).sum()/(1024**2)
        df = reduce_mem_usage(df)

        # save to feather
        df.to_feather(feather_path)

        # Time the feather read only, then measure the footprint of what was loaded.
        t1 = time()
        df = pd.read_feather(feather_path)
        times["read_feather"] = time() - t1
        sizes["memory_feather"] = df.memory_usage(deep=True).sum()/(1024**2)

        # store size and time for a particular file
        time_read[file_only] = times
        size_file[file_only] = sizes

    # One row per file. Columns are renamed by key rather than by position so the
    # labels cannot be swapped and an empty train_paths list does not raise.
    time_taken_read = (
        pd.DataFrame(time_read).T
        .round(3)
        .rename(columns={
            "read_csv": "time_to_read_csv(seconds)",
            "read_feather": "time_to_read_feather(seconds)",
        })
    )

    size_of_files = (
        pd.DataFrame(size_file).T
        .round(2)
        .rename(columns={
            "memory_csv": "memory_csv(MB)",
            "memory_feather": "memory_feather(MB)",
        })
    )
    # NOTE(review): this bare expression sits inside the `if`, so the notebook does not
    # render it (the recorded output shows no table). Evaluate size_of_files in its own
    # cell to inspect it.
    size_of_files

The directory, ../data/interim/feather_data, has been created!


  0%|          | 0/10 [00:00<?, ?it/s]

Memory usage of dataframe is 10.01 MB
Memory usage after optimization is: 2.46 MB
Decreased by 75.4%


 20%|██        | 2/10 [00:34<02:18, 17.35s/it]

Memory usage of dataframe is 10.11 MB
Memory usage after optimization is: 2.49 MB
Decreased by 75.4%


 30%|███       | 3/10 [00:51<02:00, 17.20s/it]

Memory usage of dataframe is 9.89 MB
Memory usage after optimization is: 2.43 MB
Decreased by 75.4%


 40%|████      | 4/10 [01:08<01:41, 16.99s/it]

Memory usage of dataframe is 9.83 MB
Memory usage after optimization is: 2.42 MB
Decreased by 75.4%


 50%|█████     | 5/10 [01:25<01:24, 16.89s/it]

Memory usage of dataframe is 9.86 MB
Memory usage after optimization is: 2.43 MB
Decreased by 75.4%


 60%|██████    | 6/10 [01:41<01:06, 16.72s/it]

Memory usage of dataframe is 9.75 MB
Memory usage after optimization is: 2.40 MB
Decreased by 75.4%


 70%|███████   | 7/10 [01:57<00:49, 16.52s/it]

Memory usage of dataframe is 9.61 MB
Memory usage after optimization is: 2.36 MB
Decreased by 75.4%


 80%|████████  | 8/10 [02:14<00:33, 16.65s/it]

Memory usage of dataframe is 9.82 MB
Memory usage after optimization is: 2.42 MB
Decreased by 75.4%


 90%|█████████ | 9/10 [02:31<00:16, 16.79s/it]

Memory usage of dataframe is 10.02 MB
Memory usage after optimization is: 2.46 MB
Decreased by 75.4%
Memory usage of dataframe is 9.80 MB
Memory usage after optimization is: 2.41 MB
Decreased by 75.4%


100%|██████████| 10/10 [02:48<00:00, 16.85s/it]


In [19]:
# Column -> dtype mapping read from the dtypes file that sits next to the raw training CSVs.
dtypes_df = pd.read_csv("../data/raw/train_dtypes.csv")
dtypes = dict(zip(dtypes_df.column, dtypes_df.dtype))

# Load every per-file feather sample and cast it to the reference dtypes. The frames are
# collected in a list and concatenated once: growing a DataFrame with pd.concat inside
# the loop re-copies all accumulated rows on every iteration, and starting from an empty
# DataFrame makes the result depend on how concat treats empty inputs.
# File names follow the train_<i>_compressed.ftr pattern, one per entry of train_paths.
sampled_frames = [
    pd.read_feather(f"../data/interim/feather_data/train_{i}_compressed.ftr").astype(dtypes)
    for i in range(len(train_paths))
]

# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
df = pd.concat(sampled_frames) if sampled_frames else pd.DataFrame()

In [20]:
# Shrink the combined frame again: the astype(dtypes) cast applied while loading the
# feather files presumably widened the columns that had been reduced before saving.
# NOTE(review): reduce_mem_usage is not defined in this notebook (presumably from
# src.utils); per the recorded output it prints the memory footprint before and after.
df = reduce_mem_usage(df)

Memory usage of dataframe is 45.11 MB
Memory usage after optimization is: 25.89 MB
Decreased by 42.6%


In [21]:
# Force a garbage-collection pass now that the per-file frames are no longer needed.
# `gc` is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place. The displayed value is gc.collect()'s
# return value (the number of unreachable objects it found).
gc.collect()

21

In [22]:
# Persist the combined sample; the file name embeds the sampling fraction as a whole percent.
sample_pct = round(SAMPLE_FRAC*100)
sample_path = f"../data/interim/sample_{sample_pct}perc_train_compressed.ftr"

# Feather output is written from a frame with a fresh 0..n-1 index (old index dropped).
df.reset_index(drop=True).to_feather(sample_path)