In [9]:
import os
import sys

from tqdm import tqdm
from time import time

from os.path import abspath

In [10]:
# Put the parent directory of this notebook's working directory at the front of
# sys.path so the local `src` package can be imported.
PROJECT_ROOT = abspath('..')
if PROJECT_ROOT not in sys.path:
    # Guard so that re-running this cell does not stack duplicate entries.
    sys.path.insert(0, PROJECT_ROOT)

# Import custom modules.
# NOTE(review): this star import appears to be what brings `pd`,
# `makedir_check` and `reduce_mem_usage` into scope for the cells below (no
# other import of them is visible in this notebook) — consider importing them
# explicitly.
from src.utils import *

In [11]:
# Author: Ayush Bihani
# Source: https://www.kaggle.com/code/hsuyab/fast-loading-high-compression-with-feather/notebook

In [12]:
# Recursively gather the path of every file found under the raw-data folder.
file_names = [
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk("../data/raw")
    for fname in fnames
]

In [13]:
# Keep the paths that contain 'train_' but not 'dtypes' (this drops the
# train_dtypes description file), ordered lexicographically.
train_paths = sorted(
    path
    for path in file_names
    if 'train_' in path and 'dtypes' not in path
)

In [14]:
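# Scratch exploration (not executed): tries the per-(game_num, event_id) 10% sampling on the first training file.
# df_test = pd.read_csv(train_paths[0])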
# df_test = reduce_mem_usage(df_test)
# 
# df_test.head()
# df_test.game_num.nunique()
# df_test.groupby(["game_num", "event_id"])[["event_time"]].count() # .describe()
# df_test[(df_test.game_num == 1) & (df_test.event_id == 1002)].event_time.abs().round(-1).value_counts().sort_index()
# 
# df_sampled = df_test.groupby(["game_num", "event_id"], group_keys=False).apply(lambda x: x.sample(frac=0.1))
# df_sampled.groupby(["game_num", "event_id"])[["event_time"]].count()
# df_sampled[(df_sampled.game_num == 1) & (df_sampled.event_id == 1002)].event_time.abs().round(-1).value_counts().sort_index()

In [15]:
# Fraction of rows kept from every (game_num, event_id) group; 1 keeps everything.
SAMPLE_FRAC = 0.1
# Fixed seed so the per-group subsample is the same on every run.
SAMPLE_SEED = 42
# Folder that receives one compressed feather file per training CSV.
FEATHER_DIR = "../data/interim/feather_data"

# NOTE(review): makedir_check comes from src.utils and is not visible here.
# Judging by the recorded output it creates the directory when it is missing and
# the conversion ran in that case; presumably it returns a truthy value when the
# directory already exists, so the conversion is skipped on re-runs — confirm.
if not makedir_check(FEATHER_DIR):

    # Per-file measurements, keyed by the CSV file name without its extension.
    time_read = {}
    size_file = {}

    for file_path in tqdm(train_paths):

        times = {}
        sizes = {}

        # e.g. ".../train_0.csv" -> stem "train_0" -> "train_0_compressed.ftr".
        # basename/splitext work regardless of the path separator used by
        # os.path.join and do not truncate stems that happen to contain "_c".
        file_only = os.path.splitext(os.path.basename(file_path))[0]
        name_files_ftr = file_only + "_compressed.ftr"
        feather_path = os.path.join(FEATHER_DIR, name_files_ftr)

        # Time only the CSV read itself, not the sampling that follows.
        t1 = time()
        df = pd.read_csv(file_path)
        times["read_csv"] = time() - t1

        if SAMPLE_FRAC < 1:
            # Sample inside each (game_num, event_id) group so every event
            # keeps the same share of its rows.
            df = (
                df
                .groupby(["game_num", "event_id"], group_keys=False)
                .apply(lambda x: x.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED))
                .reset_index(drop=True)
            )

        # In-memory size of the (possibly sampled) frame before dtype reduction.
        sizes["memory_csv"] = df.memory_usage(deep=True).sum() / (1024**2)
        df = reduce_mem_usage(df)

        # save to feather
        df.to_feather(feather_path)

        # Time only the feather read; the memory measurement is taken afterwards.
        t1 = time()
        df = pd.read_feather(feather_path)
        times["read_feather"] = time() - t1
        sizes["memory_feather"] = df.memory_usage(deep=True).sum() / (1024**2)

        # store size and time for a particular file
        time_read[file_only] = times
        size_file[file_only] = sizes

    # One row per file. Renaming by key (instead of assigning .columns
    # positionally) also works when no file was processed.
    time_taken_read = (
        pd.DataFrame(time_read).T
        .round(3)
        .rename(columns={
            "read_csv": 'time_to_read_csv(seconds)',
            "read_feather": 'time_to_read_feather(seconds)',
        })
    )

    size_of_files = (
        pd.DataFrame(size_file).T
        .round(2)
        .rename(columns={
            "memory_csv": 'memory_csv(MB)',
            "memory_feather": 'memory_feather(MB)',
        })
    )

    # A bare expression inside an `if` block is not rendered by the notebook,
    # so show both summary tables explicitly.
    display(time_taken_read, size_of_files)

The directory, ../data/interim/feather_data, has been created!


  0%|          | 0/10 [00:00<?, ?it/s]

Memory usage of dataframe is 100.03 MB
Memory usage after optimization is: 24.60 MB
Decreased by 75.4%


 10%|█         | 1/10 [00:34<05:10, 34.47s/it]

Memory usage of dataframe is 100.94 MB


 20%|██        | 2/10 [01:07<04:30, 33.81s/it]

Memory usage after optimization is: 24.82 MB
Decreased by 75.4%
Memory usage of dataframe is 98.92 MB


 30%|███       | 3/10 [01:40<03:53, 33.34s/it]

Memory usage after optimization is: 24.32 MB
Decreased by 75.4%
Memory usage of dataframe is 98.41 MB


 40%|████      | 4/10 [02:14<03:20, 33.43s/it]

Memory usage after optimization is: 24.20 MB
Decreased by 75.4%
Memory usage of dataframe is 98.56 MB


 50%|█████     | 5/10 [02:48<02:48, 33.74s/it]

Memory usage after optimization is: 24.24 MB
Decreased by 75.4%
Memory usage of dataframe is 97.44 MB


 60%|██████    | 6/10 [03:21<02:13, 33.44s/it]

Memory usage after optimization is: 23.96 MB
Decreased by 75.4%
Memory usage of dataframe is 96.06 MB


 70%|███████   | 7/10 [03:53<01:38, 32.99s/it]

Memory usage after optimization is: 23.62 MB
Decreased by 75.4%
Memory usage of dataframe is 98.17 MB


 80%|████████  | 8/10 [04:26<01:05, 32.92s/it]

Memory usage after optimization is: 24.14 MB
Decreased by 75.4%
Memory usage of dataframe is 100.12 MB


 90%|█████████ | 9/10 [04:59<00:32, 32.92s/it]

Memory usage after optimization is: 24.62 MB
Decreased by 75.4%
Memory usage of dataframe is 97.87 MB


100%|██████████| 10/10 [05:32<00:00, 33.27s/it]

Memory usage after optimization is: 24.07 MB
Decreased by 75.4%





In [16]:
# Column -> dtype mapping read from the dtypes description file. Bracket access
# avoids confusion with the DataFrame's own `dtypes` attribute.
dtypes_df = pd.read_csv("../data/raw/train_dtypes.csv")
dtypes = dict(zip(dtypes_df["column"], dtypes_df["dtype"]))

# Derive each feather file name from its source CSV instead of assuming the
# shards are numbered train_0 .. train_{n-1}.
feather_paths = [
    "../data/interim/feather_data/"
    + os.path.splitext(os.path.basename(csv_path))[0]
    + "_compressed.ftr"
    for csv_path in train_paths
]

# Read every shard, cast it to the declared dtypes, and concatenate once;
# calling pd.concat inside the loop re-copies the accumulated frame each time.
frames = [pd.read_feather(path).astype(dtypes) for path in feather_paths]

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

In [17]:
# Run the src.utils memory-reduction helper on the combined frame; it prints the
# memory usage before and after (see the cell output).
# NOTE(review): `df` is rebound in place, so re-running this cell feeds the
# already-reduced frame back into the helper.
df = reduce_mem_usage(df)

Memory usage of dataframe is 450.80 MB
Memory usage after optimization is: 258.76 MB
Decreased by 42.6%


In [18]:
# Run a full garbage-collection pass; the displayed value is the return value
# of gc.collect().
import gc
gc.collect()

0

In [19]:
# Write the combined sample to a single feather file whose name records the
# sampling percentage.
sample_pct = round(SAMPLE_FRAC*100)
sample_path = f"../data/interim/sample_{sample_pct}perc_train_compressed.ftr"
# The index is reset to a fresh 0..n-1 range before writing.
df.reset_index(drop=True).to_feather(sample_path)