# Data Chunking

#### This script converts the stock data into manageable chunks and organizes them into folders on a yearly and quarterly basis. The data can be saved as a CSV file or, alternatively, as a pickle data file.

Load the necessary Python libraries.

In [1]:
import os
import pandas as pd

In [3]:
#!/usr/bin/env python
#title           :Stata_data_file_chunking.py
#description     :Splits a Stata .dta file into per-year, per-quarter chunks saved to disk.
#author          :Emmanuel Mensah Boateng
#date            :20240915
#version         :1.0
#usage           :python Stata_data_file_chunking.py
#notes           :
#==============================================================================


# Path to the Stata .dta file that will be read in chunks
file_path = '../Data/dsws_variables_monthly_basic_2021.dta'

# Output directory for results
output_dir = '../Data/dsws_Data/'
os.makedirs(output_dir, exist_ok=True)  # Create directory if nonexistent

# Number of rows read from the .dta file per chunk (100,000 rows at a time)
chunk_size = 100000

# Function to extract the year and quarter from the data
def extract_year_quarter(df, date_column):
    """Add 'year' and 'quarter' columns derived from a date column.

    Args:
        df: DataFrame to annotate. It is modified in place.
        date_column: Name of the column holding the dates; its values must
            be parseable by ``pd.to_datetime``.

    Returns:
        The same DataFrame object, now carrying a 'year' column (calendar
        year) and a 'quarter' column of strings such as '2021Q3'.
    """
    # Parse once, and from the column the caller named rather than a
    # hard-coded 'date' column.
    dates = pd.to_datetime(df[date_column])

    df['year'] = dates.dt.year
    # Store the quarter as a plain string rather than a Period to avoid
    # issues when the frame is grouped and serialized later.
    df['quarter'] = dates.dt.to_period('Q').astype(str)

    return df

# Function to extract the year and quarter from the data
def extract_year_quarter(df, date_column):
    """Return a copy of ``df`` with 'year' and 'quarter' columns added.

    The input frame is left untouched, so this is safe to call on a filtered
    slice without triggering SettingWithCopyWarning.

    Args:
        df: Source DataFrame.
        date_column: Name of the column holding the dates; its values must
            be parseable by ``pd.to_datetime``.

    Returns:
        A new DataFrame with every original column plus 'year' (calendar
        year) and 'quarter' (strings such as '2021Q3').
    """
    # Parse the dates a single time and derive both columns from that result.
    # Parsing before any assignment also keeps the function correct when
    # ``date_column`` is itself named 'year' or 'quarter'.
    dates = pd.to_datetime(df[date_column])

    # ``assign`` copies the frame and adds both columns in one step.
    return df.assign(
        year=dates.dt.year,
        quarter=dates.dt.to_period('Q').astype(str),
    )

# Pickle files already written during this run. A single (year, quarter) pair
# can show up in several chunks; without this bookkeeping each later chunk
# would overwrite the file and silently drop the rows saved earlier.
# Files left over from a previous run are still replaced on first touch.
written_files = set()

# Read the .dta file in chunks. Using the reader as a context manager closes
# the underlying file handle even if processing fails part-way through.
with pd.read_stata(file_path, chunksize=chunk_size) as chunk_iter:
    # Process each chunk
    for i, chunk in enumerate(chunk_iter):
        print(f'Processing chunk {i+1}...')

        # Keep only the rows whose 'country' column has the value 'usa'
        chunk = chunk.loc[chunk['country'] == 'usa']

        # If no rows are left after filtering, skip this chunk
        if chunk.empty:
            continue

        # Add the 'year' and 'quarter' columns derived from the 'date' column
        chunk = extract_year_quarter(chunk, 'date')  # Update 'date' to your actual date column

        # Group the chunk by year and quarter; each group goes to its own Pickle file
        for (year, quarter), group in chunk.groupby(['year', 'quarter']):
            # Create a subdirectory for each year
            year_dir = os.path.join(output_dir, str(year))
            os.makedirs(year_dir, exist_ok=True)

            # quarter looks like '2021Q3'; its last character is the quarter number
            pickle_filename = os.path.join(year_dir, f'{year}_Q{quarter[-1]}.pkl')

            # Extend the rows saved by earlier chunks instead of replacing them.
            # Only files produced by this very run are read back.
            if pickle_filename in written_files:
                group = pd.concat([pd.read_pickle(pickle_filename), group])
            written_files.add(pickle_filename)

            # Save the accumulated group data to the Pickle file
            group.to_pickle(pickle_filename)

print("Data processing and saving to Pickle files complete.")

Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Processing chunk 42...
Processing chunk 43...
Processing chunk 44.

Chunking data into HDF5 files

In [None]:
import os
import pandas as pd

# Path to the large Stata .dta file to be chunked (replace this with your actual file path)
file_path = '../Data/dsws_variables_monthly_basic_2021.dta'

# Directory in which the data, organized by year and quarter, is saved
output_dir = '../Data/dsws_Data/'  # Update this
os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist

# Number of rows read from the .dta file per chunk (100,000 rows at a time)
chunk_size = 100000

# Function to extract the year and quarter from the data
def extract_year_quarter(df, date_column):
    """Add 'year' and 'quarter' columns derived from a date column.

    Args:
        df: DataFrame to annotate. It is modified in place.
        date_column: Name of the column holding the dates; its values must
            be parseable by ``pd.to_datetime``.

    Returns:
        The same DataFrame object (not a copy), now carrying a 'year' column
        (calendar year) and a 'quarter' column of strings such as '2021Q3'.
    """
    # Parse the dates a single time, before any column is assigned. This
    # halves the parsing work per chunk and keeps the result correct even
    # when ``date_column`` is itself named 'year'.
    dates = pd.to_datetime(df[date_column])

    df['year'] = dates.dt.year
    # Store the quarter as a string rather than a Period to avoid issues
    # with HDF5.
    df['quarter'] = dates.dt.to_period('Q').astype(str)

    return df

# HDF5 files already written during this run. A single (year, quarter) pair
# can show up in several chunks, and put() replaces whatever is stored under
# the key, so without this bookkeeping only the rows of the last chunk
# touching a quarter would survive.
written_files = set()

# Read the .dta file in chunks. Using the reader as a context manager closes
# the underlying file handle even if processing fails part-way through.
with pd.read_stata(file_path, chunksize=chunk_size) as chunk_iter:
    # Process each chunk
    for i, chunk in enumerate(chunk_iter):
        print(f'Processing chunk {i+1}...')

        # Add the 'year' and 'quarter' columns derived from the 'date' column
        chunk = extract_year_quarter(chunk, 'date')  # Update 'date' to your actual date column

        # Group the chunk by year and quarter; each group goes to its own HDF5 file
        for (year, quarter), group in chunk.groupby(['year', 'quarter']):
            # Create a subdirectory for each year
            year_dir = os.path.join(output_dir, str(year))
            os.makedirs(year_dir, exist_ok=True)

            # quarter looks like '2021Q3'; its last character is the quarter number
            hdf5_filename = os.path.join(year_dir, f'{year}_Q{quarter[-1]}.h5')

            # Define the key for the HDF5 file
            hdf_key = f'year_{year}_Q{quarter[-1]}'

            # The 'fixed' format cannot be appended to, so rows saved by
            # earlier chunks of this run are read back and combined with the
            # new ones before the key is rewritten.
            if hdf5_filename in written_files:
                group = pd.concat([pd.read_hdf(hdf5_filename, key=hdf_key), group])
            written_files.add(hdf5_filename)

            # Each write carries the full data for this quarter, so the file
            # is recreated (mode='w'); HDF5 does not reclaim the space of
            # replaced nodes, and mode='a' would grow the file on every chunk.
            with pd.HDFStore(hdf5_filename, mode='w') as hdf_store:
                hdf_store.put(hdf_key, group, format='fixed')  # Use 'fixed' format for better performance

print("Data processing and saving to HDF5 files complete.")