In [3]:
###################################################################################
#                                                                                 #
#                EXTRACTING LOCATION-BASED DATA FROM XRAIN DATA                   #
#                                                                                 #
#                             Christopher Gomez, 2024                             #
#                                                                                 #
###################################################################################

import pandas as pd
import glob
import os
from datetime import datetime
import numpy as np

def extract_rainfall_timeseries_from_csv(
    data_folder,
    location,
    output_file,
    location_type='coords',  # 'coords' or 'rowcol'
    northwest_corner=(37.7584, 136.0218),
    southeast_corner=(36.2813, 137.8781),
    n_rows=709,
    n_cols=594
):
    """
    Extract the rainfall time series of one grid cell from a folder of CSV
    grids and save it as a CSV file.

    Each input file is read as a header-less grid of ``n_rows`` x ``n_cols``
    values, and the first 13 characters of its name are parsed as a
    ``YYYYMMDD-HHMM`` timestamp. Files that cannot be used are reported on
    stdout and skipped; they never abort the run.

    Parameters:
    -----------
    data_folder : str
        Path to folder containing the CSV files
    location : tuple
        Either (lat, lon) or (row, col) depending on location_type
    output_file : str
        Path for output CSV file. If it lies inside data_folder it is
        ignored as an input.
    location_type : str
        'coords' for geographic coordinates, 'rowcol' for row/column indices
    northwest_corner : tuple
        (lat, lon) of northwest corner of the grid
    southeast_corner : tuple
        (lat, lon) of southeast corner of the grid
    n_rows : int
        Number of rows in the grid
    n_cols : int
        Number of columns in the grid

    Returns:
    --------
    pandas.DataFrame
        One row per processed file, sorted by time, with the columns
        datetime, rainfall, latitude, longitude, row, col.

    Raises:
    -------
    ValueError
        If location_type is not 'coords' or 'rowcol', if n_rows / n_cols is
        not positive, or if a 'rowcol' location is not a pair of integers
        inside the grid.
    """
    if location_type not in ('coords', 'rowcol'):
        raise ValueError(
            f"location_type must be 'coords' or 'rowcol', got {location_type!r}"
        )
    if n_rows <= 0 or n_cols <= 0:
        raise ValueError(
            f"n_rows and n_cols must be positive, got {n_rows} and {n_cols}"
        )

    # Size of one grid cell in degrees (rows run north -> south, columns west -> east)
    lat_step = (northwest_corner[0] - southeast_corner[0]) / n_rows
    lon_step = (southeast_corner[1] - northwest_corner[1]) / n_cols

    if location_type == 'coords':
        lat, lon = location
        row = int((northwest_corner[0] - lat) / lat_step)
        col = int((lon - northwest_corner[1]) / lon_step)
        # Coordinates outside the grid are clamped to the nearest edge cell
        row = max(0, min(row, n_rows - 1))
        col = max(0, min(col, n_cols - 1))
    else:
        row, col = location
        if row != int(row) or col != int(col):
            raise ValueError(f"row and col must be integers, got {location!r}")
        row, col = int(row), int(col)
        # Reject out-of-range indices here: a negative index would otherwise
        # wrap around in .iloc and silently return a different cell.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            raise ValueError(
                f"(row, col) = ({row}, {col}) is outside the "
                f"{n_rows} x {n_cols} grid"
            )
        # Report the coordinates of the cell centre for reference
        lat = northwest_corner[0] - (row * lat_step) - (lat_step / 2)
        lon = northwest_corner[1] + (col * lon_step) + (lon_step / 2)

    # Get list of all CSV files in the folder
    csv_files = sorted(glob.glob(os.path.join(data_folder, '*.csv')))

    # The output of a previous run may sit in the same folder as the inputs;
    # it must not be read back as a rainfall grid.
    output_path = os.path.normcase(os.path.abspath(output_file))

    timestamps = []
    values = []

    print(f"Processing data for location - Row: {row}, Col: {col} (Lat: {lat:.4f}, Lon: {lon:.4f})")

    for csv_file in csv_files:
        if os.path.normcase(os.path.abspath(csv_file)) == output_path:
            continue

        # Extract timestamp from filename (assuming format YYYYMMDD-HHMM)
        try:
            timestamp = datetime.strptime(os.path.basename(csv_file)[:13], '%Y%m%d-%H%M')
        except ValueError:
            print(f"Skipping file {csv_file}: name does not start with a YYYYMMDD-HHMM timestamp")
            continue

        # pandas parser/empty-data/decoding errors are all ValueError subclasses
        try:
            grid = pd.read_csv(csv_file, header=None)
        except (OSError, ValueError) as e:
            print(f"Error processing file {csv_file}: {str(e)}")
            continue

        if grid.shape != (n_rows, n_cols):
            print(f"Warning: CSV file {csv_file} has unexpected dimensions: {grid.shape}")
            continue

        timestamps.append(timestamp)
        values.append(grid.iloc[row, col])

    # Create DataFrame
    df = pd.DataFrame({'datetime': timestamps, 'rainfall': values})

    # Add location information (constant for every time step)
    df['latitude'] = lat
    df['longitude'] = lon
    df['row'] = row
    df['col'] = col

    # Sort by datetime and fix the column order of the output file
    df = df.sort_values('datetime')
    df = df[['datetime', 'rainfall', 'latitude', 'longitude', 'row', 'col']]

    # Save to CSV
    df.to_csv(output_file, index=False)

    print(f"Time series data saved to {output_file}")
    print(f"Number of time steps processed: {len(df)}")

    return df

# Example usage: extract the time series of one grid cell, addressed either
# by geographic coordinates or by row/column number.
if __name__ == "__main__":
    # Folder holding the CSV grids (current directory here, adjust as needed)
    input_dir = os.getcwd()

    # Cell addressed by (lat, lon) -- example coordinates
    timeseries_coords = extract_rainfall_timeseries_from_csv(
        input_dir,
        (37.5, 136.5),
        'rainfall_timeseries_coords.csv',
        'coords',
    )

    # Cell addressed by (row, col) -- example indices
    timeseries_rowcol = extract_rainfall_timeseries_from_csv(
        input_dir,
        (350, 300),
        'rainfall_timeseries_rowcol.csv',
        'rowcol',
    )

Processing data for location - Row: 124, Col: 153 (Lat: 37.5000, Lon: 136.5000)
Time series data saved to rainfall_timeseries_coords.csv
Number of time steps processed: 71
Processing data for location - Row: 350, Col: 300 (Lat: 37.0282, Lon: 136.9609)
Error processing file C:\Users\kaiki\Downloads\202409190000-202409210000-10-THK-136.0218-37.7584-137.8781-36.2813\0920till2350\rainfall_timeseries_coords.csv: time data 'rainfall_time' does not match format '%Y%m%d-%H%M'
Time series data saved to rainfall_timeseries_rowcol.csv
Number of time steps processed: 71
