In [1]:
import os
import xarray as xr
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.interpolate import UnivariateSpline

# Dataset split to process; it selects both the data folder and the file name.
MODE = "test"

# Running list of row labels scheduled for removal from `df`.
ids_to_eliminate = list()

# Read the complete track table for the selected split.
df = pd.read_csv(
    filepath_or_buffer=f'data/{MODE}_2001_over34/ibtracs_2001_{MODE}_all.csv'
)

In [2]:
def find_large_gaps(tc_df, max_gap=4):
    """Return the index labels of rows lying in WMO_WIND gaps too large to fill.

    A row is considered "missing" when its ``WMO_WIND`` equals 0 (NaN compares
    unequal to 0 and is therefore treated as a present value). Two kinds of
    missing rows are flagged:

    * every row before the first non-zero value, and
    * every row strictly between two consecutive non-zero values that are more
      than ``max_gap`` rows apart (with the default of 4: four or more
      consecutive zero rows).

    Missing rows after the last non-zero value are not flagged.

    Gaps are measured in row positions and the returned values are labels
    taken from ``tc_df.index``, so the result is always safe to pass to
    ``DataFrame.drop`` even when the index is not a contiguous integer range.

    Parameters
    ----------
    tc_df : pandas.DataFrame
        Frame containing a ``WMO_WIND`` column.
    max_gap : int, default 4
        Largest allowed distance, in rows, between two consecutive non-zero
        values; anything wider is flagged for removal.

    Returns
    -------
    list
        Index labels of the flagged rows, in row order. If the frame holds no
        non-zero value at all (including an empty frame), every label is
        returned, because every row then precedes the "first" valid value.
    """
    labels = tc_df.index
    value_positions = np.flatnonzero((tc_df['WMO_WIND'] != 0).to_numpy())

    # Nothing usable in this frame: all rows count as leading missing values.
    if value_positions.size == 0:
        return labels.tolist()

    flagged = np.zeros(len(labels), dtype=bool)

    # Leading rows that come before the first actual value.
    flagged[:value_positions[0]] = True

    # Pair each valid row with the next valid row; pairing via zip stops one
    # short of the end, so no out-of-range lookup is possible.
    for start, stop in zip(value_positions[:-1], value_positions[1:]):
        if stop - start > max_gap:
            flagged[start + 1:stop] = True

    return labels[flagged].tolist()

In [3]:
# Record the labels of rows sitting in oversized gaps, then build a copy of
# the table without them.
ids_to_eliminate += find_large_gaps(df)
df_clean = df.drop(index=ids_to_eliminate)

In [4]:
def interpolate_missing_values(cyclone_df):
    """Fill missing ``WMO_WIND`` values (encoded as 0) and return a new frame.

    Steps:

    1. Zeros are turned into NaN.
    2. NaNs that lie between two known values are filled by PCHIP
       interpolation.
    3. NaNs before the first / after the last known value are filled with the
       nearest known value (back-fill, then forward-fill).
    4. The column is rounded and cast to ``int``.

    Parameters
    ----------
    cyclone_df : pandas.DataFrame
        Frame containing a numeric ``WMO_WIND`` column. It is not modified.
        PCHIP uses the index values as the x-axis, so pandas requires a
        numeric (or datetime), monotonically increasing index.

    Returns
    -------
    pandas.DataFrame
        A copy of ``cyclone_df`` whose ``WMO_WIND`` column is integer-typed
        and has the missing values filled. If the frame contains no known
        value at all, the column is left as zeros.
    """
    # Work on a copy so the caller's frame (or a groupby slice) is untouched.
    cyclone_df = cyclone_df.copy()
    wind = cyclone_df['WMO_WIND'].replace(0, np.nan)

    # PCHIP needs at least two known points; with fewer there is nothing to
    # interpolate between and the edge filling below does all the work.
    if wind.isna().any() and wind.notna().sum() >= 2:
        # limit_area='inside' keeps PCHIP from extrapolating the cubic past
        # the last known value; edge NaNs are left for bfill/ffill instead.
        wind = wind.interpolate(method='pchip', limit_area='inside')

    # Leading / trailing gaps take the nearest known value.
    wind = wind.bfill().ffill()

    # No known value anywhere: restore the original 0 placeholder so the
    # integer cast below cannot fail on NaN.
    wind = wind.fillna(0)

    cyclone_df['WMO_WIND'] = wind.round().astype(int)
    return cyclone_df

In [5]:
# Interpolate each SID group on its own, then stitch the pieces back together
# under a fresh 0..n-1 index. The groups are iterated explicitly instead of
# going through groupby(...).apply(...): recent pandas versions deprecate (and
# later remove) passing the grouping column to the applied function, which
# would drop 'SID' from the result. Iteration always yields complete rows and
# visits the groups in the same sorted-key order.
interpolated_groups = [
    # .copy() hands the helper an independent frame, never a view of df_clean.
    interpolate_missing_values(sid_rows.copy())
    for _sid, sid_rows in df_clean.groupby('SID')
]
df_clean_interp = (
    pd.concat(interpolated_groups, ignore_index=True)
    if interpolated_groups
    else df_clean.iloc[0:0].copy()  # no groups: empty frame, same columns
)

In [6]:
# Keep only the rows whose interpolated WMO_WIND is 34 or higher.
over = df_clean_interp.loc[df_clean_interp['WMO_WIND'].ge(34)]

In [7]:
# Save the filtered rows next to the input file; the frame's index is written
# as the first, unnamed column (pandas' default for to_csv).
over.to_csv(
    path_or_buf=f'data/{MODE}_2001_over34/ibtracs_2001_{MODE}_over34.csv'
)