# Parquet of the base dataframe

A Parquet file is a columnar storage file format optimized for use with big data processing frameworks, which efficiently compresses and encodes the data in a DataFrame for faster read and write operations.

To write the DataFrame to Parquet we use:
- pandas
- pyarrow (as engine)

Categorical => encode with LabelBinarizer, OneHotEncoder or LabelEncoder (LabelEncoder yields a single column holding a different integer per category: 1, 2, 3, 4, etc.)
- SEASON (4)
- BASIN (7)
- NATURE (6)

Numeric => scaled so that every value lies between 0 and 1
- LAT
- LON
- WIND 
- DIST2LAND
- STORM_SPEED
- STORM_DIR

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the single-header export of the dataset (pyarrow is the Parquet engine)
single_header_path = "../data/ibtracs_single_header.parquet"
df = pd.read_parquet(single_header_path, engine="pyarrow")

In [3]:
# Keep `df` as the untouched source; every transformation below is applied
# to this independent (deep) copy instead
df_ibtracs = df.copy(deep=True)

In [4]:
# Preview the first rows of the working copy
df_ibtracs.head()

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
0,1980001S13173,1980,1,SP,MM,PENI,1980-01-01 00:00:00,TS,-12.5,172.5,...,,,,,,,,,6,350
1,1980001S13173,1980,1,SP,MM,PENI,1980-01-01 03:00:00,TS,-12.2,172.4,...,,,,,,,,,6,350
2,1980001S13173,1980,1,SP,MM,PENI,1980-01-01 06:00:00,TS,-11.9,172.4,...,,,,,,,,,5,360
3,1980001S13173,1980,1,SP,MM,PENI,1980-01-01 09:00:00,TS,-11.7,172.4,...,,,,,,,,,4,10
4,1980001S13173,1980,1,SP,MM,PENI,1980-01-01 12:00:00,TS,-11.5,172.5,...,,,,,,,,,4,20


In [5]:
# Opt in to the future pandas behaviour so .replace does not silently downcast
pd.set_option("future.no_silent_downcasting", True)

# Turn single-space placeholders into NaN, then convert each column to a
# numeric dtype when every remaining value parses; otherwise keep it as is
for column_name in df_ibtracs.columns:
    cleaned = df_ibtracs[column_name].replace(" ", np.nan)
    try:
        coerced = pd.to_numeric(cleaned)
    except ValueError:
        # Not a numeric column: keep the NaN-cleaned values unchanged
        coerced = cleaned
    df_ibtracs[column_name] = coerced

In [6]:
# Keep only the rows that have a TD9636_STAGE value
has_stage = df_ibtracs["TD9636_STAGE"].notna()
df_ibtracs = df_ibtracs.loc[has_stage]

In [7]:
# Parse ISO_TIME into datetime values so date parts (e.g. the month) are accessible
df_ibtracs = df_ibtracs.assign(ISO_TIME=pd.to_datetime(df_ibtracs["ISO_TIME"]))

In [8]:
def get_season(date, latitude):
    """Return the season name for a date at a given latitude.

    Seasons are assigned per block of three calendar months
    (Dec-Feb, Mar-May, Jun-Aug, Sep-Nov) and are mirrored for the
    Southern Hemisphere.

    Parameters
    ----------
    date : object exposing a ``month`` attribute (e.g. ``datetime.date``,
        ``pandas.Timestamp``)
        Only the month is used.
    latitude : number
        ``latitude >= 0`` (equator included) is treated as Northern
        Hemisphere, anything else as Southern Hemisphere. A NaN latitude
        therefore falls in the Southern branch, because ``nan >= 0`` is False.

    Returns
    -------
    str or None
        One of ``"winter"``, ``"spring"``, ``"summer"``, ``"fall"``;
        ``None`` when ``date.month`` is not an integer from 1 to 12.
    """
    # Evaluate the hemisphere first so an invalid latitude fails loudly
    # regardless of the month.
    is_northern = latitude >= 0

    # Season of each month in the Northern Hemisphere.
    northern_by_month = {
        12: "winter", 1: "winter", 2: "winter",
        3: "spring", 4: "spring", 5: "spring",
        6: "summer", 7: "summer", 8: "summer",
        9: "fall", 10: "fall", 11: "fall",
    }
    # Southern Hemisphere seasons are the Northern ones shifted by half a year.
    southern_equivalent = {
        "winter": "summer",
        "spring": "fall",
        "summer": "winter",
        "fall": "spring",
    }

    season = northern_by_month.get(date.month)
    if season is None:
        return None
    return season if is_northern else southern_equivalent[season]

In [9]:
# Overwrite SEASON with the season name derived from each row's
# observation time and latitude
df_ibtracs["SEASON"] = [
    get_season(observed_at, lat)
    for observed_at, lat in zip(df_ibtracs["ISO_TIME"], df_ibtracs["LAT"])
]

In [10]:
# USA_WIND becomes the reference wind column, exposed simply as WIND
df_ibtracs = df_ibtracs.rename(columns={"USA_WIND": "WIND"})

In [11]:
# Rescale the wind values reported by these agencies by a factor of 1.12.
# NOTE(review): 1.12 presumably aligns these agencies' wind measurements with
# the convention used by USA_WIND — confirm the factor and its source.
# Gotcha: re-running this cell multiplies the columns again.
scaled_wind_columns = [
    "BOM_WIND",
    "WELLINGTON_WIND",
    "REUNION_WIND",
    "HKO_WIND",
    "TOKYO_WIND",
]
df_ibtracs[scaled_wind_columns] = df_ibtracs[scaled_wind_columns].mul(1.12)

In [12]:
# Fallback wind columns used to fill WIND when it is missing.
# Order matters: the columns are tried in list order, so for a given row the
# first column holding a value is the one that fills WIND.
wind_list = [
    "TD9636_WIND",
    "NEUMANN_WIND",
    "HKO_WIND",
    "TOKYO_WIND",
    "BOM_WIND",
    "WELLINGTON_WIND",
    "REUNION_WIND",
    "DS824_WIND",
]

In [13]:
# Fill the gaps in WIND from the fallback columns, in wind_list order.
# Each pass only touches rows that are still NaN, so earlier columns win.
wind_filled = df_ibtracs["WIND"]
for fallback_column in wind_list:
    wind_filled = wind_filled.fillna(value=df_ibtracs[fallback_column])
df_ibtracs["WIND"] = wind_filled

In [14]:
# Discard the rows for which no column could provide a WIND value
has_wind = df_ibtracs["WIND"].notna()
df_ibtracs = df_ibtracs.loc[has_wind]

In [15]:
# Columns kept in the base dataset, in output order
base_columns = [
    "SEASON",
    "BASIN",
    "NATURE",
    "LAT",
    "LON",
    "WIND",
    "DIST2LAND",
    "STORM_SPEED",
    "STORM_DIR",
    "TD9636_STAGE",
]
df_ = df_ibtracs.loc[:, base_columns]

In [16]:
# Check the dtypes and non-null counts of the selected columns
df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45025 entries, 0 to 67409
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SEASON        45025 non-null  object 
 1   BASIN         45025 non-null  object 
 2   NATURE        45025 non-null  object 
 3   LAT           45025 non-null  float64
 4   LON           45025 non-null  float64
 5   WIND          45025 non-null  float64
 6   DIST2LAND     45025 non-null  int64  
 7   STORM_SPEED   45025 non-null  float64
 8   STORM_DIR     45025 non-null  float64
 9   TD9636_STAGE  45025 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.8+ MB


In [17]:
# Count the missing values per column, smallest count first
df_.isna().sum().sort_values()

SEASON          0
BASIN           0
NATURE          0
LAT             0
LON             0
WIND            0
DIST2LAND       0
STORM_SPEED     0
STORM_DIR       0
TD9636_STAGE    0
dtype: int64

In [18]:
# Persist the base dataset as a Parquet file (pyarrow engine)
base_parquet_path = "../data/base.parquet"
df_.to_parquet(base_parquet_path, engine="pyarrow")