# Pickle of our first dataframe

Categorical => OneHotEncoder or one dimension with different values (1, 2, 3, 4, etc.)
- SEASON oneHotEncoder (4)
- BASIN (7)
- NATURE (6)

Numeric => everything between 0 and 1
- LAT
- LON
- WIND 
- DIST2LAND
- STORM_SPEED
- STORM_DIR

In [21]:
import pandas as pd 
import numpy as np
from numpy import std

import plotly.express as px 
import plotly.graph_objects as go 

# import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
df = pd.read_csv(
    "../data/ibtracs.csv", 
    skiprows=[1],
    header=0,
    low_memory=False
)

In [23]:
# working on a copy of the dataset to preserve an untouched source of the dfcon
df_ibtracs = df.copy()

In [24]:
for col in df_ibtracs.columns:
    df_ibtracs.loc[df_ibtracs[col]==" ", col] = np.nan
    try:
        df_ibtracs[col] = pd.to_numeric(df_ibtracs[col])
    except ValueError:
        pass

In [25]:
df_ibtracs = df_ibtracs.dropna(subset=["TD9636_STAGE"])

In [26]:
df_ibtracs['ISO_TIME'] = pd.to_datetime(df_ibtracs['ISO_TIME'])

In [29]:
def get_season(date, latitude):
    month = date.month
    day = date.day

    if latitude >= 0:  # Northern Hemisphere
        match month:
            case 12 | 1 | 2: 
                return "Winter"
            case 3 | 4 | 5: 
                return "Spring"
            case 6 | 7 | 8:
                return "Summer"
            case 9 | 10 | 11: 
                return "Fall"
    
    else:  # Southern Hemisphere
        match month:
            case 12 | 1 | 2:
                return "Summer"
            case 3 | 4 | 5: 
                return  "Fall"
            case 6 | 7 | 8: 
                return "Winter"
            case 9 | 10 | 11: 
                return "Spring"

In [30]:
df_ibtracs['SEASON'] = df_ibtracs.apply(lambda row: get_season(row['ISO_TIME'], row['LAT']), axis=1)

In [31]:
# renaming USA_WIND as WIND
df_ibtracs.rename(columns={"USA_WIND": "WIND"}, inplace=True)

In [32]:
df_ibtracs[["BOM_WIND", "WELLINGTON_WIND", "REUNION_WIND", "HKO_WIND", "TOKYO_WIND"]] = df_ibtracs[["BOM_WIND", "WELLINGTON_WIND", "REUNION_WIND", "HKO_WIND", "TOKYO_WIND"]] * 1.12

In [33]:
wind_list = [            
    "TD9636_WIND",                 
    "NEUMANN_WIND",       
    "HKO_WIND",           
    "TOKYO_WIND",        
    "BOM_WIND",          
    "WELLINGTON_WIND",    
    "REUNION_WIND",     
    "DS824_WIND"        
]

In [34]:
# if WIND is NaN we loop over the other columns to fill it 
for column in wind_list:
    df_ibtracs["WIND"] = df_ibtracs["WIND"].fillna(value=df_ibtracs[column])

In [35]:
# Drop rows where WIND is still null
df_ibtracs = df_ibtracs.dropna(subset=['WIND'])

In [37]:
df_ = df_ibtracs[[
    "SEASON", 
    "BASIN", 
    "NATURE",
    "LAT", 
    "LON", 
    "WIND", 
    "DIST2LAND",
    "STORM_SPEED",
    "STORM_DIR", 
    "TD9636_STAGE",
    "PRESSURE"
]]

KeyError: "['PRESSURE'] not in index"

In [16]:
df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45025 entries, 0 to 67409
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SEASON        45025 non-null  object 
 1   BASIN         45025 non-null  object 
 2   NATURE        45025 non-null  object 
 3   LAT           45025 non-null  float64
 4   LON           45025 non-null  float64
 5   WIND          45025 non-null  float64
 6   DIST2LAND     45025 non-null  float64
 7   STORM_SPEED   45025 non-null  float64
 8   STORM_DIR     45025 non-null  float64
 9   TD9636_STAGE  45025 non-null  float64
dtypes: float64(7), object(3)
memory usage: 3.8+ MB


In [17]:
df_.isnull().sum().sort_values()

SEASON          0
BASIN           0
NATURE          0
LAT             0
LON             0
WIND            0
DIST2LAND       0
STORM_SPEED     0
STORM_DIR       0
TD9636_STAGE    0
dtype: int64

In [18]:
df_.to_parquet("../data/base.parquet", engine="pyarrow")