# Pickle of our first dataframe

Categorical => OneHotEncoder or one dimension with different values (1, 2, 3, 4, etc.)
- SEASON oneHotEncoder (4)
- BASIN (7)
- NATURE (6)

Numeric => everything between 0 and 1
- LAT
- LON
- WIND 
- DIST2LAND
- STORM_SPEED
- STORM_DIR

In [178]:
import pandas as pd
import geopandas as gpd
import numpy as np
from numpy import std

import plotly.express as px 
import plotly.graph_objects as go 

from shapely.geometry import Point

# import matplotlib.pyplot as plt
import seaborn as sns

In [179]:
# Load the raw IBTrACS CSV. The first line is used as the header and the
# line right after it (row index 1) is skipped.
# NOTE: with pandas' default NA handling the literal string "NA" is read as
# NaN, which is why BASIN shows missing values right after loading.
df = pd.read_csv("../data/ibtracs.csv", header=0, skiprows=[1], low_memory=False)

In [180]:
# Work on a deep copy so that the freshly loaded frame `df` stays untouched
df_ibtracs = df.copy(deep=True)

In [181]:
# Inspect the distinct BASIN codes right after loading (look for missing values)
df_ibtracs['BASIN'].unique()

array(['SP', 'SI', 'WP', 'EP', nan, 'NI', 'SA'], dtype=object)

Replacing NULL values in BASIN with the North Atlantic code `'NA'` (pandas reads the literal string "NA" as a missing value when loading the CSV)

In [182]:
# Load the basin geometries from the shapefile (one row per named basin)
basins = gpd.read_file('../data_visualization/shapefiles/goas_v01.shp')

In [183]:
# Geometry of the first row named 'North Atlantic Ocean'
# (raises IndexError if the shapefile has no such row)
na_basin_geo = basins.loc[basins['name'].eq('North Atlantic Ocean'), 'geometry'].iloc[0]

In [184]:
# Simplified version of the North Atlantic geometry (tolerance of 1 in the
# geometry's coordinate units, topology not preserved) - presumably to make the
# later intersection test cheaper
na_basin_geom = na_basin_geo.simplify(tolerance=1, preserve_topology=False)

In [185]:
# Every missing BASIN value is relabelled with the 'NA' code
df_ibtracs['BASIN'] = df_ibtracs['BASIN'].fillna('NA')

Changing NULL values in SUBBASIN to the correct North Atlantic subbasin

In [186]:
# Load the sea / subbasin geometries (the file name suggests the IHO World Seas dataset)
subbasins = gpd.read_file('../data_visualization/shapefiles/World_Seas_IHO_v3.shp')

In [187]:
# Build an acronym from NAME: upper-cased first letter of every whitespace-separated word
subbasins['NAME_SHORT'] = subbasins['NAME'].map(
    lambda full_name: ''.join(word[0].upper() for word in full_name.split())
)

In [188]:
subbasins = subbasins[['NAME_SHORT', 'geometry']] # Keeping only the relevant columns: the acronym and the geometry

In [189]:
subbasins["geometry"] = subbasins["geometry"].simplify(tolerance=1, preserve_topology=False) # Simplifying the geometries to speed up the later point-in-polygon lookups

In [190]:
# Keep only the subbasins whose geometry intersects the (simplified) North
# Atlantic basin, and renumber the rows from 0
subbasins = (
    subbasins[subbasins['geometry'].intersects(na_basin_geom)]
    .reset_index(drop=True)
)

Let's now fill in the missing subbasin names using the subbasin geometries.

In [191]:
def find_subbasin(point):
    """Return the short name of the first subbasin that contains ``point``.

    The lookup runs against the module-level ``subbasins`` frame: every
    geometry is tested with ``contains`` and the ``NAME_SHORT`` of the first
    match is returned. When no geometry contains the point, ``np.nan`` is
    returned instead.
    """
    matches = subbasins['NAME_SHORT'][subbasins['geometry'].contains(point)]
    if matches.empty:
        return np.nan
    return matches.iloc[0]

In [192]:
# Build one shapely Point(LON, LAT) per row whose SUBBASIN is missing;
# the resulting Series keeps the index of those rows
point_serie = (
    df_ibtracs
    .loc[df_ibtracs['SUBBASIN'].isna(), ['LAT', 'LON']]
    .apply(lambda row: Point(row['LON'], row['LAT']), axis=1)
)

In [193]:
# Look up the subbasin of every point (np.nan when no subbasin contains it)
subbasin_serie_found = point_serie.apply(find_subbasin)

In [194]:
# Write the subbasins that were found back into the main frame; values are
# aligned on the row index and only missing SUBBASIN entries are filled
df_ibtracs['SUBBASIN'] = df_ibtracs['SUBBASIN'].fillna(subbasin_serie_found)

In [195]:
# Rows still without a subbasin (no geometry contained the point) are labelled 'LAND'
df_ibtracs['SUBBASIN'] = df_ibtracs['SUBBASIN'].fillna('LAND')

In [196]:
# Check that BASIN no longer contains missing values
df_ibtracs['BASIN'].unique()

array(['SP', 'SI', 'WP', 'EP', 'NA', 'NI', 'SA'], dtype=object)

In [197]:
# Treat single-space cells as missing, then convert each column to a numeric
# dtype whenever every remaining value can be parsed as a number
for column_name in df_ibtracs.columns:
    is_blank = df_ibtracs[column_name] == " "
    df_ibtracs.loc[is_blank, column_name] = np.nan
    try:
        as_numbers = pd.to_numeric(df_ibtracs[column_name])
    except ValueError:
        # Column holds non-numeric text: leave it unchanged
        continue
    df_ibtracs[column_name] = as_numbers

In [202]:
# Distinct BASIN codes at this point.
# NOTE(review): the saved output lacks 'NA' and 'SA'; the execution count suggests
# this cell was last run after the filtering cells further down - re-run the
# notebook in order to confirm.
df_ibtracs['BASIN'].unique()

array(['SP', 'SI', 'WP', 'EP', 'NI'], dtype=object)

In [199]:
# Keep only the rows that have a TD9636_STAGE value
df_ibtracs = df_ibtracs[df_ibtracs["TD9636_STAGE"].notna()]

In [200]:
# Parse ISO_TIME into datetimes (needed to read the month when deriving the season)
df_ibtracs = df_ibtracs.assign(ISO_TIME=lambda frame: pd.to_datetime(frame['ISO_TIME']))

In [201]:
def get_season(date, latitude):
    """Return the season name for a date observed at a given latitude.

    Seasons are three-month blocks (Dec-Feb, Mar-May, Jun-Aug, Sep-Nov) whose
    names are swapped between the two hemispheres.

    Parameters
    ----------
    date : object exposing a ``month`` attribute (e.g. ``pd.Timestamp``,
        ``datetime.date``). Only the month is used.
    latitude : number. Values ``>= 0`` (equator included) are treated as
        Northern Hemisphere, anything else as Southern Hemisphere.

    Returns
    -------
    str or None
        ``"Winter"``, ``"Spring"``, ``"Summer"`` or ``"Fall"``; ``None`` when
        ``date.month`` is not a month number between 1 and 12 (for example
        with ``pd.NaT``).
    """
    # Index of the three-month block each month belongs to.
    block_of_month = {
        12: 0, 1: 0, 2: 0,
        3: 1, 4: 1, 5: 1,
        6: 2, 7: 2, 8: 2,
        9: 3, 10: 3, 11: 3,
    }
    northern_names = ("Winter", "Spring", "Summer", "Fall")
    southern_names = ("Summer", "Fall", "Winter", "Spring")

    block = block_of_month.get(date.month)
    if block is None:
        return None

    # NOTE: a NaN latitude fails the ``>= 0`` comparison and therefore gets the
    # Southern Hemisphere names; callers should drop or fix such rows first.
    names = northern_names if latitude >= 0 else southern_names
    return names[block]

In [164]:
# Derive the season of every observation from its timestamp and latitude
df_ibtracs['SEASON'] = [
    get_season(timestamp, latitude)
    for timestamp, latitude in zip(df_ibtracs['ISO_TIME'], df_ibtracs['LAT'])
]

In [165]:
# USA_WIND becomes the reference WIND column
df_ibtracs = df_ibtracs.rename(columns={"USA_WIND": "WIND"})

In [166]:
# Scale these agency wind columns by 1.12 - presumably a conversion to the
# averaging convention of WIND; TODO confirm and document the source of the factor.
# CAUTION: the columns are overwritten in place, so running this cell twice
# applies the factor twice.
scaled_wind_columns = ["BOM_WIND", "WELLINGTON_WIND", "REUNION_WIND", "HKO_WIND", "TOKYO_WIND"]
df_ibtracs[scaled_wind_columns] = df_ibtracs[scaled_wind_columns] * 1.12

In [167]:
# Alternative wind columns, in the order in which they are tried as a
# replacement when WIND is missing
wind_list = [
    "TD9636_WIND", "NEUMANN_WIND", "HKO_WIND", "TOKYO_WIND",
    "BOM_WIND", "WELLINGTON_WIND", "REUNION_WIND", "DS824_WIND",
]

In [168]:
# Fill the gaps in WIND from the fallback columns, one after the other:
# the first column in wind_list that has a value for a row wins
wind_filled = df_ibtracs["WIND"]
for fallback_column in wind_list:
    wind_filled = wind_filled.fillna(df_ibtracs[fallback_column])
df_ibtracs["WIND"] = wind_filled

In [169]:
# Discard the rows for which no column provided a wind value
df_ibtracs = df_ibtracs[df_ibtracs['WIND'].notna()]

In [170]:
# Final selection of columns that gets saved
df_ = df_ibtracs.loc[:, [
    # categorical columns
    "SEASON", "BASIN", "NATURE",
    # numeric columns
    "LAT", "LON", "WIND", "DIST2LAND", "STORM_SPEED", "STORM_DIR",
    # not listed among the features in the introduction - presumably the target; TODO confirm
    "TD9636_STAGE",
]]

In [171]:
# Summary of the selected columns: dtypes and non-null counts
df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45025 entries, 0 to 67409
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SEASON        45025 non-null  object 
 1   BASIN         45025 non-null  object 
 2   NATURE        45025 non-null  object 
 3   LAT           45025 non-null  float64
 4   LON           45025 non-null  float64
 5   WIND          45025 non-null  float64
 6   DIST2LAND     45025 non-null  float64
 7   STORM_SPEED   45025 non-null  float64
 8   STORM_DIR     45025 non-null  float64
 9   TD9636_STAGE  45025 non-null  float64
dtypes: float64(7), object(3)
memory usage: 3.8+ MB


In [172]:
# Number of missing values per column, smallest first (all zeros expected here)
df_.isna().sum().sort_values()

SEASON          0
BASIN           0
NATURE          0
LAT             0
LON             0
WIND            0
DIST2LAND       0
STORM_SPEED     0
STORM_DIR       0
TD9636_STAGE    0
dtype: int64

In [173]:
# Persist the prepared frame as a Parquet file (written with the pyarrow engine)
df_.to_parquet(path="../data/base2.parquet", engine="pyarrow")

In [174]:
# Distinct BASIN codes that remain in the saved frame
df_['BASIN'].unique()

array(['SP', 'SI', 'WP', 'EP', 'NI'], dtype=object)