In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import xarray as xr
import numpy as np
import pandas as pd

dir0 = Path('el_nino/')
file_sst = 'sst.mnmean.nc'
file_2 = 'mslp_coarse.nc'

# load the data set with xarray
ds_nino = xr.open_dataset(Path(dir0, file_sst))
ds_mslp = xr.open_dataset(Path(dir0, file_2))

# define 3.4 region
lat_min, lat_max = -5, 5
lon_min, lon_max = 190, 240

ds_nino = ds_nino.interpolate_na(dim='lon')
ds_mslp = ds_mslp.interpolate_na(dim='lon')

# Select the region
ds_region = ds_nino.where((ds_nino.lat >= lat_min) & (ds_nino.lat <= lat_max) & (ds_nino.lon >= lon_min) & (ds_nino.lon <= lon_max), drop=True)
ds_region_mslp = ds_mslp.where((ds_mslp.latitude >= lat_min) & (ds_mslp.latitude <= lat_max) & (ds_mslp.longitude >= lon_min) & (ds_mslp.longitude <= lon_max), drop=True)


In [None]:
months = range(1, 13)
years = range(1983, 2022)

# Prepare lists to store data
el_nino_col = []
ys = []
ys_np = np.zeros((1, 456))
big_mean = float(ds_region.mean()['sst'])
# Loop through each year
for year in years:
    for month in months:
        print(year, '/', month)
        if month == 1:
            prev_year = year - 1
            next_year = year
            prev_month = 12
            next_month = 2
        
        elif month == 12:
            prev_year = year
            next_year = year + 1
            prev_month = 11
            next_month = 1
        
        else:
            prev_year = year
            next_year = year
            prev_month = month - 1
            next_month = month + 1
    
        # prv_month = ds_region.where(ds_region['time'].dt.year == prev_year, drop=True).where(ds_region['time'].dt.month == month - 1)
        ds_prev_month = ds_region.where((ds_region['time'].dt.year == prev_year) & (ds_region['time'].dt.month == prev_month), drop=True)
        ds_curr_month = ds_region.where((ds_region['time'].dt.year == year) & (ds_region['time'].dt.month == month), drop=True)
        ds_next_month =  ds_region.where((ds_region['time'].dt.year == next_year) & (ds_region['time'].dt.month == next_month), drop=True)

        # Merge the three datasets
        merged_dataset = xr.concat([ds_prev_month, ds_curr_month, ds_next_month], dim='time')

        # Calculate the average sea surface temperature along the time dimension
        sst_anom = float(merged_dataset['sst'].mean()) - big_mean
        cases = [
            (sst_anom > 2),
            (sst_anom > 1.5) & (sst_anom <= 2),
            (sst_anom > 1.0) & (sst_anom <= 1.5),
            (sst_anom > 0.5) & (sst_anom <= 1.0),
            (sst_anom <= 0.5) & (sst_anom >= -0.5),
            (sst_anom < -2.0),
            (sst_anom < -1.5) & (sst_anom >= -2.0),
            (sst_anom < -1.0) & (sst_anom >= -1.5),
            (sst_anom < -0.5) & (sst_anom >= -1.0),
        ]
        conditions = [4, 3, 2, 1, 0, -4, -3, -2, -1]
        res = np.select(cases, conditions, 0)
    
        str_mnth = "/" + str(month)
        ys.append(res)
        ys_np = np.array(ys)

In [46]:
start_year = 1982
end_year = 2021

xs_np = {}

for year in range(start_year, end_year):

    accum_months = []
    
    for month in range(1, 13):  # Looping through months 1 to 12
        start_date = pd.Timestamp(year=year, month=month, day=1)
        end_date = start_date + pd.DateOffset(years=1) - pd.Timedelta(days=1)
        # print(start_date, " => ", end_date)
        
        # Selecting the data for the one-year interval
        interval_data = ds_region_mslp.sel(time=slice(start_date, end_date))
    
        # Formatting the interval data
        numpy_array = interval_data['msl'].to_numpy()
        flattened_data = numpy_array.flatten()
        xs_np[str(year + 1) + "/" + str(month)] = flattened_data
        # print(str(year + 1) + "/" + str(month))

xs_np = np.array(list(xs_np.values()))

In [50]:
print(ys_np.shape)
print(xs_np.shape)

(468,)
(468, 6732)


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


X_train, X_test, y_train, y_test = train_test_split(xs_np, ys_np, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% of variance
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Apply LDA
lda = LinearDiscriminantAnalysis()
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# Create the SVM model with a kernel
svm_model = SVC(kernel='rbf')

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Display the DataFrame
print(comparison_df)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

    Actual  Predicted
0        3          0
1        0          3
2       -1          0
3       -3          0
4       -4         -3
..     ...        ...
89      -1          0
90      -4         -4
91       1          0
92       0          0
93       0          0

[94 rows x 2 columns]
Model Accuracy: 0.4787234042553192
