In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import xarray as xr
import numpy as np
import pandas as pd

# Input files, resolved relative to the notebook's working directory.
dir0 = Path('el_nino/')
file_sst = 'sst.mnmean.nc'
file_2 = 'mslp_coarse.nc'

# Load both data sets with xarray.
ds_nino = xr.open_dataset(dir0 / file_sst)
ds_mslp = xr.open_dataset(dir0 / file_2)

# Bounding box of the "3.4 region" used below.
# NOTE(review): the longitude bounds presumably assume a 0-360 axis - confirm
# against the coordinate values of both files.
lat_min, lat_max = -5, 5
lon_min, lon_max = 190, 240

# Fill missing values by interpolating along the longitude axis.
# The two files name their coordinates differently: the SST data set uses
# `lat`/`lon`, the pressure data set uses `latitude`/`longitude` (see the
# selection below). Dataset.interpolate_na skips variables that do not have
# the requested dimension, so passing 'lon' for the pressure data was a silent
# no-op; it must be given that data set's own dimension name.
ds_nino = ds_nino.interpolate_na(dim='lon')
ds_mslp = ds_mslp.interpolate_na(dim='longitude')

# Boolean masks for the bounding box, one per data set.
in_box_sst = (
    (ds_nino.lat >= lat_min) & (ds_nino.lat <= lat_max)
    & (ds_nino.lon >= lon_min) & (ds_nino.lon <= lon_max)
)
in_box_mslp = (
    (ds_mslp.latitude >= lat_min) & (ds_mslp.latitude <= lat_max)
    & (ds_mslp.longitude >= lon_min) & (ds_mslp.longitude <= lon_max)
)

# Select the region; drop=True trims coordinates that fall entirely outside it.
ds_region = ds_nino.where(in_box_sst, drop=True)
ds_region_mslp = ds_mslp.where(in_box_mslp, drop=True)


In [53]:
months = range(1, 13)
years = range(1983, 2022)

# Build one class label per (year, month):
#    1  -> three-month mean SST anomaly above +1.0
#    0  -> anomaly within [-1.0, +1.0] (also used when the anomaly is NaN,
#          e.g. when no data falls inside the window)
#   -1  -> anomaly below -1.0
#
# The anomaly is measured against `big_mean`, the mean SST over the whole
# region and the whole record.
# NOTE(review): this is a single all-time mean, not a per-calendar-month
# climatology - confirm that this is the intended reference level.
sst = ds_region['sst']
big_mean = float(sst.mean())

# Calendar fields of the time axis, computed once instead of on every iteration.
time_year = sst['time'].dt.year
time_month = sst['time'].dt.month

ys = []
for year in years:
    for month in months:
        # Neighbouring months, wrapping across the year boundary.
        prev_year, prev_month = (year - 1, 12) if month == 1 else (year, month - 1)
        next_year, next_month = (year + 1, 1) if month == 12 else (year, month + 1)

        # Previous, current and next month selected with one combined mask.
        # Months that are absent from the data simply contribute nothing.
        in_window = (
            ((time_year == prev_year) & (time_month == prev_month))
            | ((time_year == year) & (time_month == month))
            | ((time_year == next_year) & (time_month == next_month))
        )

        # Mean SST over every grid cell and time step in the window, minus
        # the reference level.
        sst_anom = float(sst.where(in_window, drop=True).mean()) - big_mean

        if sst_anom > 1.0:
            res = 1
        elif sst_anom < -1.0:
            res = -1
        else:
            res = 0
        ys.append(res)

# Convert once, after the loop, rather than rebuilding the array every month.
ys_np = np.array(ys)

1983 / 1
1983 / 2
1983 / 3
1983 / 4
1983 / 5
1983 / 6
1983 / 7
1983 / 8
1983 / 9
1983 / 10
1983 / 11
1983 / 12
1984 / 1
1984 / 2
1984 / 3
1984 / 4
1984 / 5
1984 / 6
1984 / 7
1984 / 8
1984 / 9
1984 / 10
1984 / 11
1984 / 12
1985 / 1
1985 / 2
1985 / 3
1985 / 4
1985 / 5
1985 / 6
1985 / 7
1985 / 8
1985 / 9
1985 / 10
1985 / 11
1985 / 12
1986 / 1
1986 / 2
1986 / 3
1986 / 4
1986 / 5
1986 / 6
1986 / 7
1986 / 8
1986 / 9
1986 / 10
1986 / 11
1986 / 12
1987 / 1
1987 / 2
1987 / 3
1987 / 4
1987 / 5
1987 / 6
1987 / 7
1987 / 8
1987 / 9
1987 / 10
1987 / 11
1987 / 12
1988 / 1
1988 / 2
1988 / 3
1988 / 4
1988 / 5
1988 / 6
1988 / 7
1988 / 8
1988 / 9
1988 / 10
1988 / 11
1988 / 12
1989 / 1
1989 / 2
1989 / 3
1989 / 4
1989 / 5
1989 / 6
1989 / 7
1989 / 8
1989 / 9
1989 / 10
1989 / 11
1989 / 12
1990 / 1
1990 / 2
1990 / 3
1990 / 4
1990 / 5
1990 / 6
1990 / 7
1990 / 8
1990 / 9
1990 / 10
1990 / 11
1990 / 12
1991 / 1
1991 / 2
1991 / 3
1991 / 4
1991 / 5
1991 / 6
1991 / 7
1991 / 8
1991 / 9
1991 / 10
1991 / 11
1991 / 12
1

In [54]:
start_year = 1982
end_year = 2021  # exclusive: the last window starts in December of end_year - 1

# Build one feature row per (year, month): every 'msl' value of the one-year
# window that starts on the first day of that month, flattened to 1-D.
# Rows are in chronological order of window start; the original notebook
# keyed the window starting at (year, month) as "<year + 1>/<month>".
windows = []

for year in range(start_year, end_year):
    for month in range(1, 13):  # months 1 to 12
        start_date = pd.Timestamp(year=year, month=month, day=1)
        # Last day before the same calendar date one year later.
        end_date = start_date + pd.DateOffset(years=1) - pd.Timedelta(days=1)

        # Select the data for the one-year interval.
        interval_data = ds_region_mslp.sel(time=slice(start_date, end_date))

        windows.append(interval_data['msl'].to_numpy().flatten())

# np.stack requires every window to have the same length, so a window that is
# cut short by the end of the record raises here instead of silently producing
# a ragged array.
xs_np = np.stack(windows)

In [55]:
# Show the label and feature array shapes, one per line (labels first).
print(ys_np.shape, xs_np.shape, sep='\n')

(468,)
(468, 6732)


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# Hold out 20% of the samples for testing (fixed seed for repeatability).
# NOTE(review): the split is shuffled, while the rows of xs_np are built from
# overlapping one-year windows taken month by month. Neighbouring windows can
# therefore land on both sides of the split and inflate the test score.
# Consider a chronological split (shuffle=False) - confirm what is intended.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    xs_np, ys_np, test_size=0.2, random_state=42
)

# Stage 1 - standardise each feature, using statistics from the training set only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Stage 2 - PCA, keeping enough components to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Stage 3 - supervised projection with LDA, fitted on the training labels.
lda = LinearDiscriminantAnalysis()
X_train = lda.fit_transform(X_train_pca, y_train)
X_test = lda.transform(X_test_pca)

# Stage 4 - RBF-kernel SVM trained on the LDA-projected features.
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Actual and predicted labels side by side.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison_df)

# Fraction of test samples classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

    Actual  Predicted
0        1          0
1        0          1
2        0          0
3       -1          0
4       -1         -1
..     ...        ...
89       0          0
90      -1         -1
91       0          0
92       0          0
93       0         -1

[94 rows x 2 columns]
Model Accuracy: 0.7446808510638298
