In [190]:
from pathlib import Path
import matplotlib.pyplot as plt
import xarray as xr
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


dir0 = Path('el_nino/')
file_sst = 'sst.mnmean.nc'
file_2 = 'mslp_coarse.nc'

# load the data set with xarray
ds_nino = xr.open_dataset(Path(dir0, file_sst))
ds_mslp = xr.open_dataset(Path(dir0, file_2))

# define 3.4 region
lat_min, lat_max = -5, 5
lon_min, lon_max = 190, 240

ds_nino = ds_nino.interpolate_na(dim='lon')
ds_mslp = ds_mslp.interpolate_na(dim='lon')

# Select the region
ds_region_nino = ds_nino.where((ds_nino.lat >= lat_min) & (ds_nino.lat <= lat_max) & 
                               (ds_nino.lon >= lon_min) & (ds_nino.lon <= lon_max), drop=True)
ds_region_mslp = ds_mslp.where((ds_mslp.latitude >= lat_min) & (ds_mslp.latitude <= lat_max) & 
                               (ds_mslp.longitude >= lon_min) & (ds_mslp.longitude <= lon_max), drop=True)


In [None]:
print(ds_mslp.head)
print(ds_nino.head)

In [192]:
# Extracting the labels from 12/1981 to 06/2022
# 0 = Nothing
# 1 = El Nino
# 2 = La Nina


# Initialisation
start_date_y = pd.Timestamp(year = 1981, month = 12, day = 1)
end_date_y = pd.Timestamp(year = 2022, month = 6, day = 1)
current_date = start_date_y

# Mean temperature in the region over all the years
big_mean = float(ds_region_nino.mean()['sst'])

ys = []

while current_date <= end_date_y:
    # print(current_date)

    # Create Timestamps for previous, current, and next months
    current_month = current_date
    prev_month = current_month - pd.DateOffset(months = 1)
    next_month = current_month + pd.DateOffset(months = 1)

    # Get data for each month
    ds_prev_month = ds_region_nino.sel(time = slice(prev_month, prev_month))
    ds_curr_month = ds_region_nino.sel(time = slice(current_month, current_month))
    ds_next_month = ds_region_nino.sel(time = slice(next_month, next_month))

    # Merge the three datasets
    merged_dataset = xr.concat([ds_prev_month, ds_curr_month, ds_next_month], dim='time')

    # Calculate the average sea surface temperature along the time dimension
    sst_anom = float(merged_dataset['sst'].mean()) - big_mean
    cases = [
        (sst_anom > 1.0),
        (sst_anom <= 1.0) & (sst_anom >= -1.0),
        (sst_anom < -1.0),
    ]
    conditions = [1, 0, 2]
    res = np.select(cases, conditions, 0)

    ys.append(res)
    
    # Increment to the first day of the next month
    current_date += pd.DateOffset(months = 1)

# Convert the list to a numpy array
ys_np = np.array(ys)

In [None]:
# Dataset to predict n_month in advance using 1 year of data

# how many month in advance do we want to predict el nino (max: 24 months)
n_month = 12

start_date_X = start_date_y - pd.DateOffset(years = 1) - pd.DateOffset(months = n_month - 1)
end_date_X = end_date_y - pd.DateOffset(years = 1) - pd.DateOffset(months = n_month - 1)
current_date = start_date_X

xs_np = {}

while current_date <= end_date_X:
    
    start_variable = current_date
    end_variable = current_date + pd.DateOffset(years = 1) - pd.DateOffset(months = 1)
    print(start_variable, ' => ', end_variable)

    # Selecting the data for the one-year interval
    interval_data = ds_mslp.sel(time=slice(start_variable, end_variable))

    # Formatting the interval data
    numpy_array = interval_data['msl'].to_numpy()
    flattened_data = numpy_array.flatten()
    xs_np[str(end_variable.year) + "/" + str(end_variable.month + n_month)] = flattened_data
    # print(str(end_variable.year) + "/" + str(end_variable.month + n_month))

    # Increment to the next month
    current_date += pd.DateOffset(months = 1)

xs_np = np.array(list(xs_np.values()))

In [200]:
print(ys_np.shape)
print(xs_np.shape)

(487,)
(487, 781920)


In [211]:
# Random Forest Classifier

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(xs_np)

# Apply PCA
pca = PCA(n_components=0.99)
X_pca = pca.fit_transform(X_scaled)

# Apply LDA
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_pca, ys_np)

X_train, X_test, y_train, y_test = train_test_split(X_lda, ys_np, test_size=0.2, random_state=42)


# Create the Random Forest model
rf_model = RandomForestClassifier(n_estimators=3000, random_state=42) 

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_test = rf_model.predict(X_test)
y_pred_train = rf_model.predict(X_train)

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})

# Display the DataFrame
print(comparison_df)

# Evaluate the model
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print(f"Train Accuracy: {accuracy_train}")
print(f"Test Accuracy: {accuracy_test}")

    Actual  Predicted
0        0          0
1        0          0
2        0          0
3        2          2
4        1          1
..     ...        ...
93       0          0
94       2          2
95       2          2
96       0          0
97       0          0

[98 rows x 2 columns]
Train Accuracy: 1.0
Test Accuracy: 1.0


In [205]:
# Support Vector Machine

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(xs_np)

# Apply PCA
pca = PCA(n_components=0.99)
X_pca = pca.fit_transform(X_scaled)

# Apply LDA
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_pca, ys_np)

X_train, X_test, y_train, y_test = train_test_split(X_lda, ys_np, test_size = 0.2, random_state = 12, shuffle = True)

# Create the SVM model with a kernel
svm_model = SVC(kernel='rbf')

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred_test = svm_model.predict(X_test)
y_pred_train = svm_model.predict(X_train)

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})

# Display the DataFrame
print(comparison_df)

# Evaluate the model
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print(f"Train Accuracy: {accuracy_train}")
print(f"Test Accuracy: {accuracy_test}")

    Actual  Predicted
0        1          1
1        0          0
2        0          0
3        2          2
4        0          0
..     ...        ...
93       2          2
94       0          0
95       0          0
96       2          2
97       0          0

[98 rows x 2 columns]
Train Accuracy: 1.0
Test Accuracy: 1.0


In [None]:
# Neural Network

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

# Apply PCA
pca = PCA(n_components=0.99)  # Keep % of variance
X_pca = pca.fit_transform(xs_np)

# Apply LDA
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_pca, ys_np)

X_train, X_test, y_train, y_test = train_test_split(X_lda, ys_np, test_size=0.2, random_state=42)

# One-hot encode the labels
y_train = to_categorical(y_train, num_classes = 3)
y_test = to_categorical(y_test, num_classes = 3)

print(X_train.shape)
rows, cols = X_train.shape

# L1 Regularization factor
l1_lambda = 0.1 

# Create a Sequential model
model = Sequential()

# Add layers to the model with L1 regularization
model.add(Dense(6, activation='relu', input_shape=(cols,), kernel_regularizer = regularizers.l1(l1_lambda)))
model.add(Dense(12, activation='relu', kernel_regularizer = regularizers.l1(l1_lambda)))
model.add(Dense(3, activation='softmax', kernel_regularizer = regularizers.l1(l1_lambda)))  # 3 output units for 3 classes

model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs = 15, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
train_loss, train_accuracy = model.evaluate(X_train, y_train)

print(f"Train loss: {train_loss}")
print(f"Test loss: {test_loss}")
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

In [134]:
# Gradient Boosting Machines

import xgboost as xgb
from sklearn.metrics import accuracy_score

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(xs_np)

# Apply PCA
pca = PCA(n_components=0.99)
X_pca = pca.fit_transform(X_scaled)

# Apply LDA
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_pca, ys_np)

X_train, X_test, y_train, y_test = train_test_split(X_lda, ys_np, test_size=0.2, random_state=42)

# Create the XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class = 3, eval_metric='mlogloss', use_label_encoder = True)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_test = xgb_model.predict(X_test)
y_pred_train = xgb_model.predict(X_train)

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})

# Display the DataFrame
print(comparison_df)

# Evaluate the model
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print(f"Train Accuracy: {accuracy_train}")
print(f"Test Accuracy: {accuracy_test}")

    Actual  Predicted
0        1          1
1        0          0
2        0          0
3        2          2
4        2          2
..     ...        ...
89       0          0
90       2          2
91       0          0
92       0          0
93       0          0

[94 rows x 2 columns]
Train Accuracy: 1.0
Test Accuracy: 0.9893617021276596


In [209]:
# Cross-Validation

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(xs_np)

# Apply PCA
pca = PCA(n_components=0.99)
X_pca = pca.fit_transform(X_scaled)

# Apply LDA
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_pca, ys_np)

# Create the SVM model with a kernel
svm_model = SVC(kernel='rbf')

# Perform 5-fold cross-validation
cv_scores = cross_val_score(svm_model, X_lda, ys_np, cv=5)

# cv_scores will hold the score for each fold
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.69387755 0.68367347 0.69072165 0.68041237 0.68041237]
Average cross-validation score: 0.6858194824321482
