In [99]:
import warnings
warnings.filterwarnings('ignore')

In [100]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [101]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [105]:
#load the data
file_path = Path('SanClemente_MachineLearningDataSet/eunjais_clemente_2020.csv')
sc2020_df = pd.read_csv(file_path)

# testing dataset will be 
###  file_path = Path('SanClemente_MachineLearningDataSet/sanclemente_2021.csv')
sc2020_df.head()

Unnamed: 0.1,Unnamed: 0,#YY,MM,DD,hh,mm,WSP,GST,WVHT,DPD,APD,MWD,PRES,ATMP,DEWP,Wave_Size,Wind_Speed
0,4,2020,1,1,0,40,3.7,5.1,2.19,16.0,9.15,263,1016.4,16.2,11.3,Ideal (3-9 ft),Moderate (3-19 mph)
1,10,2020,1,1,1,40,2.0,3.8,2.07,13.79,8.97,267,1016.7,16.0,11.7,Ideal (3-9 ft),Ideal(<3 mph)
2,16,2020,1,1,2,40,4.9,6.4,1.97,11.43,8.99,263,1017.0,16.3,10.9,Small (<2 ft),Moderate (3-19 mph)
3,22,2020,1,1,3,40,5.3,6.6,2.11,14.81,9.08,267,1017.4,15.9,12.1,Ideal (3-9 ft),Moderate (3-19 mph)
4,28,2020,1,1,4,40,5.9,7.7,2.01,14.81,9.04,262,1017.9,16.2,11.7,Ideal (3-9 ft),Moderate (3-19 mph)


In [122]:
file_path = Path('SanClemente_MachineLearningDataSet/eunjais_clemente_2021.csv')
sc2021_df = pd.read_csv(file_path)
sc2021_df.head()

Unnamed: 0.1,Unnamed: 0,#YY,MM,DD,hh,mm,WSP,GST,WVHT,DPD,APD,MWD,PRES,ATMP,DEWP,Wave_Size,Wind_Speed
0,10,2021,1,1,1,40,9.0,11.6,2.98,9.09,6.8,285,1013.9,14.5,10.4,Ideal (3-9 ft),Moderate (3-19 mph)
1,16,2021,1,1,2,40,9.3,11.8,2.94,17.39,6.92,293,1014.1,14.6,10.7,Ideal (3-9 ft),Moderate (3-19 mph)
2,28,2021,1,1,4,40,8.2,10.5,2.79,9.09,6.97,285,1014.2,14.8,9.8,Ideal (3-9 ft),Moderate (3-19 mph)
3,34,2021,1,1,5,40,7.5,9.8,2.65,16.0,7.09,286,1014.6,14.9,9.6,Ideal (3-9 ft),Moderate (3-19 mph)
4,40,2021,1,1,6,40,8.0,10.2,2.5,16.0,7.21,288,1014.8,15.2,9.2,Ideal (3-9 ft),Moderate (3-19 mph)


In [125]:
sc_df = pd.concat([sc2020_df,sc2021_df])
sc_df= sc_df.rename(columns={'#YY':'YY'})
sc_df['YY'] = sc_df['YY'].astype(str)
sc_df.tail()

Unnamed: 0.1,Unnamed: 0,YY,MM,DD,hh,mm,WSP,GST,WVHT,DPD,APD,MWD,PRES,ATMP,DEWP,Wave_Size,Wind_Speed
8539,51325,2021,12,31,19,40,9.1,11.0,1.95,11.43,5.93,292,1008.0,14.1,11.1,Small (<2 ft),Moderate (3-19 mph)
8540,51331,2021,12,31,20,40,9.4,11.3,2.04,12.12,5.75,279,1007.8,13.9,10.5,Ideal (3-9 ft),Moderate (3-19 mph)
8541,51337,2021,12,31,21,40,9.4,12.1,2.2,6.25,5.92,284,1007.6,13.9,10.2,Ideal (3-9 ft),Moderate (3-19 mph)
8542,51343,2021,12,31,22,40,10.1,12.5,2.43,6.67,6.08,287,1007.3,13.7,9.8,Ideal (3-9 ft),Moderate (3-19 mph)
8543,51349,2021,12,31,23,40,11.5,14.9,2.74,7.69,6.18,276,1007.6,13.7,9.3,Ideal (3-9 ft),Moderate (3-19 mph)


In [127]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(sc_df.dtypes)

Unnamed: 0      int64
YY             object
MM              int64
DD              int64
hh              int64
mm              int64
WSP           float64
GST           float64
WVHT          float64
DPD           float64
APD           float64
MWD             int64
PRES          float64
ATMP          float64
DEWP          float64
Wave_Size      object
Wind_Speed     object
dtype: object


In [128]:
sc_df = sc_df.dropna(axis='columns', how='all')
sc_df = sc_df.dropna()

# convert interest rate to numerical
sc_df['WVHT'] = sc_df['WVHT'].astype('float') 

# Convert the target column values to surfing_ideal and surfing__unideal vased on wave size
x = { "Ideal (3-9 ft)": 'surfing_ideal'}   
sc_df = sc_df.replace(x)
x = dict.fromkeys(["Small (<2 ft)", "Large (10-12 ft)"], 'surfing_unideal')    
sc_df = sc_df.replace(x)

# Convert the target column values to surfing_ideal and surfing__unideal vased on wind speed
x = { "Ideal(<3 mph)": 'surfing_ideal', "Moderate (3-19 mph)":'surfing_ideal'}   
sc_df = sc_df.replace(x)
x = dict.fromkeys(["Dangerous (>20 mph)"], 'surfing_unideal')    
sc_df = sc_df.replace(x)


sc_df.reset_index(inplace=True, drop=True)
sc_df.head(5)

Unnamed: 0.1,Unnamed: 0,YY,MM,DD,hh,mm,WSP,GST,WVHT,DPD,APD,MWD,PRES,ATMP,DEWP,Wave_Size,Wind_Speed
0,4,2020,1,1,0,40,3.7,5.1,2.19,16.0,9.15,263,1016.4,16.2,11.3,surfing_ideal,surfing_ideal
1,10,2020,1,1,1,40,2.0,3.8,2.07,13.79,8.97,267,1016.7,16.0,11.7,surfing_ideal,surfing_ideal
2,16,2020,1,1,2,40,4.9,6.4,1.97,11.43,8.99,263,1017.0,16.3,10.9,surfing_unideal,surfing_ideal
3,22,2020,1,1,3,40,5.3,6.6,2.11,14.81,9.08,267,1017.4,15.9,12.1,surfing_ideal,surfing_ideal
4,28,2020,1,1,4,40,5.9,7.7,2.01,14.81,9.04,262,1017.9,16.2,11.7,surfing_ideal,surfing_ideal


In [119]:
#2020 set for training
#2021 set for testing

# try and make a separate column for both csvs that separates WVHT into 
# different swell height / surfable wave category
# so that it can be used for the surfing ideal vs unideal categorization in ML

## also maybe drop all the 99 / 999 columns and merge the date time

In [135]:
df_bin_encode = pd.get_dummies(sc_df, columns=["MM",
                                                   "DD",
                                                   "hh",
                                                   "mm",
                                                   "WSP",
                                            "GST",
                                            "DPD",
                                            "MWD",
                                            "PRES",
                                            "ATMP",
                                            "DEWP",
                                                  "Wind_Speed"])

YY_num = {
    '2020' : 1,
    '2021' : 2
}

df_bin_encode.head()

Unnamed: 0.1,Unnamed: 0,YY,WVHT,APD,Wave_Size,MM_1,MM_2,MM_3,MM_4,MM_5,...,DEWP_21.4,DEWP_21.5,DEWP_21.6,DEWP_21.7,DEWP_21.8,DEWP_21.9,DEWP_22.0,DEWP_22.1,DEWP_999.0,Wind_Speed_surfing_ideal
0,4,2020,2.19,9.15,surfing_ideal,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,10,2020,2.07,8.97,surfing_ideal,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,16,2020,1.97,8.99,surfing_unideal,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,22,2020,2.11,9.08,surfing_ideal,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28,2020,2.01,9.04,surfing_ideal,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [136]:
# Months' and Loan Status condition encoded using the dictionary values
df_bin_encode["Year_Num"] = df_bin_encode["YY"].apply(lambda x: YY_num[x])

# Drop the issue_d columns
df_bin_encode = df_bin_encode.drop(["YY"], axis=1)


df_bin_encode.head(10)

Unnamed: 0.1,Unnamed: 0,WVHT,APD,Wave_Size,MM_1,MM_2,MM_3,MM_4,MM_5,MM_6,...,DEWP_21.5,DEWP_21.6,DEWP_21.7,DEWP_21.8,DEWP_21.9,DEWP_22.0,DEWP_22.1,DEWP_999.0,Wind_Speed_surfing_ideal,Year_Num
0,4,2.19,9.15,surfing_ideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,10,2.07,8.97,surfing_ideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,16,1.97,8.99,surfing_unideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,22,2.11,9.08,surfing_ideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,28,2.01,9.04,surfing_ideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
5,34,1.76,7.93,surfing_unideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
6,40,1.74,8.24,surfing_unideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
7,46,1.88,8.44,surfing_unideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
8,52,1.83,8.0,surfing_unideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
9,64,1.96,8.73,surfing_unideal,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [137]:
# Create our features
X = df_bin_encode.drop(columns=["Wave_Size"])

# Create our target
y = pd.DataFrame(df_bin_encode["Wave_Size"])

In [138]:

X.describe()

Unnamed: 0.1,Unnamed: 0,WVHT,APD,MM_1,MM_2,MM_3,MM_4,MM_5,MM_6,MM_7,...,DEWP_21.5,DEWP_21.6,DEWP_21.7,DEWP_21.8,DEWP_21.9,DEWP_22.0,DEWP_22.1,DEWP_999.0,Wind_Speed_surfing_ideal,Year_Num
count,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,...,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0,17129.0
mean,25891.090723,1.472187,7.170257,0.086286,0.079514,0.073501,0.083601,0.086753,0.083776,0.081616,...,0.000409,0.000409,0.000642,0.000292,0.000292,0.000117,0.000117,0.000409,1.0,1.497752
std,14954.049177,0.504612,1.388752,0.280795,0.270548,0.260965,0.276796,0.281482,0.27706,0.273787,...,0.020212,0.020212,0.025334,0.017083,0.017083,0.010805,0.010805,0.020212,0.0,0.50001
min,4.0,0.52,4.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,12958.0,1.13,6.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
50%,25843.0,1.37,7.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,38851.0,1.69,7.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
max,52139.0,5.64,15.07,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [139]:
y.head(10)

Unnamed: 0,Wave_Size
0,surfing_ideal
1,surfing_ideal
2,surfing_unideal
3,surfing_ideal
4,surfing_ideal
5,surfing_unideal
6,surfing_unideal
7,surfing_unideal
8,surfing_unideal
9,surfing_unideal


In [140]:

# Check the size of our target values
y['Wave_Size'].value_counts()

surfing_unideal    14959
surfing_ideal       2170
Name: Wave_Size, dtype: int64

In [141]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'Wave_Size': 1})

In [142]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
brf_model = brf_model.fit(X_train, y_train)

# Making predictions using the testing data
predictions = brf_model.predict(X_test)

In [143]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9982800005890409

In [144]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,544,1
Actual 1,6,3732


In [145]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

  surfing_ideal       0.99      1.00      1.00      0.99      1.00      1.00       545
surfing_unideal       1.00      1.00      1.00      1.00      1.00      1.00      3738

    avg / total       1.00      1.00      1.00      1.00      1.00      1.00      4283



In [146]:
# List the features sorted in descending order by feature importance
# Calculate feature importance in the Random Forest model
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.4589534672595514, 'WVHT'),
 (0.05874315095584117, 'Unnamed: 0'),
 (0.0376738027398223, 'APD'),
 (0.022041985925427936, 'MM_1'),
 (0.01232502728741897, 'MM_7'),
 (0.01175207060783317, 'MM_8'),
 (0.008457630566571195, 'MM_12'),
 (0.00838635656474301, 'MM_9'),
 (0.007609214330759945, 'DPD_8.33'),
 (0.007148162479529347, 'Year_Num'),
 (0.004855301282433644, 'MM_6'),
 (0.004731056279181359, 'DPD_9.09'),
 (0.004465958727016229, 'MM_5'),
 (0.003739552083452749, 'DD_3'),
 (0.003129307407707878, 'DPD_7.69'),
 (0.0026919577457659733, 'MM_11'),
 (0.0026525698950556303, 'DPD_10.0'),
 (0.0026267446068849007, 'MM_2'),
 (0.002626403107067261, 'MM_3'),
 (0.0025457360757220873, 'DPD_14.81'),
 (0.0024531268814159545, 'MM_10'),
 (0.0024039066669560008, 'DD_7'),
 (0.0022308866379951056, 'DD_9'),
 (0.0020903262147578257, 'DD_8'),
 (0.0019991859881393053, 'MM_4'),
 (0.0019962998436916374, 'DPD_12.9'),
 (0.001968593902642773, 'DD_21'),
 (0.0019459853367742148, 'DD_15'),
 (0.001911898066070746, 'DD_5'),
 

In [148]:
# Easy Ensemble AdaBoost

# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fitting the model
eec_model = eec_model.fit(X_train, y_train)

# Making predictions using the testing data
predictions = eec_model.predict(X_test)

In [149]:
# Calculated the balanced accuracy score
y_pred = eec_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

1.0

In [150]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,545,0
Actual 1,0,3738


In [151]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

  surfing_ideal       1.00      1.00      1.00      1.00      1.00      1.00       545
surfing_unideal       1.00      1.00      1.00      1.00      1.00      1.00      3738

    avg / total       1.00      1.00      1.00      1.00      1.00      1.00      4283

