In [36]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, Normalizer
import xgboost as xgb


os.getcwd()

'c:\\Users\\johnc\\Documents\\Python Scripts\\ocean-depth-characterization\\EDA'

In [3]:
PROFILE_DATA_PATH = "../data/profileData.csv"
SAT_DATA_PATH = "../data/satData.csv"
drop_cols = ['LT_SAT_SST_SD', 'LT_SAT_SST_MED', 'LT_SAT_CHL_SD', 'LT_SAT_CHL_MED', 'LT_SAT_BBP_SD', 'LT_SAT_BBP_MED']

profile_df = pd.read_csv(PROFILE_DATA_PATH)
sat_df = pd.read_csv(SAT_DATA_PATH)

out = pd.cut(profile_df.PRES, bins=50, labels=[i for i in range(50)])
profile_df['depth_bin'] = out

depth_profiles = profile_df[['float', 'cycleNumber', 'depth_bin', 'CHLA', 'BBP700']] \
    .groupby(['float', 'cycleNumber', 'depth_bin']).mean().reset_index().dropna()

df_depth = depth_profiles.merge(sat_df, on=['float', 'cycleNumber']).drop(drop_cols, axis=1).dropna()
df_depth['date'] = pd.to_datetime(df_depth.date, format='%Y-%m-%d %H:%M:%S')

In [15]:
def model_scoring(model, X, y, argument_dict=None, cv=5):
    # for sklearn API compatible gradient boosting models
    
    if not argument_dict:
        # train vanilla model
        cv_results = cross_validate(model, X, y, cv=cv,
                                    scoring=['neg_root_mean_squared_error', 'r2', 'neg_mean_absolute_percentage_error', 'neg_mean_absolute_error'],
                                    return_estimator=True,
                                    return_train_score=True)
        return cv_results
    else:
        # do a grid search
        gs_results = GridSearchCV(model,
                                  param_grid = [argument_dict], cv=cv,
                                  scoring=['neg_root_mean_squared_error', 'r2', 'neg_mean_absolute_percentage_error', 'neg_mean_absolute_error'],
                                  refit='neg_root_mean_squared_error',
                                  return_train_score=True)
        gs_results.fit(X, y)
        return gs_results

### Preprocessing

In [11]:
bins_to_use = [0, 2, 4, 6, 8, 10]
df = df_depth[df_depth.depth_bin.isin(bins_to_use)]

In [12]:
# turn date into radians
df['date_doy'] = df['date'].apply(lambda x: pd.Period(x, freq='D').day_of_year)
df['date_doy_rad'] = df['date_doy'] * (np.pi /182.625)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_doy'] = df['date'].apply(lambda x: pd.Period(x, freq='D').day_of_year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_doy_rad'] = df['date_doy'] * (np.pi /182.625)


In [6]:
score = list()

In [18]:
Xc_boost = df[["latitude", "longitude", "date_doy_rad", "depth_bin", "MO_SAT_CHL", "MO_SAT_BBP", "MO_SAT_SST"]]
yc_boost = df["CHLA"]

X_tr, X_te, y_tr, y_te = train_test_split(Xc_boost, yc_boost, test_size=0.15, random_state=23)

In [None]:
gbr = GradientBoostingRegressor()

gbr_scoring = model_scoring(gbr, X_tr, y_tr)

In [41]:
xgbr = xgb.XGBRegressor()

xgbr_scoring = model_scoring(xgbr, X_tr.to_numpy(), y_tr.to_numpy())

In [43]:
xgbr_scoring["test_r2"]

array([0.7030797 , 0.68809621, 0.69670025, 0.66998076, 0.69835647])