# Preprocessing the Data
---

Now that we've got some data, let's label it with our target variable and make a basic logistic regression model! 

In [94]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xarray as xr

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [83]:
CTD_data = pd.read_csv('CTD_01-01_09-16.csv')

In [84]:
CTD_data

Unnamed: 0,time,seawater_pressure,practical_salinity,seawater_temperature,corrected_dissolved_oxygen,lat,lon
0,2017-01-01 00:00:00.317937152,200.479041,33.886699,7.785772,106.590066,44.37415,-124.95648
1,2017-01-01 00:00:01.318152192,200.473701,33.886692,7.786201,106.566439,44.37415,-124.95648
2,2017-01-01 00:00:02.318157312,200.467264,33.886432,7.786262,106.633814,44.37415,-124.95648
3,2017-01-01 00:00:03.317954560,200.465142,33.886519,7.785895,106.511052,44.37415,-124.95648
4,2017-01-01 00:00:04.318168576,200.466196,33.886425,7.785711,106.702620,44.37415,-124.95648
...,...,...,...,...,...,...,...
21517478,2017-09-16 12:13:36.480951808,55.887428,33.515015,9.740724,174.247648,44.37415,-124.95648
21517479,2017-09-16 12:13:37.480647680,55.495146,33.513691,9.733299,174.247264,44.37415,-124.95648
21517480,2017-09-16 12:13:38.480967680,55.449183,33.524510,9.725553,174.128022,44.37415,-124.95648
21517481,2017-09-16 12:13:39.480977408,55.550728,33.524760,9.729361,174.092489,44.37415,-124.95648


In [86]:
CTD_data.describe()

Unnamed: 0,seawater_pressure,practical_salinity,seawater_temperature,corrected_dissolved_oxygen,lat,lon
count,21517480.0,21517480.0,21517480.0,21517480.0,21517480.0,21517480.0
mean,189.2278,33.82338,7.923279,103.2202,44.37415,-124.9565
std,39.56323,0.2838685,0.7240315,28.5486,2.273737e-12,5.087486e-12
min,13.55533,26.68256,6.647794,34.93388,44.37415,-124.9565
25%,202.6819,33.84952,7.368913,87.60923,44.37415,-124.9565
50%,203.7541,33.90711,7.839711,100.2137,44.37415,-124.9565
75%,204.4697,33.9401,8.267647,112.5449,44.37415,-124.9565
max,290.6436,34.02038,15.45517,326.1326,44.37415,-124.9565


In [92]:
type(CTD_data)

pandas.core.frame.DataFrame

In [115]:
CTD_data_xr = CTD_data.to_xarray().swap_dims({'index':'seawater_pressure'})

In [116]:
type(CTD_data_xr)

xarray.core.dataset.Dataset

In [117]:
CTD_data_xr

In [118]:
depth = 'seawater_pressure'
bin_list = list(np.arange(0, 250, 50))
features = ['practical_salinity', 'seawater_temperature', 'corrected_dissolved_oxygen']
binned_CTD = pd.DataFrame()
for col in features:
    s = CTD_data_xr[col]
    binned_CTD[col] = s.groupby_bins(group='seawater_pressure', bins=bin_list)

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


In [119]:
binned_CTD

Unnamed: 0,practical_salinity,seawater_temperature,corrected_dissolved_oxygen
0,"((150.0, 200.0], [<xarray.DataArray 'practical...","((150.0, 200.0], [<xarray.DataArray 'seawater_...","((150.0, 200.0], [<xarray.DataArray 'corrected..."
1,"((100.0, 150.0], [<xarray.DataArray 'practical...","((100.0, 150.0], [<xarray.DataArray 'seawater_...","((100.0, 150.0], [<xarray.DataArray 'corrected..."
2,"((50.0, 100.0], [<xarray.DataArray 'practical_...","((50.0, 100.0], [<xarray.DataArray 'seawater_t...","((50.0, 100.0], [<xarray.DataArray 'corrected_..."
3,"((0.0, 50.0], [<xarray.DataArray 'practical_sa...","((0.0, 50.0], [<xarray.DataArray 'seawater_tem...","((0.0, 50.0], [<xarray.DataArray 'corrected_di..."


In [80]:
METBK_daily = pd.read_csv('METBK_daily.csv')
CTD_daily = pd.read_csv('CTD_daily.csv')
CUTI = pd.read_csv('cuti.csv')

In [81]:
CTD_daily

Unnamed: 0,time,seawater_pressure,practical_salinity,seawater_temperature,corrected_dissolved_oxygen,lat,lon
0,2017-01-01,133.662894,33.470424,9.106671,157.425085,44.37415,-124.95648
1,2017-01-02,133.842129,33.470867,9.046603,155.220764,44.37415,-124.95648
2,2017-01-03,130.500264,33.455696,9.063661,156.355440,44.37415,-124.95648
3,2017-01-04,130.413681,33.471092,9.038018,154.870646,44.37415,-124.95648
4,2017-01-05,130.503017,33.484965,8.982736,154.908087,44.37415,-124.95648
...,...,...,...,...,...,...,...
248,2017-09-12,128.906938,33.701076,8.582766,123.861366,44.37415,-124.95648
249,2017-09-13,128.363783,33.702985,8.680365,119.646004,44.37415,-124.95648
250,2017-09-14,128.197780,33.714112,8.743877,129.043682,44.37415,-124.95648
251,2017-09-15,128.067688,33.716564,8.802753,124.152806,44.37415,-124.95648


In [82]:
CTD_daily['seawater_pressure'].describe()

count    253.000000
mean     188.850229
std       29.759336
min      123.285328
25%      203.876549
50%      203.956560
75%      204.037210
max      204.924699
Name: seawater_pressure, dtype: float64

In [47]:
METBK_daily

Unnamed: 0,time,Sea Surface Temperature (deg_C),Eastward Wind Velocity (m s-1),Northward Wind Velocity (m s-1)
0,2017-01-01,11.247412,6.413056,-5.371468
1,2017-01-02,11.149430,-1.148830,-1.207261
2,2017-01-03,11.089363,-5.203772,2.533484
3,2017-01-04,10.926763,-6.222477,-4.027738
4,2017-01-05,10.756387,-6.911579,-3.048631
...,...,...,...,...
313,2017-11-10,12.468681,-0.408066,4.630594
314,2017-11-11,12.591080,0.274552,4.486309
315,2017-11-12,12.417127,-0.501287,8.977260
316,2017-11-13,11.889144,4.022042,10.856365


Add METBK sea surface temperature to CTD dataframe

Add CUTI index to CTD dataframe

In [48]:
CUTI

Unnamed: 0,year,month,day,31N,32N,33N,34N,35N,36N,37N,...,39N,40N,41N,42N,43N,44N,45N,46N,47N,time
0,2017,1,1,0.304,0.348,0.805,0.904,0.612,0.390,0.712,...,1.950,3.470,3.912,2.677,1.937,1.731,0.619,1.463,1.365,2017-01-01
1,2017,1,2,0.684,0.432,0.706,0.877,0.700,0.479,0.424,...,1.148,1.376,1.111,1.063,1.066,1.308,0.574,0.625,0.709,2017-01-02
2,2017,1,3,0.443,0.284,0.273,0.055,0.076,0.093,-0.186,...,0.011,-0.014,-0.386,0.296,0.306,0.360,0.418,0.542,-0.171,2017-01-03
3,2017,1,4,0.130,0.124,0.064,-0.031,-0.588,-0.436,-0.878,...,0.118,0.052,-0.041,0.423,0.479,0.742,0.694,-0.226,-0.382,2017-01-04
4,2017,1,5,0.023,-0.029,-0.189,-0.134,-0.225,0.126,-0.098,...,0.562,1.107,0.565,0.751,1.013,1.469,1.337,-0.287,-0.294,2017-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2017,12,27,-0.120,0.197,-0.119,-0.043,0.519,1.084,0.554,...,0.379,-0.008,0.223,0.465,0.261,0.256,0.101,0.282,-0.061,2017-12-27
361,2017,12,28,0.013,0.048,0.076,0.394,0.055,0.269,0.626,...,0.493,0.157,-0.286,0.031,-0.034,0.214,-0.072,-0.110,-0.040,2017-12-28
362,2017,12,29,0.104,-0.038,-0.029,-0.191,0.011,0.814,0.202,...,-0.113,-0.101,-0.475,-0.094,-0.472,-0.814,-1.247,-0.602,-0.099,2017-12-29
363,2017,12,30,-0.061,0.172,0.074,-0.441,-0.289,-0.349,-0.157,...,-0.082,-0.726,-0.722,-0.223,-0.458,-0.675,-0.523,-0.272,0.012,2017-12-30


In [49]:
CUTI['time'][0] == METBK_daily['time'][0] # == CTD_daily['time'][0]

True

In [50]:
METBK_daily['time'].max()

'2017-11-14'

In [51]:
CUTI[:318]['time'].max()

'2017-11-14'

In [52]:
CUTI[:318]['44N']

0      1.731
1      1.308
2      0.360
3      0.742
4      1.469
       ...  
313   -1.239
314   -0.740
315   -0.543
316   -2.091
317   -2.339
Name: 44N, Length: 318, dtype: float64

In [53]:
METBK_daily['CUTI'] = CUTI[:318]['44N']

In [54]:
METBK_daily

Unnamed: 0,time,Sea Surface Temperature (deg_C),Eastward Wind Velocity (m s-1),Northward Wind Velocity (m s-1),CUTI
0,2017-01-01,11.247412,6.413056,-5.371468,1.731
1,2017-01-02,11.149430,-1.148830,-1.207261,1.308
2,2017-01-03,11.089363,-5.203772,2.533484,0.360
3,2017-01-04,10.926763,-6.222477,-4.027738,0.742
4,2017-01-05,10.756387,-6.911579,-3.048631,1.469
...,...,...,...,...,...
313,2017-11-10,12.468681,-0.408066,4.630594,-1.239
314,2017-11-11,12.591080,0.274552,4.486309,-0.740
315,2017-11-12,12.417127,-0.501287,8.977260,-0.543
316,2017-11-13,11.889144,4.022042,10.856365,-2.091


In [55]:
METBK_daily['upwelling'] = METBK_daily['CUTI'].apply(lambda x: 1 if x > 0 else 0)

In [56]:
METBK_daily['upwelling'].value_counts(normalize=True)

1    0.613208
0    0.386792
Name: upwelling, dtype: float64

--- 
### Now make a model ahhhh

In [57]:
METBK_units = {'sea_surface_temperature': ['ºC'],
 'met_windavg_mag_corr_east': ['m s-1'],
 'met_windavg_mag_corr_north': ['m s-1']}

In [58]:
METBK_daily = METBK_daily.rename(columns={'Sea Surface Temperature (deg_C)':'sea_surface_temperature',
                                            'Eastward Wind Velocity (m s-1)': 'met_windavg_mag_corr_east'.format(METBK_units['met_windavg_mag_corr_east'][0]), 
                                            'Northward Wind Velocity (m s-1)': 'met_windavg_mag_corr_north'.format(METBK_units['met_windavg_mag_corr_north'][0])})

In [59]:
METBK_daily

Unnamed: 0,time,sea_surface_temperature,met_windavg_mag_corr_east,met_windavg_mag_corr_north,CUTI,upwelling
0,2017-01-01,11.247412,6.413056,-5.371468,1.731,1
1,2017-01-02,11.149430,-1.148830,-1.207261,1.308,1
2,2017-01-03,11.089363,-5.203772,2.533484,0.360,1
3,2017-01-04,10.926763,-6.222477,-4.027738,0.742,1
4,2017-01-05,10.756387,-6.911579,-3.048631,1.469,1
...,...,...,...,...,...,...
313,2017-11-10,12.468681,-0.408066,4.630594,-1.239,0
314,2017-11-11,12.591080,0.274552,4.486309,-0.740,0
315,2017-11-12,12.417127,-0.501287,8.977260,-0.543,0
316,2017-11-13,11.889144,4.022042,10.856365,-2.091,0


In [64]:
features = ['sea_surface_temperature', 'met_windavg_mag_corr_east', 'met_windavg_mag_corr_north']
X = METBK_daily[features]
y = METBK_daily['upwelling']

In [65]:
X

Unnamed: 0,sea_surface_temperature,met_windavg_mag_corr_east,met_windavg_mag_corr_north
0,11.247412,6.413056,-5.371468
1,11.149430,-1.148830,-1.207261
2,11.089363,-5.203772,2.533484
3,10.926763,-6.222477,-4.027738
4,10.756387,-6.911579,-3.048631
...,...,...,...
313,12.468681,-0.408066,4.630594
314,12.591080,0.274552,4.486309
315,12.417127,-0.501287,8.977260
316,11.889144,4.022042,10.856365


In [66]:
y

0      1
1      1
2      1
3      1
4      1
      ..
313    0
314    0
315    0
316    0
317    0
Name: upwelling, Length: 318, dtype: int64

In [70]:
sc = StandardScaler()

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [73]:
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [75]:
logreg = LogisticRegression()

In [76]:
logreg.fit(X_train_sc, y_train)

LogisticRegression()

In [77]:
logreg.score(X_train_sc, y_train)

0.8319327731092437

In [78]:
logreg.score(X_test_sc, y_test)

0.8125

In [79]:
X_train

Unnamed: 0,sea_surface_temperature,met_windavg_mag_corr_east,met_windavg_mag_corr_north
102,10.340394,4.393650,5.505326
79,10.432781,-1.444948,6.717467
280,14.719081,0.921250,-5.099964
186,13.437458,-0.098272,-3.034137
155,14.538817,-0.589584,-6.848077
...,...,...,...
233,14.815643,1.153536,-2.578816
58,10.488763,5.111347,-2.618992
195,14.060986,1.617900,-2.921389
74,10.602498,3.696034,0.242052
