In [47]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate

In [6]:
InputDataDir='data/GOSHIP_Data/QCFilteredData.csv'

pos_encode=['none','raw','radians']
date_encode=['none','thermometer','sincos']

In [48]:
train_val = 40000
k=5
for i in np.arange(len(pos_encode)):
    for j in np.arange(len(date_encode)):
        
        # Load data
        GOSHIP_Data=pd.read_csv(InputDataDir)
        GOSHIP_Data=GOSHIP_Data.iloc[:,3:]
        
        # Shuffle data
        GOSHIP_Data = GOSHIP_Data.sample(frac=1)
        
        Position_Data = GOSHIP_Data.loc[:,['LATITUDE','LONGITUDE']]
        Date_Data = GOSHIP_Data.loc[:,'MONTH']
        
        # Standardize non-position/datae data
        scaler = StandardScaler().fit(GOSHIP_Data.iloc[:,2:-1])
        data_scaled = scaler.transform(GOSHIP_Data.iloc[:,2:-1])
        data_scaled = pd.DataFrame(data_scaled, columns = ['PRES','TEMP','SAL','OXY','NITR','PHSP','SILI'])
        
        # Get X data
        X=data_scaled.iloc[:,0:5] # Pres, temp, sal , oxy, nitr
        
        # Encode position data and add it to X-data
        if pos_encode[i]=='none':
            # Do not include postion data
            pass
        elif pos_encode[i]=='raw':
            # Use raw lat/lon data
            X['LAT']=GOSHIP_Data.loc[:,'LATITUDE'].to_numpy()
            X['LON']=GOSHIP_Data.loc[:,'LONGITUDE'].to_numpy()
        elif pos_encode[i]=='radians':
            # Use lat/lon encoded as radians
            X['LAT']=np.radians(GOSHIP_Data.loc[:,'LATITUDE'].to_numpy())
            X['LON']=np.radians(GOSHIP_Data.loc[:,'LONGITUDE'].to_numpy())
        
        # Encode date data and add it to X-data
        if date_encode[i]== 'none':
            pass
        elif date_encode[i]== 'thermometer':
            non_data=GOSHIP_Data.loc[:,'MONTH'].to_numpy()
            temp_data=np.zeros((len(non_data), 12))
            for m in np.arange(len(temp_data)):
                m_ind=non_data[m]  
                temp_data[m,:m_ind]=1
        
            EncodedFeatures=pd.DataFrame(temp_data, columns=['M1','M2','M3', 'M4', 'M5','M6', 'M7', 'M8', 'M9','M10','M11', 'M12'])
            #print(EncodedFeatures)
            X=pd.concat([X,EncodedFeatures], axis=1)
            #print(X)
        elif date_encode[i]== 'sincos':
            # Encode as a sin/cosine pair
            X['MONTH_SIN']=np.sin((2*np.pi*GOSHIP_Data.loc[:,'MONTH'])/max(GOSHIP_Data.loc[:,'MONTH']))
            X['MONTH_COS']=np.cos((2*np.pi*GOSHIP_Data.loc[:,'MONTH'])/max(GOSHIP_Data.loc[:,'MONTH']))
        
        # Get Y data
        Y_P = data_scaled.loc[:,'PHSP']
        Y_S = data_scaled.loc[:, 'SILI']
        
        # Run regression
        print('\nMonth encoding: ', pos_encode[i])
        print('Date encoding: ', date_encode[j])
        
        # Split in to test and train values
        X_train = X.iloc[:train_val,:]
        X_test=X.iloc[train_val:, :]
        
        Y_P_train=Y_P.iloc[:train_val]
        Y_P_test=Y_P.iloc[train_val:]
        
        Y_S_train=Y_S.iloc[:train_val]
        Y_S_test=Y_S.iloc[train_val:]
        
        # Run Linear regression
        print('\nLinear Regression with training size: ', train_val,' and testing size :', len(Y_S)-train_val)
        # Phosphate
        P_reg=LinearRegression().fit(X_train,Y_P_train)
        P_pred_train=P_reg.predict(X_train)
        P_pred_test=P_reg.predict(X_test)
        
        print('\n%% PHOSPHATE %%')
        print("score: ", P_reg.score(X_train,Y_P_train))
        print("coefficients: ", P_reg.coef_)
        print("intercept: ", P_reg.intercept_)
        print('training error: ', mean_squared_error(Y_P_train, P_pred_train))
        print('testing error: ', mean_squared_error(Y_P_test, P_pred_test))
        
        # Silicate
        S_reg=LinearRegression().fit(X_train,Y_S_train)
        S_pred_train=S_reg.predict(X_train)
        S_pred_test=S_reg.predict(X_test)
        
        print('\n%% SILICATE %%')
        print("score: ", S_reg.score(X_train,Y_S_train))
        print("coefficients: ", S_reg.coef_)
        print("intercept: ", S_reg.intercept_)
        print('training error: ', mean_squared_error(Y_S_train, S_pred_train))
        print('testing error: ', mean_squared_error(Y_S_test, S_pred_test))
        
        # Print Linear regression with cross validation 
        linear=LinearRegression()
        cv_results_P = cross_validate(linear, X, Y_P, cv=k,return_train_score=True)
        cv_results_S = cross_validate(linear, X, Y_S, cv=k,return_train_score=True)
        
        print('\nCross validation with k-folding: ', k)
        print('\n%% PHOSPHATE %%')
        print('train score: ', cv_results_P['train_score'])
        print('test score: ', cv_results_P['test_score'])
        
        print('\n%% SILICATE %%')
        print('train score: ', cv_results_S['train_score'])
        print('test score: ', cv_results_S['test_score'])
        
        
        
            


Month encoding:  none
Date encoding:  none

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9718386162981666
coefficients:  [ 0.01909467 -0.11922947 -0.04762304 -0.16109347  0.81473361]
intercept:  0.00035647910258718927
training error:  0.028164682031244264
testing error:  0.03037675606868572

%% SILICATE %%
score:  0.9325501574417971
coefficients:  [ 0.31207187 -0.77339247  0.29947843 -0.40902468 -0.31878013]
intercept:  -0.00017867678722623362
training error:  0.0675180964757489
testing error:  0.06609805808889774

Cross validation with k-folding:  5

%% PHOSPHATE %%
train score:  [0.9717652  0.97156629 0.97158234 0.97192334 0.9717207 ]
test score:  [0.97148052 0.97227229 0.97220079 0.97086125 0.97165921]

Month encoding:  none
Date encoding:  thermometer

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9719427583938491
coefficients:  [ 0.01917269 -0.11994186 -0.04713779 -0.16111475  0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Month encoding:  raw
Date encoding:  none

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9748920660623709
coefficients:  [ 8.80852959e-03 -1.11542523e-01 -1.95268650e-02 -1.22668233e-01
  8.40786374e-01  2.74952328e-03 -8.99044745e-05 -1.38777878e-16
 -1.63705451e-02  5.17416977e-02 -2.35963870e-02 -2.88482142e-03
 -6.09353765e-02 -6.09353765e-02 -6.09353765e-02 -6.09353765e-02
 -6.09353765e-02  5.57271522e-04  3.06724488e-01]
intercept:  0.16829922072277378
training error:  0.025145662375066734
testing error:  0.02699763018368408

%% SILICATE %%
score:  0.9483805276632371
coefficients:  [ 3.80694280e-01 -5.75391752e-01  2.53014582e-01 -3.76498732e-01
 -2.27734424e-01 -2.37602566e-02  1.97667974e-04  1.66533454e-16
  2.77571635e-02  2.05863350e-02 -1.57092014e-02 -1.09294517e-01
 -1.13252901e-03 -1.13252901e-03 -1.13252901e-03 -1.13252901e-03
 -1.13252901e-03  5.03719481e-02 -7.46267412e-02]
intercept:  -1.4331475571190067
training er

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Month encoding:  raw
Date encoding:  thermometer

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9746517154208968
coefficients:  [ 9.13411178e-03 -1.15113347e-01 -2.12836600e-02 -1.26205390e-01
  8.36444630e-01  2.71869089e-03 -8.83726715e-05  2.77555756e-17
 -1.68389685e-02  5.11850332e-02 -2.29731768e-02 -3.54489531e-03
 -6.08113365e-02 -6.08113365e-02 -6.08113365e-02 -6.08113365e-02
 -6.08113365e-02  1.67742352e-03  3.05470344e-01]
intercept:  0.16707343350355208
training error:  0.025301849528707045
testing error:  0.024404095611620578

%% SILICATE %%
score:  0.9484938846596972
coefficients:  [ 3.81859641e-01 -5.75798906e-01  2.51767616e-01 -3.78772781e-01
 -2.29375745e-01 -2.38455073e-02  1.99868595e-04  0.00000000e+00
  2.80073727e-02  1.96377689e-02 -1.65461837e-02 -1.09647564e-01
 -5.74987131e-05 -5.74987131e-05 -5.74987131e-05 -5.74987131e-05
 -5.74987131e-05  4.70554097e-02 -7.60840907e-02]
intercept:  -1.4384666262497754
tra

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Month encoding:  raw
Date encoding:  sincos

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9748595762923965
coefficients:  [ 9.12007201e-03 -1.12388491e-01 -2.11982821e-02 -1.24365774e-01
  8.39085193e-01  2.66382332e-03 -8.66821816e-05 -1.38777878e-16
 -1.53590827e-02  5.09689317e-02 -2.37591311e-02 -1.78708316e-03
 -6.09861841e-02 -6.09861841e-02 -6.09861841e-02 -6.09861841e-02
 -6.09861841e-02  6.70133489e-04  3.04549222e-01]
intercept:  0.16329623002642685
training error:  0.02511013292712917
testing error:  0.027574385661328634

%% SILICATE %%
score:  0.9482407995563071
coefficients:  [ 3.79726063e-01 -5.75282867e-01  2.54105232e-01 -3.75402169e-01
 -2.26723196e-01 -2.37484418e-02  1.89217524e-04  1.66533454e-16
  2.89948113e-02  1.86607706e-02 -1.56273009e-02 -1.06416584e-01
 -1.16358341e-03 -1.16358341e-03 -1.16358341e-03 -1.16358341e-03
 -1.16358341e-03  4.89506134e-02 -7.46747763e-02]
intercept:  -1.4329954062682342
training 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Month encoding:  radians
Date encoding:  none

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9723313717348531
coefficients:  [ 0.00988926 -0.132563   -0.03284903 -0.1481522   0.81506509  0.13081081
 -0.01080485  0.00431067  0.00171663]
intercept:  0.13389947864387455
training error:  0.027847176591222295
testing error:  0.027978902945740222

%% SILICATE %%
score:  0.9470193257126548
coefficients:  [ 3.78073171e-01 -5.86330834e-01  2.50482938e-01 -3.86155227e-01
 -2.39300074e-01 -1.35690442e+00  6.61997270e-03  3.11415621e-03
  8.54699583e-04]
intercept:  -1.420844478226268
training error:  0.053062269567818815
testing error:  0.05327283430286291

Cross validation with k-folding:  5

%% PHOSPHATE %%
train score:  [0.97207081 0.97255338 0.97172779 0.97213729 0.9722471 ]
test score:  [0.97244682 0.97056126 0.97372308 0.97218139 0.97172219]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Month encoding:  radians
Date encoding:  thermometer

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9720818350077336
coefficients:  [ 0.00935072 -0.13163021 -0.0305767  -0.14576441  0.81691429  0.13169929
 -0.01079293 -0.00173867  0.00180302]
intercept:  0.13861458514617706
training error:  0.027893741614400995
testing error:  0.027196711172280157

%% SILICATE %%
score:  0.9468494016758425
coefficients:  [ 0.37929167 -0.58573703  0.24812898 -0.38861398 -0.24056817 -1.37075478
  0.00696684  0.0019269   0.0020832 ]
intercept:  -1.4350089160587325
training error:  0.053109079673957095
testing error:  0.05248474614280373

Cross validation with k-folding:  5

%% PHOSPHATE %%
train score:  [0.97191617 0.97271472 0.97169976 0.97234984 0.97204817]
test score:  [0.97302358 0.96976283 0.97388285 0.97128855 0.97252844]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Month encoding:  radians
Date encoding:  sincos

Linear Regression with training size:  40000  and testing size : 2412

%% PHOSPHATE %%
score:  0.9720328813161994
coefficients:  [ 0.0096136  -0.12867531 -0.03027788 -0.14348747  0.8198531   0.12824927
 -0.01078107 -0.00372363 -0.00223244]
intercept:  0.13810696888377566
training error:  0.027889788896916155
testing error:  0.027250847860047462

%% SILICATE %%
score:  0.9468523449126612
coefficients:  [ 0.37743913 -0.58931737  0.24969718 -0.38817521 -0.24166958 -1.35032224
  0.00643339  0.00344598  0.00312162]
intercept:  -1.4151153141405346
training error:  0.05311390585901936
testing error:  0.05239249466028057

Cross validation with k-folding:  5

%% PHOSPHATE %%
train score:  [0.97294331 0.97170465 0.97186216 0.97231805 0.97188708]
test score:  [0.96856101 0.97379495 0.97323559 0.97140837 0.97311223]
