In [34]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [9]:
# Load Data
InputDataDir='data/GOSHIP_Data/QCFilteredData.csv'
GOSHIP_Data=pd.read_csv(InputDataDir)
GOSHIP_Data=GOSHIP_Data.iloc[:,3:]
print(GOSHIP_Data)

       LATITUDE  LONGITUDE    PRES     TEMP      SAL    OXY   NITR   PHSP  \
0       -45.000   146.2200     4.1  15.7440  35.4240  241.6   2.37  0.140   
1       -45.000   146.2200    22.0  15.5310  35.3940  241.8   2.54  0.150   
2       -45.000   146.2200    51.5  14.4770  35.2600  239.1   4.68  0.280   
3       -45.000   146.2200   202.8  12.3210  35.1620  224.9  10.52  0.590   
4       -45.000   146.2200   498.6   9.0490  34.6360  247.7  16.62  0.960   
...         ...        ...     ...      ...      ...    ...    ...    ...   
42407   -60.013   -30.8953  1697.8   0.0694  34.6681  218.5  32.60  2.251   
42408   -60.013   -30.8953  1996.5  -0.0152  34.6641  222.7  32.40  2.247   
42409   -60.013   -30.8953  2299.0  -0.0551  34.6618  224.9  32.50  2.247   
42410   -60.013   -30.8953  2597.7  -0.0707  34.6601  226.8  32.40  2.248   
42411   -60.013   -30.8953  2953.5  -0.0603  34.6595  231.2  32.30  2.246   

         SILI  MONTH  
0        0.15      1  
1        0.18      1  
2     

In [17]:
# Standardize data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(GOSHIP_Data.iloc[:,:-1])
data_scaled = scaler.transform(GOSHIP_Data.iloc[:,:-1])
data_scaled = pd.DataFrame(data_scaled, columns = ['LATITUDE', 'LONGITUDE','PRES','TEMP','SAL','OXY','NITR','PHSP','SILI'])
data_scaled['MONTH']=GOSHIP_Data.loc[:,'MONTH']
print(data_scaled)


       LATITUDE  LONGITUDE      PRES      TEMP       SAL       OXY      NITR  \
0      1.972425   1.207289 -0.967296  5.592470  3.041097  0.210208 -5.300686   
1      1.972425   1.207289 -0.953505  5.507665  2.941475  0.214179 -5.267945   
2      1.972425   1.207289 -0.930776  5.088020  2.496495  0.160573 -4.855791   
3      1.972425   1.207289 -0.814206  4.229619  2.171062 -0.121352 -3.731035   
4      1.972425   1.207289 -0.586305  2.926888  0.424349  0.331317 -2.556204   
...         ...        ...       ...       ...       ...       ...       ...   
42407 -0.016004  -0.287745  0.337627 -0.648296  0.530945 -0.248417  0.521467   
42408 -0.016004  -0.287745  0.567762 -0.681979  0.517662 -0.165031  0.482948   
42409 -0.016004  -0.287745  0.800826 -0.697865  0.510025 -0.121352  0.502208   
42410 -0.016004  -0.287745  1.030961 -0.704076  0.504379 -0.083630  0.482948   
42411 -0.016004  -0.287745  1.305090 -0.699935  0.502387  0.003727  0.463689   

           PHSP      SILI  MONTH  
0   

In [45]:
# Cross-Validate: Phosphate
k=10
# Shuffle Data
data_shuffled = data_scaled.sample(frac=1)
#print(data_shuffled)
# Create data chunks index
index_list=np.array(data_shuffled.index.to_list())
chunk_index=np.array_split(index_list,k)
#print(chunk_index)

P_train_error = np.zeros(k)
P_train_error[:]=np.NaN

P_test_error = np.zeros(k)
P_test_error[:]=np.NaN
for i in np.arange(k):
    
    
    test_ind=chunk_index[i]
    train_data=data_shuffled.copy()
    train_data=train_data.drop(index=test_ind)
    
    # training data
    x_train=train_data.iloc[:,[0,1,2,3,4,5,6,9]]
    y_train=train_data.iloc[:,7]
    
    # testing data
    x_test=data_shuffled.iloc[test_ind, [0,1,2,3,4,5,6,9]]
    y_test=data_shuffled.iloc[test_ind, 7]
    
    P_reg=LinearRegression().fit(x_train,y_train)
    P_pred_train=P_reg.predict(x_train)
    P_pred_test=P_reg.predict(x_test)
    
    # Calculate training error 
    P_train_error[i]=mean_squared_error(y_train, P_pred_train)
    
    # Calculate testing error
    P_test_error[i]=mean_squared_error(y_test, P_pred_test)


print('%% PHOSPHATE Cross validate k=',k,' %%')
print('Training error: ',np.nanmean(train_error))
print('Testing error: ', np.nanmean(test_error))

%% PHOSPHATE Cross validate k= 10  %%
Training error:  0.05247707159289501
Testing error:  0.05247729718562387


In [46]:
# Cross-Validate: Silicate
k=10
# Shuffle Data
data_shuffled = data_scaled.sample(frac=1)
#print(data_shuffled)
# Create data chunks index
index_list=np.array(data_shuffled.index.to_list())
chunk_index=np.array_split(index_list,k)
#print(chunk_index)

S_train_error = np.zeros(k)
S_train_error[:]=np.NaN

S_test_error = np.zeros(k)
S_test_error[:]=np.NaN
for i in np.arange(k):
    
    
    test_ind=chunk_index[i]
    train_data=data_shuffled.copy()
    train_data=train_data.drop(index=test_ind)
    
    # training data
    x_train=train_data.iloc[:,[0,1,2,3,4,5,6,9]]
    y_train=train_data.iloc[:,8]
    
    # testing data
    x_test=data_shuffled.iloc[test_ind, [0,1,2,3,4,5,6,9]]
    y_test=data_shuffled.iloc[test_ind, 8]
    
    S_reg=LinearRegression().fit(x_train,y_train)
    S_pred_train=S_reg.predict(x_train)
    S_pred_test=S_reg.predict(x_test)
    
    # Calculate training error 
    S_train_error[i]=mean_squared_error(y_train, S_pred_train)
    
    # Calculate testing error
    S_test_error[i]=mean_squared_error(y_test, S_pred_test)


print('%% SILICATE Cross validate k=',k,' %%')
print('Training error: ',np.nanmean(S_train_error))
print('Testing error: ', np.nanmean(S_test_error))

%% SILICATE Cross validate k= 10  %%
Training error:  0.05247670206040937
Testing error:  0.05248203559059931
