### Read the Buoy Data

In [11]:
import pandas as pd
Buoydata_modified = pd.read_csv("Buoydata_modified.csv", sep=",")
print(Buoydata_modified.head(n=10))
Buoydata_modified.info()


  station_id longitude_degrees_east latitude_degrees_north  \
0         M2             -5.424.046             53.481.003   
1         M2             -5.424.046             53.481.003   
2         M2             -5.424.046             53.481.003   
3         M2             -5.424.046             53.481.003   
4         M2             -5.424.046             53.481.003   
5         M2             -5.424.046             53.481.003   
6         M2             -5.424.046             53.481.003   
7         M2             -5.424.046             53.481.003   
8         M2             -5.424.046             53.481.003   
9         M2             -5.424.046             53.481.003   

               time_UTC  atmospheric_pressure_mb  wind_direction_degrees_true  \
0  2011-03-31T01:00:00Z              1001.509846                          174   
1  2011-03-31T02:00:00Z               999.160000                          171   
2  2011-03-31T03:00:00Z               997.598000                          

In [7]:
# third cell -
import pandas as pd
Buoydata_modified = pd.read_csv("Buoydata_modified.csv", sep=",")
  

### Train/Test Data Split

In [12]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(Buoydata_modified, 
                        test_size=0.2, random_state=123)
print('Train size: ', len(train_set), 'Test size: ', len(test_set))


Train size:  78464 Test size:  19616


### Read Data, Select Features in DataSet for X and y, linear Regression, Train Model, Evaluate Model

In [13]:
import pandas as pd
Buoydata_modified = pd.read_csv("Buoydata_modified.csv", sep=",")

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import numpy as np

X = train_set[['wind_speed_kn', 'gust_kn']]
y = train_set['wave_height_m']
X_test = test_set[['wind_speed_kn', 'gust_kn']]
y_test = test_set['wave_height_m']

lr_model = LinearRegression()
lr_model.fit(X, y)

y_pred = lr_model.predict(X)
print('Results for linear regression on training data')
print(' Default settings')
print('Internal parameters:')
print(' Bias is ', lr_model.intercept_)
print(' Coefficients', lr_model.coef_)
print(' Score', lr_model.score(X, y))
print('MAE is ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2 ', r2_score(y, y_pred))


y_test_pred = lr_model.predict(X_test, y_test)
print()
print('Results for linear regression on test data')
print('MAE is ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test,
y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2 ', r2_score(y_test,y_test_pred))


Results for linear regression on training data
 Default settings
Internal parameters:
 Bias is  0.44927198923704625
 Coefficients [-0.23049854  0.25078581]
 Score 0.40555324922132097
MAE is  0.7664062578769086
RMSE is  1.0726032541360306
MSE is  1.1504777407832023
R^2  0.40555324922132097


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype

TypeError: LinearModel.predict() takes 2 positional arguments but 3 were given

### Create pipelined models on the Buoy data set with the same two features from before and with wave_height_m as the target.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

X = train_set[['wind_speed_kn', 'gust_kn']]
y = train_set['wave_height_m']

X_test = test_set[['wind_speed_kn', 'gust_kn']]
y_test = test_set['wave_height_m']


imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
scale = StandardScaler()

lr_model = LinearRegression()

stages = [('imp_mean', imp_mean),
          ('scale', scale),
          ('lr_model', lr_model),
         ]
pipe_model = Pipeline(stages)


pipe_model.fit(X,y)

y_pred = pipe_model.predict(X.values)
print('Results for pipeline linear regression on training data')
#print('  Default settings')
#print('Internal parameters:')
print('   Bias is ', pipe_model.predict([[8,2158]]))
#print('   Coefficients', pipe_model.coef_)
print('   Score', pipe_model.score(X.values,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

y_test_pred = pipe_model.predict(X_test.values)
print()
print('Results for pipeline linear regression on test data')
print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))


Results for pipeline linear regression on training data
   Bias is  [539.80105946]
   Score 0.40555324922132097
MAE is   0.7664062578769085
RMSE is  1.0726032541360306
MSE is  1.1504777407832023
R^2     0.40555324922132097

Results for pipeline linear regression on test data
MAE is   0.766383851881842
RMSE is  1.0936165245522262
MSE is  1.19599710277369
R^2     0.38496313693879247


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if