In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt

In [2]:
import os

# Directory that holds the input CSV files. The original hard-coded location is
# kept as the default; set GYROCARDIOGRAM_DATA_DIR to run the notebook on
# another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, because the loading
# cells build file names with plain string concatenation (path + "file.csv").
path = os.path.join(
    os.environ.get("GYROCARDIOGRAM_DATA_DIR", "/Users/ecem/Desktop/gyrocardiogram/phase2/"),
    "",
)

## Load data

In [3]:
# Positional column indices handed to .iloc when the feature table is loaded.
# In the recorded run they select 'SE x', 'SR y', 'SC x', 'SC y' and 'SS z'.
selected_cols = [
    0,
    4,
    6,
    7,
    11,
]

In [4]:
# Read the feature table (the first CSV column becomes the row index) and keep
# only the columns at the positions listed in selected_cols.
features_all = pd.read_csv(path + "scg_selected_features_10sec.csv", index_col=0)
X = features_all.iloc[:, selected_cols]

In [5]:
# Number of missing values per selected feature column.
X.isna().sum()

SE x    0
SR y    0
SC x    7
SC y    2
SS z    0
dtype: int64

In [6]:
# Collect the index *labels* of rows with a missing 'SC x' or 'SC y' value.
# np.where(...) returns integer positions, but this list is handed to
# DataFrame.drop(), which looks rows up by label; positions and labels only
# coincide when the index is exactly 0..n-1.
missing_sc_x = X.index[X['SC x'].isnull()]
missing_sc_y = X.index[X['SC y'].isnull()]

# Keep the original ordering ('SC x' rows first, then 'SC y' rows) and remove
# duplicates for rows that are missing both values.
idx = list(dict.fromkeys([*missing_sc_x, *missing_sc_y]))
idx

[3504, 3505, 3508, 3515, 3517, 3736, 3739, 2399, 2875]

In [7]:
# Row positions at which 'SC y' is missing (cross-check of the 'SC y' part of idx).
np.where(X["SC y"].isna())

(array([2399, 2875]),)

In [8]:
# Remove the rows listed in idx. Only labels that are still present are
# dropped, so re-running this cell does not raise a KeyError once the rows are
# already gone.
X = X.drop(index=X.index.intersection(idx))

# The feature rows must be complete before they are fed to the models below.
assert not X.isnull().any().any(), "X still contains missing values after dropping idx"
X

Unnamed: 0,SE x,SR y,SC x,SC y,SS z
0,3.321794,0.898047,0.501910,0.498958,0.288676
1,3.321919,0.898828,0.499997,0.499516,0.288668
2,3.321912,0.899219,0.499899,0.499979,0.288667
3,3.321833,0.902344,0.499137,0.501352,0.288681
4,3.321859,0.899219,0.501234,0.499578,0.288674
...,...,...,...,...,...
4409,3.315372,0.912891,0.507174,0.510179,0.288629
4410,3.316430,0.911719,0.507915,0.510864,0.288646
4411,3.317626,0.899609,0.506036,0.507815,0.288672
4412,3.317750,0.886328,0.502737,0.504285,0.288733


In [9]:
# Load the target table, drop the same rows that were removed from X, and keep
# the last column as the regression target ('LVPW (mm)' in the recorded run).
target_df = pd.read_csv(path + "scg_target_df_10sec.csv", index_col=0)
y = target_df.drop(idx).iloc[:, -1]
y

0       10.0
1       10.0
2       10.0
3       10.0
4       10.0
        ... 
4409    11.0
4410    11.0
4411    11.0
4412    11.0
4413    11.0
Name: LVPW (mm), Length: 4405, dtype: float64

## Split the data into training and validation sets

In [10]:
# Hold out 20 % of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

## Now, let's try different models and see which one outperforms the others:

### xgb:

In [11]:
import xgboost as xgb

# XGBoost regressor with library defaults; only the seed is fixed.
model_xgb = xgb.XGBRegressor(random_state=42)

# Train on the training split. As the last expression of the cell, the fitted
# estimator is also what the cell displays.
model_xgb.fit(X_train, y_train)

In [12]:
# Predict the target for the held-out validation rows with the XGBoost model.
predictions_xgb = model_xgb.predict(X_valid)

In [13]:
# Root-mean-squared error of the XGBoost predictions on the validation split.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_xgb))
print("RMSE : % f" % rmse)

RMSE :  1.708541


In [14]:
# Mean absolute percentage error of the XGBoost predictions, printed in percent.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_xgb)
print("MAPE: %f" % (mape * 100))

MAPE: 12.029282


### linear regression:

In [15]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with default settings.
model_reg = LinearRegression()

# Fit on the training split; the fitted estimator is the cell output.
model_reg.fit(X_train, y_train)

In [16]:
# Predict the target for the held-out validation rows with the linear model.
predictions_reg = model_reg.predict(X_valid)

In [17]:
# Validation RMSE and MAPE (in percent) for the linear-regression predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_reg))
print("RMSE : % f" % rmse)

mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_reg)
print("MAPE: %f" % (mape * 100))

RMSE :  1.754915
MAPE: 12.128002


### ridge regression:

In [18]:
from sklearn.linear_model import Ridge

# The 'sag' solver is stochastic, so the seed is fixed to make the fit (and the
# metrics computed from it) reproducible across runs, consistent with the
# random_state=42 used for the train/validation split.
model_ridge = Ridge(solver='sag', random_state=42)

# Fit on the training split; the fitted estimator is the cell output.
model_ridge.fit(X_train, y_train)

In [19]:
# Predict the target for the held-out validation rows with the ridge model.
predictions_ridge = model_ridge.predict(X_valid)

In [20]:
# Validation RMSE and MAPE (in percent) for the ridge predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_ridge))
print("RMSE : % f" % rmse)

mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_ridge)
print("MAPE: %f" % (mape * 100))

RMSE :  1.755049
MAPE: 12.130112


### lasso regression:

In [21]:
from sklearn import linear_model

# Lasso regression with alpha = 0.1.
model_lasso = linear_model.Lasso(alpha=0.1)

# Fit on the training split; the fitted estimator is the cell output.
model_lasso.fit(X_train, y_train)

In [22]:
# Predict the target for the held-out validation rows with the lasso model.
predictions_lasso = model_lasso.predict(X_valid)

In [23]:
# Validation RMSE and MAPE (in percent) for the lasso predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_lasso))
print("RMSE : % f" % rmse)

mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_lasso)
print("MAPE: %f" % (mape * 100))

RMSE :  1.760089
MAPE: 12.159543


### ElasticNet

In [24]:
from sklearn.linear_model import ElasticNet

# Elastic-net regression with default hyper-parameters.
# NOTE(review): the recorded metrics for this model are identical to the Lasso
# cell's (RMSE 1.760089, MAPE 12.159543) - possibly both models shrink every
# coefficient to zero on these unscaled features; inspect model_en.coef_ to confirm.
model_en = ElasticNet()

# Fit on the training split; the fitted estimator is the cell output.
model_en.fit(X_train, y_train)

In [25]:
# Predict the target for the held-out validation rows with the elastic-net model.
predictions_en = model_en.predict(X_valid)

In [26]:
# Validation RMSE and MAPE (in percent) for the elastic-net predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_en))
print("RMSE : % f" % rmse)

mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_en)
print("MAPE: %f" % (mape * 100))

RMSE :  1.760089
MAPE: 12.159543


### OrthogonalMatchingPursuit 

In [27]:
from sklearn.linear_model import OrthogonalMatchingPursuit

# Orthogonal matching pursuit with default settings.
model_omp = OrthogonalMatchingPursuit()

# Fit on the training split; the fitted estimator is the cell output.
model_omp.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), OrthogonalMatchingPursuit())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [28]:
# Predict the target for the held-out validation rows with the OMP model.
predictions_omp = model_omp.predict(X_valid)

In [29]:
# Validation RMSE and MAPE (in percent) for the OMP predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_omp))
print("RMSE : % f" % rmse)

mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_omp)
print("MAPE: %f" % (mape * 100))

RMSE :  1.755333
MAPE: 12.145332


### bayesian ridge

In [30]:
# Bayesian ridge regression with default settings. Uses the linear_model
# module imported in the lasso cell.
model_br = linear_model.BayesianRidge()

# Fit on the training split; the fitted estimator is the cell output.
model_br.fit(X_train, y_train)

In [31]:
# Predict the target for the held-out validation rows with the Bayesian ridge model.
predictions_br = model_br.predict(X_valid)

In [32]:
# Validation RMSE and MAPE (in percent) for the Bayesian ridge predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_br))
print("RMSE : % f" % rmse)

mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_br)
print("MAPE: %f" % (mape * 100))

RMSE :  1.756365
MAPE: 12.134250


In [33]:
from sklearn.linear_model import SGDRegressor

# Linear model trained with stochastic gradient descent. The seed is fixed so
# the fit and the metrics printed below are reproducible across runs,
# consistent with the random_state=42 used for the train/validation split.
# NOTE(review): the features are passed in unscaled; SGD is sensitive to
# feature scaling, so consider standardising them - confirm before comparing
# this model against the others.
model_sgd = SGDRegressor(random_state=42)

model_sgd.fit(X_train, y_train)
predictions_sgd = model_sgd.predict(X_valid)

# Validation RMSE and MAPE (in percent) for the SGD predictions.
rmse = np.sqrt(mean_squared_error(y_valid, predictions_sgd))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_sgd)
print("MAPE: %f" %(100 * mape))

RMSE :  1.761381
MAPE: 12.544035


In [None]:
from sklearn.svm import SVR

# Three support-vector regressors that differ only in their kernel settings.
model_svr_rbf = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)
model_svr_lin = SVR(kernel="linear", C=100, gamma="auto")
model_svr_poly = SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1)

# Fit every model on the training split, in the order RBF, linear, polynomial.
for svr_model in (model_svr_rbf, model_svr_lin, model_svr_poly):
    svr_model.fit(X_train, y_train)

# Predictions on the validation split, kept under one name per kernel.
predictions_svr_rbf = model_svr_rbf.predict(X_valid)
predictions_svr_lin = model_svr_lin.predict(X_valid)
predictions_svr_poly = model_svr_poly.predict(X_valid)

# Print RMSE and MAPE (in percent) per kernel; after the loop, rmse and mape
# hold the values of the polynomial kernel, the last one evaluated.
svr_reports = (
    ("RMSE RBF: % f", "MAPE RBF: %f", predictions_svr_rbf),
    ("RMSE LINEAR: % f", "MAPE LINEAR: %f", predictions_svr_lin),
    ("RMSE POLY: % f", "MAPE POLY: %f", predictions_svr_poly),
)
for rmse_format, mape_format, svr_predictions in svr_reports:
    rmse = np.sqrt(mean_squared_error(y_valid, svr_predictions))
    print(rmse_format % rmse)
    mape = mean_absolute_percentage_error(y_valid, svr_predictions)
    print(mape_format % (100 * mape))