In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt

In [2]:
path = "/Users/ecem/Desktop/gyrocardiogram/phase2/"

## Load data

In [3]:
selected_cols = [0,4,6,7,11]

In [4]:
X = pd.read_csv(path + "selected_features_10sec.csv", index_col = 0).iloc[:,selected_cols]
X

Unnamed: 0,SE x,SR y,SC x,SC y,SS z
0,2.764249,0.879687,0.454484,0.938928,0.291549
1,3.274312,0.884766,0.497074,0.155128,0.288993
2,3.298964,0.881250,0.513497,0.795949,0.289119
3,3.284066,0.871875,0.539506,0.638711,0.287926
4,3.280501,0.888672,0.521286,0.161650,0.281535
...,...,...,...,...,...
3912,3.309207,0.833594,-20.216970,2.443424,-0.161938
3913,3.275286,0.920703,0.328882,0.336628,0.157898
3914,3.209732,0.807813,0.710252,0.623062,0.396913
3915,3.174726,0.858594,0.583945,0.456299,0.426516


In [5]:
X.isnull().sum()

SE x    0
SR y    0
SC x    0
SC y    0
SS z    9
dtype: int64

In [6]:
idx  = list(np.where(X['SS z'].isnull())[0])
idx

[1541, 1550, 1554, 1567, 1718, 1736, 1752, 2251, 2259]

In [7]:
X = X.drop(idx)
X

Unnamed: 0,SE x,SR y,SC x,SC y,SS z
0,2.764249,0.879687,0.454484,0.938928,0.291549
1,3.274312,0.884766,0.497074,0.155128,0.288993
2,3.298964,0.881250,0.513497,0.795949,0.289119
3,3.284066,0.871875,0.539506,0.638711,0.287926
4,3.280501,0.888672,0.521286,0.161650,0.281535
...,...,...,...,...,...
3912,3.309207,0.833594,-20.216970,2.443424,-0.161938
3913,3.275286,0.920703,0.328882,0.336628,0.157898
3914,3.209732,0.807813,0.710252,0.623062,0.396913
3915,3.174726,0.858594,0.583945,0.456299,0.426516


In [8]:
pd.read_csv(path + "target_df_10sec.csv", index_col = 0)

Unnamed: 0,Ejection fraction (%),Left ventricular end diastolic dimension (mm),IVS (mm),LVPW (mm)
0,65.5,55.0,12.0,10.0
1,65.5,55.0,12.0,10.0
2,65.5,55.0,12.0,10.0
3,65.5,55.0,12.0,10.0
4,65.5,55.0,12.0,10.0
...,...,...,...,...
3912,58.0,57.0,13.0,11.0
3913,58.0,57.0,13.0,11.0
3914,58.0,57.0,13.0,11.0
3915,58.0,57.0,13.0,11.0


In [9]:
y = pd.read_csv(path + "target_df_10sec.csv", index_col = 0).drop(idx).iloc[:,2]
y

0       12.0
1       12.0
2       12.0
3       12.0
4       12.0
        ... 
3912    13.0
3913    13.0
3914    13.0
3915    13.0
3916    13.0
Name: IVS (mm), Length: 3908, dtype: float64

## split training and test data:

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=42)

## Now, let's try different models and see which outperforms the others :

### xgb:

In [11]:
import xgboost as xgb
model_xgb  = xgb.XGBRegressor(random_state = 42) 

# Fit the model
model_xgb.fit(X_train, y_train)

In [12]:
predictions_xgb = model_xgb.predict(X_valid)

In [13]:
rmse = np.sqrt(mean_squared_error(y_valid, predictions_xgb))
print("RMSE : % f" %(rmse))

RMSE :  2.401838


In [14]:
mape = mean_absolute_percentage_error(y_valid, predictions_xgb)
print("MAPE: %f" %(100 * mape))

MAPE: 22.564746


### linear regresion:

In [15]:
from sklearn.linear_model import LinearRegression
model_reg = LinearRegression()

model_reg.fit(X_train, y_train)

In [16]:
predictions_reg = model_reg.predict(X_valid)

In [17]:
rmse = np.sqrt(mean_squared_error(y_valid, predictions_reg))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_reg)
print("MAPE: %f" %(100 * mape))

RMSE :  3.156993
MAPE: 22.140728


### ridge regression:

In [18]:
from sklearn.linear_model import Ridge
model_ridge = Ridge( solver = 'sag')

model_ridge.fit(X_train, y_train)

In [19]:
predictions_ridge = model_ridge.predict(X_valid)

In [20]:
rmse = np.sqrt(mean_squared_error(y_valid, predictions_ridge))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_ridge)
print("MAPE: %f" %(100 * mape))

RMSE :  3.077109
MAPE: 22.117272


### lasso regression:

In [21]:
from sklearn import linear_model
model_lasso = linear_model.Lasso(alpha=0.1)

model_lasso.fit(X_train, y_train)

In [22]:
predictions_lasso = model_lasso.predict(X_valid)

In [23]:
rmse = np.sqrt(mean_squared_error(y_valid, predictions_lasso))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_lasso)
print("MAPE: %f" %(100 * mape))

RMSE :  2.243121
MAPE: 21.428859


### ElasticNet

In [24]:
from sklearn.linear_model import ElasticNet
model_en = ElasticNet()

model_en.fit(X_train,y_train)

In [25]:
predictions_en = model_en.predict(X_valid)

In [26]:
rmse = np.sqrt(mean_squared_error(y_valid, predictions_en))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_en)
print("MAPE: %f" %(100 * mape))

RMSE :  2.243187
MAPE: 21.433463


### OrthogonalMatchingPursuit 

In [27]:
from sklearn.linear_model import OrthogonalMatchingPursuit
model_omp = OrthogonalMatchingPursuit()

model_omp.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), OrthogonalMatchingPursuit())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [28]:
predictions_omp = model_omp.predict(X_valid)

In [29]:
rmse = np.sqrt(mean_squared_error(y_valid, predictions_omp))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_omp)
print("MAPE: %f" %(100 * mape))

RMSE :  2.243105
MAPE: 21.427711


### bayesian ridge

In [30]:
model_br = linear_model.BayesianRidge()

model_br.fit(X_train, y_train)

In [31]:
predictions_br = model_br.predict(X_valid)

In [32]:
rmse = np.sqrt(mean_squared_error(y_valid, predictions_br))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_br)
print("MAPE: %f" %(100 * mape))

RMSE :  2.243190
MAPE: 21.434751


In [33]:
from sklearn.linear_model import SGDRegressor
model_sgd = SGDRegressor()

model_sgd.fit(X_train, y_train)

predictions_sgd = model_sgd.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, predictions_sgd))
print("RMSE : % f" %(rmse))
mape = mean_absolute_percentage_error(y_valid, predictions_sgd)
print("MAPE: %f" %(100 * mape))

RMSE :  1044.989147
MAPE: 474.377714


In [None]:
from sklearn.svm import SVR
model_svr_rbf = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)
model_svr_lin = SVR(kernel="linear", C=100, gamma="auto")
model_svr_poly = SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1)

model_svr_rbf.fit(X_train, y_train)
model_svr_lin.fit(X_train, y_train)
model_svr_poly.fit(X_train, y_train)

In [None]:
predictions_svr_rbf = model_svr_rbf.predict(X_valid)
predictions_svr_lin = model_svr_lin.predict(X_valid)
predictions_svr_poly = model_svr_poly.predict(X_valid)