In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [2]:
# Directory that holds the CSV inputs read by the loading cells below.
# The location can be overridden with the GYROCARDIOGRAM_DATA_DIR environment
# variable so the notebook is not tied to one user's machine; when the variable
# is unset or empty, the original location is used unchanged.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file names with plain string concatenation (path + "file.csv").
path = os.path.join(
    os.environ.get("GYROCARDIOGRAM_DATA_DIR") or "/Users/ecem/Desktop/gyrocardiogram/phase2/",
    "",
)

## Load data

In [3]:
# Positional (0-based) column indices passed to .iloc when the feature table is
# loaded; the order given here is the column order of the resulting frame.
selected_cols = [4, 9, 2, 10, 3]

In [4]:
# Feature matrix: read the "seismo" feature table (first CSV column becomes the
# index) and keep only the columns at the positions listed in `selected_cols`.
features_csv = path + "selected_features_seismo.csv"
X = pd.read_csv(features_csv, index_col=0).iloc[:, selected_cols]
X

Unnamed: 0,SC x,SC y,SR y,SE x,SS z
0,0.125374,0.140738,0.151571,0.756619,0.127694
1,0.130171,0.138467,0.127891,0.739110,0.139575
2,0.119070,0.117445,0.120807,0.349087,0.104309
3,0.145498,0.147362,0.207178,0.981220,0.123417
4,0.145737,0.133073,0.095584,0.702113,0.130902
...,...,...,...,...,...
95,0.087609,0.088285,0.064098,0.306076,0.076900
96,0.080112,0.065091,0.028885,0.313150,0.070935
97,0.269108,0.234729,0.164874,1.041992,0.268650
98,0.170623,0.125184,0.096827,0.926349,0.164032


In [5]:
# Regression target: the last column of target_df.csv (shown as "LVPW (mm)" in
# the recorded output), with the first CSV column used as the index.
targets_csv = path + "target_df.csv"
y = pd.read_csv(targets_csv, index_col=0).iloc[:, -1]
y

0     10.0
1     13.0
2     10.0
3     10.0
4     13.0
      ... 
95    12.0
96    18.0
97    11.0
98    12.0
99    11.0
Name: LVPW (mm), Length: 100, dtype: float64

## split training and test data:

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the partition
# identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

## Now, let's try different models and see which one outperforms the others:

### xgb:

In [7]:
import xgboost as xgb

# Gradient-boosted regressor with library-default hyperparameters; only the
# seed is set explicitly.
model_xgb = xgb.XGBRegressor(random_state=42)

# Train on the 80% training split.
model_xgb.fit(X_train, y_train)

In [8]:
# Predict the target for the held-out validation rows with the fitted XGBoost model.
predictions_xgb = model_xgb.predict(X_valid)

In [9]:
# Root-mean-squared error of the XGBoost predictions on the validation split
# (expressed in the same units as the target).
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_xgb))
print("RMSE : % f" % rmse)

RMSE :  2.009354


In [10]:
# Mean absolute percentage error of the XGBoost predictions; the metric is
# multiplied by 100 so the printed value is a percentage.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_xgb)
print("MAPE: %f" % (100 * mape))

MAPE: 16.269640


### linear regression:

In [11]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with default settings, trained on the
# training split. fit() returns the estimator itself, so the fitted model is
# bound directly and then displayed as the cell result.
model_reg = LinearRegression().fit(X_train, y_train)
model_reg

In [12]:
# Predict the target for the held-out validation rows with the fitted linear regression.
predictions_reg = model_reg.predict(X_valid)

In [13]:
# Validation error of the linear regression: RMSE in target units and MAPE
# scaled to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_reg))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_reg)

print("RMSE : % f" % rmse)
print("MAPE: %f" % (100 * mape))

RMSE :  1.916732
MAPE: 15.740161


### ridge regression:

In [14]:
from sklearn.linear_model import Ridge

# Ridge (L2-penalised) regression fitted with the stochastic average gradient
# solver. 'sag' shuffles the samples while optimising, so the seed is pinned
# here; without it the coefficients, and the metrics computed from them, can
# differ between reruns. 42 matches the seed already used for the train/test
# split and the XGBoost model in this notebook.
# NOTE(review): scikit-learn states that 'sag' only converges quickly when the
# features share a similar scale; no scaling step is applied here, so confirm
# that the solver converges on this data.
model_ridge = Ridge(solver="sag", random_state=42)

model_ridge.fit(X_train, y_train)

In [15]:
# Predict the target for the held-out validation rows with the fitted ridge model.
predictions_ridge = model_ridge.predict(X_valid)

In [16]:
# Validation error of the ridge model: RMSE in target units and MAPE scaled to
# a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_ridge))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_ridge)

print("RMSE : % f" % rmse)
print("MAPE: %f" % (100 * mape))

RMSE :  1.814722
MAPE: 15.421240


### lasso regression:

In [17]:
# The module-level import is kept as is: the Bayesian ridge cell further down
# also reaches its estimator through `linear_model`.
from sklearn import linear_model

# L1-penalised linear regression with penalty strength alpha=0.1, trained on
# the training split. fit() returns the estimator itself, so the fitted model
# is bound directly and then displayed as the cell result.
model_lasso = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)
model_lasso

In [18]:
# Predict the target for the held-out validation rows with the fitted lasso model.
predictions_lasso = model_lasso.predict(X_valid)

In [19]:
# Validation error of the lasso model: RMSE in target units and MAPE scaled to
# a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_lasso))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_lasso)

print("RMSE : % f" % rmse)
print("MAPE: %f" % (100 * mape))

RMSE :  1.774824
MAPE: 15.230478


### ElasticNet

In [20]:
from sklearn.linear_model import ElasticNet

# Elastic-net regression with the library's default hyperparameters, trained on
# the training split. fit() returns the estimator itself, so the fitted model
# is bound directly and then displayed as the cell result.
# NOTE(review): the recorded RMSE/MAPE for this model equal the Lasso values to
# six decimals. Both models may be producing identical predictions (possibly
# all coefficients shrunk to zero) - inspect model_en.coef_ to confirm.
model_en = ElasticNet().fit(X_train, y_train)
model_en

In [21]:
# Predict the target for the held-out validation rows with the fitted elastic-net model.
predictions_en = model_en.predict(X_valid)

In [22]:
# Validation error of the elastic-net model: RMSE in target units and MAPE
# scaled to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_en))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_en)

print("RMSE : % f" % rmse)
print("MAPE: %f" % (100 * mape))

RMSE :  1.774824
MAPE: 15.230478


### OrthogonalMatchingPursuit 

In [23]:
from sklearn.linear_model import OrthogonalMatchingPursuit

# Orthogonal matching pursuit with the library's default settings, trained on
# the training split. fit() returns the estimator itself, so the fitted model
# is bound directly and then displayed as the cell result.
# NOTE(review): the recorded run printed a scikit-learn message about scaling
# the data through a Pipeline with StandardScaler; check whether it still
# appears with the installed version.
model_omp = OrthogonalMatchingPursuit().fit(X_train, y_train)
model_omp

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), OrthogonalMatchingPursuit())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [24]:
# Predict the target for the held-out validation rows with the fitted OMP model.
predictions_omp = model_omp.predict(X_valid)

In [25]:
# Validation error of the orthogonal matching pursuit model: RMSE in target
# units and MAPE scaled to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_omp))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_omp)

print("RMSE : % f" % rmse)
print("MAPE: %f" % (100 * mape))

RMSE :  1.894184
MAPE: 15.764199


### bayesian ridge

In [26]:
# Bayesian ridge regression with the library's default settings, trained on the
# training split. `linear_model` is the sklearn module imported in the lasso
# cell above. fit() returns the estimator itself, so the fitted model is bound
# directly and then displayed as the cell result.
model_br = linear_model.BayesianRidge().fit(X_train, y_train)
model_br

In [27]:
# Predict the target for the held-out validation rows with the fitted Bayesian ridge model.
predictions_br = model_br.predict(X_valid)

In [28]:
# Validation error of the Bayesian ridge model: RMSE in target units and MAPE
# scaled to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_br))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_br)

print("RMSE : % f" % rmse)
print("MAPE: %f" % (100 * mape))

RMSE :  1.774876
MAPE: 15.230755


### SGD

In [29]:
from sklearn.linear_model import SGDRegressor

# Linear model trained by stochastic gradient descent. The estimator shuffles
# the training samples on every epoch by default, so the seed is pinned here;
# without it every rerun yields different coefficients and therefore different
# RMSE/MAPE values. 42 matches the seed already used for the train/test split
# and the XGBoost model in this notebook.
# NOTE(review): scikit-learn recommends standardising features before SGD; no
# scaling step is applied here - confirm this is intended.
model_sgd = SGDRegressor(random_state=42)

model_sgd.fit(X_train, y_train)

In [30]:
# Predict the target for the held-out validation rows with the fitted SGD regressor.
predictions_sgd = model_sgd.predict(X_valid)

In [31]:
# Validation error of the SGD regressor: RMSE in target units and MAPE scaled
# to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_sgd))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_sgd)

print("RMSE : % f" % rmse)
print("MAPE: %f" % (100 * mape))

RMSE :  1.715094
MAPE: 14.495748


### SVM for regression 

In [32]:
from sklearn.svm import SVR

# Three support-vector regressors that share C=100 and differ in kernel:
# RBF, linear and a degree-3 polynomial. Each is trained on the training split;
# fit() returns the estimator itself, so every fitted model is bound directly.
# NOTE(review): gamma presumably has no effect on the linear kernel - confirm
# against the SVR documentation before tuning it.
model_svr_rbf = SVR(
    kernel="rbf",
    C=100,
    gamma=0.1,
    epsilon=0.1,
).fit(X_train, y_train)

model_svr_lin = SVR(
    kernel="linear",
    C=100,
    gamma="auto",
).fit(X_train, y_train)

model_svr_poly = SVR(
    kernel="poly",
    C=100,
    gamma="auto",
    degree=3,
    epsilon=0.1,
    coef0=1,
).fit(X_train, y_train)

# Display the last fitted estimator, as the original cell did.
model_svr_poly

In [33]:
# Validation-set predictions of the three SVR variants, produced in the same
# order as the models were defined (RBF, linear, polynomial).
predictions_svr_rbf, predictions_svr_lin, predictions_svr_poly = (
    svr_model.predict(X_valid)
    for svr_model in (model_svr_rbf, model_svr_lin, model_svr_poly)
)

In [34]:
# Validation error of the RBF-kernel SVR: RMSE in target units and MAPE scaled
# to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_svr_rbf))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_svr_rbf)

print("RMSE RBF: % f" % rmse)
print("MAPE RBF: %f" % (100 * mape))


RMSE RBF:  1.742217
MAPE RBF: 12.945428


In [35]:
# Validation error of the linear-kernel SVR: RMSE in target units and MAPE
# scaled to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_svr_lin))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_svr_lin)

print("RMSE LINEAR: % f" % rmse)
print("MAPE LINEAR: %f" % (100 * mape))

RMSE LINEAR:  1.722945
MAPE LINEAR: 12.797301


In [36]:
# Validation error of the polynomial-kernel SVR: RMSE in target units and MAPE
# scaled to a percentage.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_svr_poly))
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_svr_poly)

print("RMSE POLY: % f" % rmse)
print("MAPE POLY: %f" % (100 * mape))

RMSE POLY:  1.750126
MAPE POLY: 13.012147
