In [5]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt

In [6]:
import os

# Directory that holds the phase-2 CSV files read by the cells below.
# The location can be overridden with the GCG_DATA_DIR environment variable so
# the notebook is not tied to one machine; without it the original location is
# used. Joining with "" guarantees a trailing separator, because later cells
# build file names by plain string concatenation (path + "file.csv").
path = os.path.join(
    os.environ.get("GCG_DATA_DIR", "/Users/ecem/Desktop/gyrocardiogram/phase2/"),
    "",
)

## Load data

In [7]:
# Positional indices of the feature columns kept from the feature table.
selected_cols = [0, 3, 7, 8, 9]

In [8]:
# Read the feature table (the first CSV column becomes the row index) and keep
# only the columns at the positions listed in `selected_cols`.
X = (
    pd.read_csv(path + "selected_features.csv", index_col=0)
    .iloc[:, selected_cols]
)
X

Unnamed: 0,SC x,SE x,SC y,SS z,SR y
0,0.118882,0.346638,0.129428,0.136051,0.040513
1,0.113485,0.518874,0.127058,0.150801,0.081135
2,0.115830,0.446310,0.100361,0.107055,0.044529
3,0.130141,0.549191,0.147496,0.135390,0.187671
4,0.140124,0.583176,0.123637,0.135801,0.036081
...,...,...,...,...,...
95,0.100810,0.394827,0.084148,0.089695,0.028969
96,0.075995,0.278854,0.080143,0.085031,0.077933
97,0.038995,0.191763,0.047402,0.052510,0.032946
98,0.016169,0.042198,0.021791,0.021298,0.011459


In [9]:
# Read the target table (the first CSV column becomes the row index) and take
# its first data column as the regression target Series.
y = (
    pd.read_csv(path + "target_df.csv", index_col=0)
    .iloc[:, 0]
)
y

0     65.5
1     65.4
2     61.9
3     67.5
4     63.6
      ... 
95    72.5
96    55.0
97    64.0
98    67.5
99    58.0
Name: Ejection fraction (%), Length: 100, dtype: float64

## Split into training and validation sets:

In [11]:
# 80/20 train/validation split; the seed is fixed so the partition is the same
# on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

## Now, let's try different models and see which one outperforms the others:

### xgb:

In [12]:
import xgboost as xgb

# XGBoost regressor with library-default hyperparameters and a pinned seed.
model_xgb = xgb.XGBRegressor(random_state=42)

# Train on the training split; being the last expression of the cell, the
# fitted estimator is also displayed.
model_xgb.fit(X=X_train, y=y_train)

In [14]:
# Predict the target for the held-out validation features with the XGBoost model.
predictions_xgb = model_xgb.predict(X_valid)

In [15]:
# Root-mean-squared error of the XGBoost predictions on the validation split.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_xgb))
print("RMSE : % f" % rmse)

RMSE :  16.352465


In [16]:
# Mean absolute percentage error of the XGBoost predictions; the metric is a
# fraction, so it is scaled by 100 for printing.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_xgb)
print("MAPE: %f" % (100 * mape))

MAPE: 25.673592


### linear regression:

In [17]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with default settings.
model_reg = LinearRegression()

# Train on the training split; the fitted estimator is the cell's output.
model_reg.fit(X=X_train, y=y_train)

In [18]:
# Predict the target for the held-out validation features with the linear model.
predictions_reg = model_reg.predict(X_valid)

In [20]:
# Validation metrics for the linear regression model.
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_reg))
print("RMSE : % f" % rmse)

# MAPE is returned as a fraction; scale by 100 to print a percentage.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_reg)
print("MAPE: %f" % (100 * mape))

RMSE :  12.814992
MAPE: 21.711475


### ridge regression:

In [25]:
from sklearn.linear_model import Ridge

# The 'sag' solver is stochastic (it shuffles the samples using a random
# generator), so without a seed the fitted coefficients -- and the metrics
# reported below -- can differ from run to run. Pin the seed to the same value
# used for the train/validation split and the XGBoost model.
model_ridge = Ridge(solver='sag', random_state=42)

# Train on the training split; the fitted estimator is the cell's output.
model_ridge.fit(X_train, y_train)

In [28]:
# Predict the target for the held-out validation features with the ridge model.
predictions_ridge = model_ridge.predict(X_valid)

In [29]:
# Validation metrics for the ridge regression model.
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_ridge))
print("RMSE : % f" % rmse)

# MAPE is returned as a fraction; scale by 100 to print a percentage.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_ridge)
print("MAPE: %f" % (100 * mape))

RMSE :  12.702438
MAPE: 21.588431


### lasso regression:

In [30]:
from sklearn import linear_model

# L1-regularised linear regression with regularisation strength alpha=0.1.
model_lasso = linear_model.Lasso(alpha=0.1)

# Train on the training split; the fitted estimator is the cell's output.
model_lasso.fit(X=X_train, y=y_train)

In [31]:
# Predict the target for the held-out validation features with the lasso model.
predictions_lasso = model_lasso.predict(X_valid)

In [32]:
# Validation metrics for the lasso regression model.
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_lasso))
print("RMSE : % f" % rmse)

# MAPE is returned as a fraction; scale by 100 to print a percentage.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_lasso)
print("MAPE: %f" % (100 * mape))

RMSE :  12.561921
MAPE: 21.389424


### ElasticNet

In [33]:
from sklearn.linear_model import ElasticNet

# Elastic-net regression with default settings.
model_en = ElasticNet()

# Train on the training split; the fitted estimator is the cell's output.
model_en.fit(X=X_train, y=y_train)

In [34]:
# Predict the target for the held-out validation features with the elastic-net model.
predictions_en = model_en.predict(X_valid)

In [35]:
# Validation metrics for the elastic-net model.
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_en))
print("RMSE : % f" % rmse)

# MAPE is returned as a fraction; scale by 100 to print a percentage.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_en)
print("MAPE: %f" % (100 * mape))

RMSE :  12.078515
MAPE: 20.707256


### OrthogonalMatchingPursuit 

In [36]:
from sklearn.linear_model import OrthogonalMatchingPursuit

# Orthogonal matching pursuit regression with default settings.
model_omp = OrthogonalMatchingPursuit()

# Train on the training split; the fitted estimator is the cell's output.
model_omp.fit(X=X_train, y=y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), OrthogonalMatchingPursuit())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [38]:
# Predict the target for the held-out validation features with the OMP model.
predictions_omp = model_omp.predict(X_valid)

In [39]:
# Validation metrics for the orthogonal matching pursuit model.
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_omp))
print("RMSE : % f" % rmse)

# MAPE is returned as a fraction; scale by 100 to print a percentage.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_omp)
print("MAPE: %f" % (100 * mape))

RMSE :  12.502278
MAPE: 21.347221


### bayesian ridge

In [40]:
# Import the estimator in this cell, as the other model cells do, so the cell
# does not depend on a `linear_model` name left behind by an earlier cell.
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with default settings.
model_br = BayesianRidge()

# Train on the training split; the fitted estimator is the cell's output.
model_br.fit(X_train, y_train)

In [41]:
# Predict the target for the held-out validation features with the Bayesian ridge model.
predictions_br = model_br.predict(X_valid)

In [42]:
# Validation metrics for the Bayesian ridge model.
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=predictions_br))
print("RMSE : % f" % rmse)

# MAPE is returned as a fraction; scale by 100 to print a percentage.
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=predictions_br)
print("MAPE: %f" % (100 * mape))

RMSE :  12.079190
MAPE: 20.708320
