In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import root_mean_squared_error, make_scorer

In [2]:
np.set_printoptions(threshold=sys.maxsize)
scale = lambda x: StandardScaler().fit_transform(x)
data_raw = pd.read_csv('C:/Users/dcfra/GR_capstone_public/data/ML_data.csv'); del data_raw['Unnamed: 0']
data = data_raw
data = data[data['ATHLETE'] == 'Brendan Wenzel']
n = data.shape[0]

y_vars = ['AAL','SPEEDMAX','JUMPS','PHYSIOLOAD','EXERTIONS','DISTANCE']
Y = np.array(scale(data[y_vars])) # 108 x 6 standardized responses
indicators = pd.get_dummies(data[['HALF']], dtype=float) #  Date not included
features = data.drop(pd.Index(y_vars + ['ATHLETE','DATE','HALF']), axis=1) 
x_vars = list(indicators.columns) + list(features.columns)

X_full = np.c_[np.array(indicators), scale(features)]
X_one = np.ones((n,1))
X_red = np.c_[np.array(indicators), scale(np.array(data.TIME).reshape(-1,1))]
print(x_vars)

['HALF', 'TIME', 'PEAK_POWER_BM', 'JUMP_HEIGHT_IMP_DIS', 'ECCENTRIC_MEAN_FORCE', 'ECCENTRIC_MEAN_POWER', 'TAKEOFF_PEAK_FORCE', 'PEAK_POWER', 'ECCENTRIC_MEAN_BRAKING_FORCE', 'FORCE_AT_PEAK_POWER', 'CONCENTRIC_PEAK_FORCE']


In [81]:
data_raw = pd.read_csv('C:/Users/dcfra/GR_capstone_public/data/fulldata.csv'); del data_raw['Unnamed: 0']
name = (0,'Akuel Kot')
scale = lambda x: StandardScaler().fit_transform(x)
instance = pd.read_csv("C:/Users/dcfra/GR_capstone_public/plots_summaries/instance.csv")
row_cutter = pd.Index(data_raw["ATHLETE"] == name[1])
column_cutter = pd.Index([True for _ in range(10)] + list(instance.iloc[:,name[0]] == 1))
data = data_raw.iloc[row_cutter,column_cutter]
n = data.shape[0]

y_vars = ['AAL','SPEEDMAX','JUMPS','PHYSIOLOAD','EXERTIONS','DISTANCE']
Y = np.array(scale(data[y_vars])) # 108 x 6 standardized responses
indicators = pd.get_dummies(data[['HALF']], dtype=float) #  Date not included
features = data.drop(pd.Index(y_vars + ['ATHLETE','DATE','HALF']), axis=1) 
x_vars = list(indicators.columns) + list(features.columns)

X_full = np.c_[np.array(indicators), scale(features)]
X_one = np.ones((n,1))
X_red = np.c_[np.array(indicators), scale(np.array(data.TIME).reshape(-1,1))]
print(x_vars)

['HALF', 'TIME', 'PEAK_POWER_BM', 'JUMP_HEIGHT_IMP_DIS', 'ECCENTRIC_MEAN_FORCE', 'ECCENTRIC_MEAN_POWER', 'TAKEOFF_PEAK_FORCE', 'PEAK_POWER', 'ECCENTRIC_MEAN_BRAKING_FORCE', 'FORCE_AT_PEAK_POWER', 'CONCENTRIC_PEAK_FORCE']


In [47]:
print(X_full.shape)
print(X_red.shape)
print(X_one.shape)
print(Y.shape)

(40, 11)
(40, 2)
(40, 1)
(40, 6)


In [5]:
def response_cv(cv,var):
    if type(var) == str:
        return -np.mean(cv[f'test_{var}'])
    else:
        return [-np.mean(cv[f'test_{var_i}']) for var_i in var]
    
def rmse_score(y_true, y_pred,output_index):
    return root_mean_squared_error(y_true[:, output_index], y_pred[:, output_index])

scorers = {
    f'{y_vars[i]}': make_scorer(rmse_score, output_index=i, greater_is_better=False)
    for i in range(Y.shape[1])
}

In [6]:
loo = LeaveOneOut()
cv_one = cross_validate(MultiOutputRegressor(LinearRegression()), 
                        X_one,Y,cv = loo,scoring = scorers, 
                        return_train_score = False, verbose = 0, n_jobs = -1)
cv_redreg = cross_validate(MultiOutputRegressor(LinearRegression()), 
                           X_red,Y, cv=loo, scoring=scorers,
                           return_train_score=False, verbose = 0, n_jobs = -1)
cv_fullreg = cross_validate(MultiOutputRegressor(LinearRegression()), 
                            X_full, Y, cv=loo, scoring=scorers, return_train_score=False, verbose = 0, n_jobs = -1)

In [11]:
# Model switch
model = RidgeCV(alphas = np.logspace(-4,4,50),
                scoring = 'neg_root_mean_squared_error',
                cv = loo)

In [12]:
# rfe_model = RFECV(model, step=1, verbose=1, min_features_to_select=1,
#                   cv=loo, n_jobs=-1, scoring='neg_root_mean_squared_error')
rfe_model = SelectFromModel(model,prefit=False)
selector = rfe_model.fit(X_full, Y)
X_cut = selector.transform(X_full)
x_cutnames = selector.get_feature_names_out(input_features=x_vars)
x_cutnames

array(['HALF', 'TIME', 'ECCENTRIC_MEAN_FORCE', 'ECCENTRIC_MEAN_POWER',
       'ECCENTRIC_MEAN_BRAKING_FORCE'], dtype=object)

In [46]:
cv_red = cross_validate(model, X_red,Y, cv=loo, scoring=scorers, 
                        return_train_score=False,verbose = 0,n_jobs = -1)
cv_full = cross_validate(model, X_full,Y, cv=loo, scoring=scorers, 
                         return_train_score=False,verbose = 0,n_jobs = -1)
cv_cut = cross_validate(model, X_cut,Y, cv=loo, scoring=scorers, 
                        return_train_score=False,verbose = 0,n_jobs = -1)

In [45]:
fit_model = model.fit(X_full,Y)
coefs = pd.DataFrame(fit_model.coef_)
coefs.index = y_vars
coefs.columns = x_vars
Y_sds = np.std(np.array(data[y_vars]),axis=0).reshape(-1,1)

(coefs * Y_sds).T

Unnamed: 0,AAL,SPEEDMAX,JUMPS,PHYSIOLOAD,EXERTIONS,DISTANCE
HALF,7.743549,-0.083707,1.156016,9.868365,4.473525,45.860505
TIME,20.761595,0.06324,-0.068931,26.531012,0.376684,123.295565
PEAK_POWER_BM,-4.780627,-0.000583,0.139018,-5.648437,-0.553158,-26.249554
JUMP_HEIGHT_IMP_DIS,0.046311,-0.012536,0.574827,3.877406,0.387129,18.019175
ECCENTRIC_MEAN_FORCE,16.994441,0.09666,-0.067754,22.797682,1.068935,105.945946
ECCENTRIC_MEAN_POWER,3.3476,-0.18158,2.477957,-0.952039,0.386816,-4.424339
TAKEOFF_PEAK_FORCE,3.706771,0.006395,0.190942,4.530371,-0.488934,21.053651
PEAK_POWER,2.662292,0.033014,0.198873,5.26714,0.167578,24.477583
ECCENTRIC_MEAN_BRAKING_FORCE,6.951731,0.057577,-0.211271,13.590479,-2.086061,63.158004
FORCE_AT_PEAK_POWER,4.596619,0.005179,0.392212,4.539531,-0.23548,21.09622


In [44]:
rmse_array = np.c_[response_cv(cv_one,y_vars),
                   response_cv(cv_redreg,y_vars),
                   response_cv(cv_fullreg,y_vars),
                   response_cv(cv_red,y_vars),
                   response_cv(cv_full,y_vars),
                   response_cv(cv_cut,y_vars)
                   ]
rmse_frame = pd.DataFrame(rmse_array,index = y_vars,
                          columns = ["Intercept Only Regression",
                                     "Kinexon Only Regression",
                                     "Full Model Regression",
                                     "Kinexon Only Ridge",
                                     "Full Model Ridge",
                                     "Reduced Model Ridge" ])
rmse_frame.T

Unnamed: 0,AAL,SPEEDMAX,JUMPS,PHYSIOLOAD,EXERTIONS,DISTANCE
Intercept Only Regression,0.810577,0.81728,0.86189,0.828243,0.889093,0.828243
Kinexon Only Regression,0.773963,0.847327,0.937971,0.792152,0.715674,0.792152
Full Model Regression,0.591531,0.985255,0.791675,0.58874,0.933471,0.58874
Kinexon Only Ridge,0.743272,0.830917,0.888675,0.771065,0.80876,0.771065
Full Model Ridge,0.618508,0.832268,0.752745,0.63159,0.882436,0.63159
Reduced Model Ridge,0.530593,0.838008,0.737943,0.541075,0.806757,0.541075


In [58]:
pd.set_option('expand_frame_repr', False)
output_string = '\n\n'.join([
    f'Name: \n{name}',
    f'Ridge Alpha: \n{fit_model.alpha_}',
    f'Reliable Features: \n{x_vars}',
    f'Selected Features: \n{list(x_cutnames)}',
    f'RMSE Values: \n{rmse_frame.T}',
    f'Rescaled Coefficients: \n{(coefs * Y_sds).T}',
    "###############################################################################################",
    "\n\n"
])

with open('machine_learning_output.txt','a') as writefile:
    writefile.write(output_string)

names = list(data_raw['ATHLETE'].unique())
for name in enumerate(names):
    print('hi') if name[0] == 0 else print(name)

