# Linear Regression

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [7]:
# Read the standardised gait dataset from the parent directory and preview
# the first rows to sanity-check the columns.
gait_csv_path = '../gait_standardised.csv'
df_standardised = pd.read_csv(gait_csv_path)
df_standardised.head()

Unnamed: 0,subject,condition,replication,leg,joint,time,angle,angle_scaled
0,1,1,1,1,1,0,4.682881,-0.465902
1,1,1,1,1,1,1,5.073127,-0.441551
2,1,1,1,1,1,2,5.229774,-0.431776
3,1,1,1,1,1,3,5.083273,-0.440918
4,1,1,1,1,1,4,4.652399,-0.467804


In [8]:
# Show the dtypes as loaded so the effect of the conversion below is visible.
print(f"Data types before:\n{df_standardised.dtypes}\n")

# These columns are loaded as integers; cast each one to pandas' category dtype.
categorical_columns = ['subject', 'condition', 'replication', 'leg', 'joint', 'time']
for column_name in categorical_columns:
    df_standardised[column_name] = df_standardised[column_name].astype('category')

# Give `time` an explicit, ordered set of levels 0..100 (inclusive).
time_levels = list(range(101))
df_standardised['time'] = df_standardised['time'].cat.set_categories(time_levels, ordered=True)

print(f"Data types after:\n{df_standardised.dtypes}")

Data types before:
subject           int64
condition         int64
replication       int64
leg               int64
joint             int64
time              int64
angle           float64
angle_scaled    float64
dtype: object

Data types after:
subject         category
condition       category
replication     category
leg             category
joint           category
time            category
angle            float64
angle_scaled     float64
dtype: object


In [9]:
# 10-fold cross-validation of an ordinary least-squares model that predicts
# `angle_scaled` from the design columns. Shuffling is seeded so the fold
# assignment (and therefore each fold's MSE) is reproducible.
kf = KFold(n_splits=10, random_state=42, shuffle=True)
print(kf)

# Select predictors and target once; each fold only slices rows from these.
feature_columns = ['subject', 'condition', 'replication', 'leg', 'joint']
target_column = 'angle_scaled'
features = df_standardised[feature_columns]
target = df_standardised[target_column]

# NOTE(review): the predictor columns are category-typed but appear to reach
# LinearRegression as their numeric codes (one slope per column). If they are
# meant as nominal factors, one-hot encode them first - confirm intent.
# NOTE(review): rows are shuffled individually, so measurements from the same
# subject can fall in both train and test folds - confirm this is acceptable.
for i, (train_index, test_index) in enumerate(kf.split(df_standardised)):
    print(f"Fold {i}:")

    # KFold.split yields positional row indices, so index with .iloc; .loc is
    # label-based and only matches while the frame has a default RangeIndex.
    x_train = features.iloc[train_index]
    y_train = target.iloc[train_index]
    reg = LinearRegression().fit(x_train, y_train)

    x_test = features.iloc[test_index]
    y_test = target.iloc[test_index]

    # Score the held-out fold.
    y_pred = reg.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE: {mse}")


KFold(n_splits=10, random_state=42, shuffle=True)
Fold 0:
MSE: 0.9927141422349068
Fold 1:
MSE: 0.9466715376816032
Fold 2:
MSE: 0.9668586314503693
Fold 3:
MSE: 0.9609647181213448
Fold 4:
MSE: 0.9797941353542725
Fold 5:
MSE: 0.9744207881640048
Fold 6:
MSE: 0.9494378148746109
Fold 7:
MSE: 0.966130658512769
Fold 8:
MSE: 0.9707862024942514
Fold 9:
MSE: 0.9595413506469411
