# Data Transformation and feature evaluation for set 3

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from knee_stress_predict.config import raw_data_dir, processed_data_dir

This notebook prepares the data for training a time-series model.

## Step 1: Load data

In [2]:
# Name of the processed data set; it is also the sub-folder under processed_data_dir.
data_set_name = "set_3"
file_path = processed_data_dir / data_set_name / "out.csv"

# The CSV has an unnamed first column (read back by pandas as 'Unnamed: 0'),
# presumably a saved row index; drop it so only the real features remain.
result = pd.read_csv(file_path).drop(columns='Unnamed: 0')
result

Unnamed: 0,Code,Patella_PN,Femur_PN,Tibia_PN,Patella_Car_PN,Femur_Car_PN,Tibia_M_Car_PN,Tibia_L_Car_PN,Patella_volume,Femur_volume,...,med_frame_231,med_frame_232,med_frame_233,med_frame_234,med_frame_235,med_frame_236,med_frame_237,med_frame_238,med_frame_239,med_frame_240
0,9003406M00,1137,4142,2789,5275,21420,2840,2840,33682.157434,231437.991665,...,9.759382,9.932714,9.782494,9.542757,9.550017,9.888674,10.287620,10.229705,10.525443,10.990571
1,9003406M12,1126,4113,2684,5900,21030,2840,2840,33039.218053,228671.171460,...,9.806394,9.903366,9.658813,9.345903,8.863383,8.515292,8.726943,8.729221,8.886322,9.224288
2,9007827M00,771,3414,2596,4485,18385,2840,2840,21736.919532,184294.924991,...,6.839143,6.736033,6.599309,6.549271,6.820798,6.718779,6.745682,6.785289,6.868892,7.046464
3,9007827M12,730,3404,2600,5055,18755,2840,2840,20866.993760,184114.045604,...,6.069279,6.093854,6.118194,6.276575,6.574237,6.211847,6.384310,6.605030,6.585426,6.867692
4,9040390M00,670,3907,2560,4935,19150,2840,2840,17988.202111,226082.515840,...,122.803833,122.954926,123.055641,123.183105,123.373184,123.773911,123.798569,123.820938,123.799873,123.741707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,9993650M12,686,2963,2388,4405,20910,2155,2840,15821.015946,140615.628188,...,30.319807,31.751848,33.184460,34.442417,30.123833,15.865294,15.989101,16.239870,16.647863,17.259405
165,9993833M00,795,3669,2724,4765,20410,2840,2840,22375.020348,207318.185631,...,8.792763,8.976914,8.909942,10.684889,11.045062,10.653504,9.277707,9.529099,9.912118,10.419240
166,9993833M12,801,3805,2575,4885,20400,2840,2840,22439.863013,215968.865103,...,8.802687,9.059080,9.370121,10.105094,9.852423,9.866968,9.584963,9.702561,9.904342,10.722541
167,9993846M00,868,3794,3029,5430,20755,2840,2840,25084.417057,216622.182978,...,7.280858,7.429334,7.494839,7.629389,7.617738,7.752455,8.115380,7.971771,8.071439,8.470410


## Step 2: Transform to a tall table

Now let's reshape the wide table, which has one column per simulation frame, into a tall table with one row per sample and frame. This format is more convenient for training models.

In [3]:
# Identifier columns: every column that is not a per-frame pressure reading
# (i.e. whose name contains neither 'med_frame' nor 'lat_frame').
column_names = [
    name
    for name in result.columns
    if 'med_frame' not in name and 'lat_frame' not in name
]

# Reshape wide -> long: the numeric suffix of each 'med_frame_<n>' / 'lat_frame_<n>'
# column becomes the 'frame' index level, and the identifier columns become the
# remaining index levels. The two stub columns are then given descriptive names.
pressure_names = {
    "med_frame_": "Max_tib_med_contact_pressure",
    "lat_frame_": "Max_tib_lat_contact_pressure",
}
result = pd.wide_to_long(
    result,
    stubnames=["med_frame_", "lat_frame_"],
    i=column_names,
    j="frame",
).rename(columns=pressure_names)

In [4]:
# Display the reshaped (tall) table to check the result of the transformation.
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0,Unnamed: 43_level_0,Max_tib_med_contact_pressure,Max_tib_lat_contact_pressure
Code,Patella_PN,Femur_PN,Tibia_PN,Patella_Car_PN,Femur_Car_PN,Tibia_M_Car_PN,Tibia_L_Car_PN,Patella_volume,Femur_volume,Tibia_volume,Patella_Car_volume,Femur_Car_volume,Tibia_M_Car_volume,Tibia_L_Car_volume,Patella_bounds_x,Femur_bounds_x,Tibia_bounds_x,Patella_Car_bounds_x,Femur_Car_bounds_x,Tibia_M_Car_bounds_x,Tibia_L_Car_bounds_x,Patella_bounds_y,Femur_bounds_y,Tibia_bounds_y,Patella_Car_bounds_y,Femur_Car_bounds_y,Tibia_M_Car_bounds_y,Tibia_L_Car_bounds_y,Patella_bounds_z,Femur_bounds_z,Tibia_bounds_z,Patella_Car_bounds_z,Femur_Car_bounds_z,Tibia_M_Car_bounds_z,Tibia_L_Car_bounds_z,Simulation_len,Max_dist_femur_tibia_lat_car,Min_dist_femur_tibia_lat_car,Mean_dist_femur_tibia_lat_car,Max_dist_femur_tibia_med_car,Min_dist_femur_tibia_med_car,Mean_dist_femur_tibia_med_car,frame,Unnamed: 44_level_1,Unnamed: 45_level_1
9003406M00,1137,4142,2789,5275,21420,2840,2840,33682.157434,231437.991665,139880.512171,5449.789170,21567.928389,2819.946838,3458.419752,58.810320,92.530422,92.080981,59.695596,88.260389,35.666414,38.220578,54.817801,85.703219,61.531890,47.659671,49.311722,11.095169,11.084142,30.210769,73.680822,61.784069,16.526861,76.027084,45.495615,39.069456,0,13.941570,0.987391,6.105886,10.304892,1.376902,4.580906,0,7.619495,6.696390
9003406M00,1137,4142,2789,5275,21420,2840,2840,33682.157434,231437.991665,139880.512171,5449.789170,21567.928389,2819.946838,3458.419752,58.810320,92.530422,92.080981,59.695596,88.260389,35.666414,38.220578,54.817801,85.703219,61.531890,47.659671,49.311722,11.095169,11.084142,30.210769,73.680822,61.784069,16.526861,76.027084,45.495615,39.069456,0,13.941570,0.987391,6.105886,10.304892,1.376902,4.580906,1,8.068417,5.042103
9003406M00,1137,4142,2789,5275,21420,2840,2840,33682.157434,231437.991665,139880.512171,5449.789170,21567.928389,2819.946838,3458.419752,58.810320,92.530422,92.080981,59.695596,88.260389,35.666414,38.220578,54.817801,85.703219,61.531890,47.659671,49.311722,11.095169,11.084142,30.210769,73.680822,61.784069,16.526861,76.027084,45.495615,39.069456,0,13.941570,0.987391,6.105886,10.304892,1.376902,4.580906,2,7.796326,5.344840
9003406M00,1137,4142,2789,5275,21420,2840,2840,33682.157434,231437.991665,139880.512171,5449.789170,21567.928389,2819.946838,3458.419752,58.810320,92.530422,92.080981,59.695596,88.260389,35.666414,38.220578,54.817801,85.703219,61.531890,47.659671,49.311722,11.095169,11.084142,30.210769,73.680822,61.784069,16.526861,76.027084,45.495615,39.069456,0,13.941570,0.987391,6.105886,10.304892,1.376902,4.580906,3,8.058777,6.240049
9003406M00,1137,4142,2789,5275,21420,2840,2840,33682.157434,231437.991665,139880.512171,5449.789170,21567.928389,2819.946838,3458.419752,58.810320,92.530422,92.080981,59.695596,88.260389,35.666414,38.220578,54.817801,85.703219,61.531890,47.659671,49.311722,11.095169,11.084142,30.210769,73.680822,61.784069,16.526861,76.027084,45.495615,39.069456,0,13.941570,0.987391,6.105886,10.304892,1.376902,4.580906,4,8.098284,7.306662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993846M12,873,3921,2831,5690,21495,2840,2840,24799.135262,226161.340527,154353.942127,2906.365014,21666.378965,3118.786380,3829.389323,46.939792,87.462473,85.571822,47.164029,84.428101,35.739801,39.183579,49.869225,82.363148,60.384777,41.096517,46.442852,10.847292,16.060068,25.000336,70.437008,63.702576,13.315780,74.592797,50.467989,45.786630,12,18.862473,2.616759,6.815766,15.943200,2.833736,6.482209,236,11.455851,7.974414
9993846M12,873,3921,2831,5690,21495,2840,2840,24799.135262,226161.340527,154353.942127,2906.365014,21666.378965,3118.786380,3829.389323,46.939792,87.462473,85.571822,47.164029,84.428101,35.739801,39.183579,49.869225,82.363148,60.384777,41.096517,46.442852,10.847292,16.060068,25.000336,70.437008,63.702576,13.315780,74.592797,50.467989,45.786630,12,18.862473,2.616759,6.815766,15.943200,2.833736,6.482209,237,11.642740,8.043670
9993846M12,873,3921,2831,5690,21495,2840,2840,24799.135262,226161.340527,154353.942127,2906.365014,21666.378965,3118.786380,3829.389323,46.939792,87.462473,85.571822,47.164029,84.428101,35.739801,39.183579,49.869225,82.363148,60.384777,41.096517,46.442852,10.847292,16.060068,25.000336,70.437008,63.702576,13.315780,74.592797,50.467989,45.786630,12,18.862473,2.616759,6.815766,15.943200,2.833736,6.482209,238,12.039758,8.359842
9993846M12,873,3921,2831,5690,21495,2840,2840,24799.135262,226161.340527,154353.942127,2906.365014,21666.378965,3118.786380,3829.389323,46.939792,87.462473,85.571822,47.164029,84.428101,35.739801,39.183579,49.869225,82.363148,60.384777,41.096517,46.442852,10.847292,16.060068,25.000336,70.437008,63.702576,13.315780,74.592797,50.467989,45.786630,12,18.862473,2.616759,6.815766,15.943200,2.833736,6.482209,239,11.383085,8.170403


In [5]:
# Store the tall table next to the wide one, in the same processed data-set folder.
output_path = processed_data_dir / data_set_name / "tall_out.csv"

# The index is written with the default settings, so the index levels
# (identifier columns and 'frame') end up as the leading columns of the CSV.
result.to_csv(output_path)
output_path

WindowsPath('D:/University/Classes/CS481_Senior_Design_Project/f22-ai-cbl/data/processed/set_3/tall_out.csv')