In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Load the raw data from the CSV file
raw_data = pd.read_csv("raw_data.csv")

# Step 1: Handling Missing Values (No missing values in the provided data)

# Step 2: Encoding Categorical Data
label_encoder = LabelEncoder()
raw_data['Frailty'] = label_encoder.fit_transform(raw_data['Frailty'])  # Encode 'Frailty' (N -> 0, Y -> 1)

# Step 3: Feature Scaling
scaler = StandardScaler()
numeric_cols = ['Height', 'Weight', 'Age', 'Grip strength']
raw_data[numeric_cols] = scaler.fit_transform(raw_data[numeric_cols])

# Step 4: Data Splitting (80% training, 20% testing)
X = raw_data.drop('Frailty', axis=1)
y = raw_data['Frailty']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the preprocessed data
print("Preprocessed Data:")
print(raw_data)

# Save the cleaned data
raw_data.to_csv("cleaned_data.csv", index=False)

Preprocessed Data:
     Height    Weight       Age  Grip strength  Frailty
0 -1.766641 -1.473912 -0.204911       0.932505        0
1  1.829735  0.303670 -1.106520       1.165631        0
2  0.504754  1.562791  1.024556       0.699379        0
3 -0.252377  0.748066 -0.860627       0.466252        1
4 -0.504754  0.896198 -0.286876      -0.466252        1
5  0.063094 -0.659187  1.434378       0.000000        0
6  0.757132  0.674000  1.516343      -0.932505        1
7  0.946415  0.303670 -0.778662      -1.398757        1
8 -0.441660 -1.473912 -1.270449      -1.631883        0
9 -1.135698 -0.881385  0.532769       1.165631        0
