Bringing Libraries into scope

In [614]:
from nis import match

import pandas as pd
import numpy as np
import re

# Preprocessing

In [615]:
df_raw = pd.read_csv('../data_raw/raw_data.txt')
df_raw

Unnamed: 0,Height \tWeight \tAge \tGrip strength \tFrailty
0,65.8 \t112 \t30 \t30 \tN
1,71.5 \t136 \t19 \t31 \tN
2,69.4 \t153 \t45 \t29 \tN
3,68.2 \t142 \t22 \t28 \tY
4,67.8 \t144 \t29 \t24 \tY
5,68.7 \t123 \t50 \t26 \tN
6,69.8 \t141 \t51 \t22 \tY
7,70.1 \t136 \t23 \t20 \tY
8,67.9 \t112 \t17 \t19 \tN
9,66.8 \t120 \t39 \t31 \tN


As you can see, the data copy-pasted from the table from `Part_1/data_raw/Assignment 1.docx` is very messy and contains non-alphanumeric chars like `\t` & ` ` (`NBSP`).
Will clean up programmatically...

In [616]:
def clean_data(raw_path, clean_path):
    with open(raw_path, 'r') as f:
        raw_data = f.read()
    #First removing the whitespace chars characters
    cleaned_data = raw_data.replace(' ', ' ').replace('\\t', ' ').replace('\\n', ' ')
    lines = cleaned_data.strip().split('\n')

    data_rows = []
    headers = re.split(r'\s{2,}', lines[0].strip())

    for line in lines[1:]:
        values = re.split(r'\s+', line.strip())
        if values:
            data_rows.append(values)

    df = pd.DataFrame(data_rows, columns=headers)
    df.to_csv(clean_path, index=False)
    print(f"Cleaned data saved to {clean_path}")

And now we can call the pre-processing logic, visualize the data, and confirm both the csv in `/data_clean/cleaned_data.csv` and produced df look correct:

In [617]:
clean_data('../data_raw/raw_data.txt', '../data_clean/cleaned_data.csv')

df_cleaned = pd.read_csv('../data_clean/cleaned_data.csv')

df_cleaned

Cleaned data saved to ../data_clean/cleaned_data.csv


Unnamed: 0,Height,Weight,Age,Grip strength,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


With this pre-processing done, we can transition to requirements; First Up:
# A. Unit Standardization
- Height_m = Height_in * 0.0254
- Weight_kg = Weight_lb * 0.45359237
### Note:
- Types from txt file may have been inferred incorrectly so type-casting certain columns May be necessary for dtype interpretability:
- `/var/folders/ls/hnyqgnc13n7cm30s_dv3yz4r0000gn/T/ipykernel_24574/3334074510.py:5: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[50.802345 61.68856  69.39963  64.41012  65.3173   55.79186  63.956524
 61.68856  50.802345 54.431084]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_normalized.loc[:, 'Weight'] = (df_normalized['Weight'].astype('float32') * weight_pounds_to_kg)`

In [618]:
# Handoff
df_normalized = df_cleaned

# aforementioned column type-casting
df_normalized['Weight'] = df_normalized['Weight'].astype('float64')
df_normalized['Height'] = df_normalized['Height'].astype('float64')

height_in_to_m = 0.0254
weight_pounds_to_kg = 0.45359237

#update values
df_normalized.loc[:, 'Weight'] = (df_normalized['Weight'].astype('float64') * weight_pounds_to_kg)
df_normalized.loc[:, 'Height'] = (df_normalized['Height'].astype('float64') * height_in_to_m)
#
# # rename columns
df_normalized = df_normalized.rename(columns={'Height': 'Height_m', 'Weight': 'Weight_kg'})

df_normalized



Unnamed: 0,Height_m,Weight_kg,Age,Grip strength,Frailty
0,1.67132,50.802345,30,30,N
1,1.8161,61.688562,19,31,N
2,1.76276,69.399633,45,29,N
3,1.73228,64.410117,22,28,Y
4,1.72212,65.317301,29,24,Y
5,1.74498,55.791862,50,26,N
6,1.77292,63.956524,51,22,Y
7,1.78054,61.688562,23,20,Y
8,1.72466,50.802345,17,19,N
9,1.69672,54.431084,39,31,N


# B. Feature Engineering
i. BMI = Weight_kg / (Height_m ** 2) (round to 2 decimals). \
ii. AgeGroup (categorical): "<30", "30–45", "46–60", ">60" based on Age_yr.

In [619]:
#i) BMI
df_fe = df_normalized
df_fe['BMI'] = round(df_fe['Weight_kg'] / (df_fe['Height_m']**2), 2)
# df_fe
#ii Partitioning age range
bins = [0, 29, 45, 59, np.inf]
labels = ["<30", "30–45", "46–60", ">60"]

# Apply pd.cut to the 'Age' column to create a new 'Age_Group' column
df_fe['Age_Group'] = pd.cut(x=df_fe['Age'], bins=bins, labels=labels)
df_fe

Unnamed: 0,Height_m,Weight_kg,Age,Grip strength,Frailty,BMI,Age_Group
0,1.67132,50.802345,30,30,N,18.19,30–45
1,1.8161,61.688562,19,31,N,18.7,<30
2,1.76276,69.399633,45,29,N,22.33,30–45
3,1.73228,64.410117,22,28,Y,21.46,<30
4,1.72212,65.317301,29,24,Y,22.02,<30
5,1.74498,55.791862,50,26,N,18.32,46–60
6,1.77292,63.956524,51,22,Y,20.35,46–60
7,1.78054,61.688562,23,20,Y,19.46,<30
8,1.72466,50.802345,17,19,N,17.08,<30
9,1.69672,54.431084,39,31,N,18.91,30–45


# C) Categorical & Numeric Encoding

i. Binary encoding: Frailty_binary (Y→1, N→0, store as `int8`).

In [620]:
#Handoff
df_cat_bin_encoded = df_fe.copy()

#Binary encoding
df_cat_bin_encoded['Frailty_binary'] = df_cat_bin_encoded['Frailty'].str.strip().map({'Y': 1, 'N': 0}).astype('int8')
df_cat_bin_encoded.drop('Frailty', axis=1)


Unnamed: 0,Height_m,Weight_kg,Age,Grip strength,BMI,Age_Group,Frailty_binary
0,1.67132,50.802345,30,30,18.19,30–45,0
1,1.8161,61.688562,19,31,18.7,<30,0
2,1.76276,69.399633,45,29,22.33,30–45,0
3,1.73228,64.410117,22,28,21.46,<30,1
4,1.72212,65.317301,29,24,22.02,<30,1
5,1.74498,55.791862,50,26,18.32,46–60,0
6,1.77292,63.956524,51,22,20.35,46–60,1
7,1.78054,61.688562,23,20,19.46,<30,1
8,1.72466,50.802345,17,19,17.08,<30,0
9,1.69672,54.431084,39,31,18.91,30–45,0


ii. One‑hot encode Age_Group into columns: `AgeGroup_<30, AgeGroup_30–45, AgeGroup_46–60, AgeGroup_>60`

In [621]:
df_one_hot_enc_age = df_cat_bin_encoded.copy()
df_one_hot_enc_age = pd.get_dummies(df_one_hot_enc_age, columns=['Age_Group'])
df_one_hot_enc_age

Unnamed: 0,Height_m,Weight_kg,Age,Grip strength,Frailty,BMI,Frailty_binary,Age_Group_<30,Age_Group_30–45,Age_Group_46–60,Age_Group_>60
0,1.67132,50.802345,30,30,N,18.19,0,False,True,False,False
1,1.8161,61.688562,19,31,N,18.7,0,True,False,False,False
2,1.76276,69.399633,45,29,N,22.33,0,False,True,False,False
3,1.73228,64.410117,22,28,Y,21.46,1,True,False,False,False
4,1.72212,65.317301,29,24,Y,22.02,1,True,False,False,False
5,1.74498,55.791862,50,26,N,18.32,0,False,False,True,False
6,1.77292,63.956524,51,22,Y,20.35,1,False,False,True,False
7,1.78054,61.688562,23,20,Y,19.46,1,True,False,False,False
8,1.72466,50.802345,17,19,N,17.08,0,True,False,False,False
9,1.69672,54.431084,39,31,N,18.91,0,False,True,False,False


# EDA & Reporting

i) Compute summary table: mean/median/std for numeric columns; save to reports/findings.md .

In [622]:
df_final = df_one_hot_enc_age

report_path = '../reports/findings/summary.md'

with open(report_path,"w+") as f:
    print(f"# i Quantify relation of strength ↔ frailty: compute correlation between Grip_kg and Frailty_binary, and report it.")
    f.write(df_final_summary.describe().to_markdown() + "\n")

# df_final_summary

# i Quantify relation of strength ↔ frailty: compute correlation between Grip_kg and Frailty_binary, and report it.


ii) Quantify relation of strength ↔ frailty: compute correlation between Grip_kg and Frailty_binary, and report it.


In [623]:
def interpret_correlation(flt: float) -> str:
    if not (-1.0 <= flt <= 1.0):
        return "Coefficient out of expected range (-1.0 to 1.0)"
    if flt >= 0.7:
        return 'strongly positive'
    elif flt >= 0.3:
        return 'moderately positive'
    elif flt > 0.0:
        return 'weakly positive'
    elif flt == 0.0:
        return 'neutral'
    elif flt <= -0.7:
        return 'strongly negative'
    elif flt <= -0.3:
        return 'moderately negative'
    else:
        return 'weakly negative'

Correlation reporting

In [624]:
grip_frailty_correlation = df_final['Grip strength'].corr(df_final['Frailty_binary'])

with open(report_path, 'a+') as f:
    f.write(f"The correlation coefficient is: **{grip_frailty_correlation:.4f}**, which can be categorized as a **{interpret_correlation(grip_frailty_correlation)}** correlation\n")

print(f"Correlation computed and saved to {report_path}")

Correlation computed and saved to ../reports/findings/summary.md
