In [None]:
# Re-import libraries after reset
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from scipy.stats import pearsonr

In [None]:

# Re-load the training data from its CSV export into a DataFrame.
TRAIN_CSV_PATH = '/train_subset (1).csv'
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Encode categorical variables
# Text-valued columns are converted from 'Yes'/'No' labels to 1/0; columns that
# are not text (object or string dtype) are left untouched, so re-running this
# cell after a successful encoding is a no-op.
categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
yes_no_codes = {'Yes': 1, 'No': 0}
for col in categorical_features:
    column = train_data[col]
    holds_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not holds_text:
        continue

    encoded = column.map(yes_no_codes)

    # Series.map() turns every label that is absent from the mapping into NaN
    # without complaint. Values that were already missing stay missing, but a
    # non-missing label that failed to map is reported immediately instead of
    # surfacing later as an unexplained NaN.
    unexpected = column[encoded.isna() & column.notna()].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Column {col!r} contains labels other than 'Yes'/'No': "
            f"{list(unexpected)}"
        )

    train_data[col] = encoded

In [None]:
# Encode target variable
# The 'Yes'/'No' -> 1/0 mapping is applied only while the column still holds
# text. Mapping an already-encoded 0/1 column would replace every value with
# NaN, so this guard keeps the cell safe to run more than once.
target_labels = train_data['cardio']
if (
    pd.api.types.is_object_dtype(target_labels)
    or pd.api.types.is_string_dtype(target_labels)
):
    encoded_target = target_labels.map({'Yes': 1, 'No': 0})

    # Non-missing labels that are not in the mapping would silently become
    # NaN; report them instead of corrupting the target.
    unexpected = target_labels[encoded_target.isna() & target_labels.notna()].unique()
    if len(unexpected) > 0:
        raise ValueError(
            "Column 'cardio' contains labels other than 'Yes'/'No': "
            f"{list(unexpected)}"
        )

    train_data['cardio'] = encoded_target

In [None]:
# --- Chi-Square Test for Categorical Features ---
# Pearson chi-square test of independence between each categorical feature and
# the target, computed from the feature-by-target contingency table.
# sklearn.feature_selection.chi2 is documented for non-negative features such
# as booleans or frequencies; applied to label-coded categories its statistic
# depends on the arbitrary integer codes, so a contingency-table test is used.
from scipy.stats import chi2_contingency

X_cat = train_data[categorical_features]
y_target = train_data['cardio']

chi2_stats = []
p_values_cat = []
for feature in categorical_features:
    # Observed counts for every (feature level, target class) combination.
    contingency = pd.crosstab(X_cat[feature], y_target)
    # Returns (statistic, p-value, degrees of freedom, expected frequencies);
    # only the first two are reported. SciPy applies Yates' continuity
    # correction by default when the table has one degree of freedom.
    statistic, p_value, _, _ = chi2_contingency(contingency)
    chi2_stats.append(statistic)
    p_values_cat.append(p_value)

# Keep the array types the previous sklearn-based implementation exposed.
chi2_stats = np.array(chi2_stats)
p_values_cat = np.array(p_values_cat)

chi_square_results = pd.DataFrame({
    'Feature': categorical_features,
    'Chi2 Statistic': chi2_stats,
    'p-value': p_values_cat
})

In [None]:
# --- Pearson Correlation for Continuous Features ---
# For every continuous feature, compute the Pearson correlation with the target
# and label its magnitude (|r| > 0.5 Strong, |r| > 0.3 Moderate, else Weak) and
# its sign.
continuous_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
pearson_results = []

for feature in continuous_features:
    corr, p_value = pearsonr(train_data[feature], y_target)

    if np.isnan(corr):
        # A NaN coefficient has neither a magnitude nor a sign; every
        # comparison against NaN is False, so without this branch it would be
        # reported as 'Weak' / 'Negative'.
        strength = 'Undefined'
        direction = 'Undefined'
    else:
        magnitude = abs(corr)
        if magnitude > 0.5:
            strength = 'Strong'
        elif magnitude > 0.3:
            strength = 'Moderate'
        else:
            strength = 'Weak'

        # A coefficient of exactly zero is neither positive nor negative.
        if corr > 0:
            direction = 'Positive'
        elif corr < 0:
            direction = 'Negative'
        else:
            direction = 'None'

    pearson_results.append((feature, corr, p_value, strength, direction))

pearson_results_df = pd.DataFrame(pearson_results, columns=['Feature', 'Correlation Coefficient', 'p-value', 'Strength', 'Direction'])


In [None]:
# Display results
# display() renders each DataFrame as its own rich table; a bare tuple of
# DataFrames as the last expression is shown as a single plain-text tuple repr.
display(chi_square_results, pearson_results_df)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize continuous features
# Select the continuous columns, then fit the scaler on them and transform the
# same data in a single step.
X_continuous = train_data.loc[:, ['age', 'height', 'weight', 'ap_hi', 'ap_lo']]
scaler = StandardScaler()
X_cont_scaled = scaler.fit_transform(X_continuous)
