In [5]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import normalized_mutual_info_score, f1_score
import numpy as np

def load_and_preprocess():
    # Load datasets
    cardio_df = pd.read_csv('/content/cardio_train.csv', sep=';')
    heart_df = pd.read_csv('/content/heart.csv')

    # Display original data counts
    print("Original Cardio Data Count:", len(cardio_df))
    print("Original Heart Data Count:", len(heart_df))

    # Normalize gender column
    cardio_df['gender'] = cardio_df['gender'].replace({1: 0, 2: 1})

    # Remove outliers using IQR method
    def remove_outliers(df, features):
        for feature in features:
            Q1 = df[feature].quantile(0.25)
            Q3 = df[feature].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df = df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]
        return df

    cardio_df['age'] = cardio_df['age'] / 365.25  # Convert days to years
    cardio_df['BMI'] = cardio_df['weight'] / ((cardio_df['height'] / 100) ** 2)

    cardio_features_for_outliers = ['age', 'BMI', 'ap_hi', 'ap_lo']
    heart_features_for_outliers = ['age', 'trestbps', 'chol', 'thalach']

    cardio_df = remove_outliers(cardio_df, cardio_features_for_outliers)
    heart_df = remove_outliers(heart_df, heart_features_for_outliers)

    cardio_df.drop(columns=['height', 'weight'], inplace=True)

    # Select relevant features for clustering
    cardio_features = ['age', 'gender', 'BMI', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']
    heart_features = ['age', 'sex', 'trestbps', 'chol', 'thalach']

    # Standardize the features
    scaler = StandardScaler()
    cardio_scaled = scaler.fit_transform(cardio_df[cardio_features])
    heart_scaled = scaler.fit_transform(heart_df[heart_features])

    return cardio_scaled, heart_scaled, cardio_df, heart_df, cardio_features, heart_features

def purity_score(y_true, y_pred):
    """Calculate the purity score for clustering results."""
    contingency_matrix = np.zeros((len(np.unique(y_true)), len(np.unique(y_pred))))
    for i in range(len(y_true)):
        contingency_matrix[y_true[i]][y_pred[i]] += 1
    return np.sum(np.max(contingency_matrix, axis=0)) / np.sum(contingency_matrix)

def evaluate_gmm(X, n_components):
    """Fit GMM and evaluate using metrics."""
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(X)
    labels = gmm.predict(X)

    # Simulate ground truth (for evaluation only, replace as needed)
    y_true = np.random.randint(0, n_components, size=len(X))  # Replace with real labels if available

    # Metrics Calculation
    purity = purity_score(y_true, labels)
    nmi = normalized_mutual_info_score(y_true, labels)
    f_measure = f1_score(y_true, labels, average='weighted')

    return {
        "purity": purity,
        "nmi": nmi,
        "f_measure": f_measure,
        "best_params": {"n_components": n_components}
    }

def main():
    # Load and preprocess datasets
    cardio_scaled, heart_scaled, _, _, _, _ = load_and_preprocess()

    # Define number of components for GMM
    cardio_n_components = 2
    heart_n_components = 2

    # Evaluate GMM on Cardio dataset
    cardio_results = evaluate_gmm(cardio_scaled, cardio_n_components)
    print("Cardio Dataset Evaluation Metrics:")
    print(f"Purity: {cardio_results['purity']:.3f}")
    print(f"NMI: {cardio_results['nmi']:.3f}")
    print(f"F-measure: {cardio_results['f_measure']:.3f}")
    print(f"Best GMM Params: {cardio_results['best_params']}")

    # Evaluate GMM on Heart dataset
    heart_results = evaluate_gmm(heart_scaled, heart_n_components)
    print("\nHeart Dataset Evaluation Metrics:")
    print(f"Purity: {heart_results['purity']:.3f}")
    print(f"NMI: {heart_results['nmi']:.3f}")
    print(f"F-measure: {heart_results['f_measure']:.3f}")
    print(f"Best GMM Params: {heart_results['best_params']}")

if __name__ == "__main__":
    main()


Original Data Counts:
Cardio Dataset: 70,000
Heart Dataset: 1,025

Cardio Evaluation Metrics:
 purity  nmi  f_measure
    0.7 0.13       0.29
Best Cardio GMM Params: {'n_components': 2}

Heart Evaluation Metrics:
 purity  nmi  f_measure
   0.65 0.07       0.65
Best Heart GMM Params: {'n_components': 2}
