In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load dataset
df = pd.read_csv("cvd_classified_dataset.csv")

# Encode target
y = df['CVD_Status'].map({"Non-CVD": 0, "CVD": 1})

# Extract input features
X = df[['Ages', 'Gender', 'Height', 'Weight', 'Activity Level', 'Disease']]

# Map Gender
X['Gender'] = X['Gender'].map({'Male': 0, 'Female': 1})

# Save metadata for dropdowns
metadata = {
    "Gender": df["Gender"].dropna().unique().tolist(),
    "Activity Level": df["Activity Level"].dropna().unique().tolist(),
    "Disease": df["Disease"].dropna().unique().tolist()
}
for k in metadata: metadata[k].sort()

with open("cvd_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

# ColumnTransformer
activity_order = [['Sedentary', 'Lightly Active', 'Moderately Active', 'Very Active', 'Extremely Active']]
preprocessor = ColumnTransformer(transformers=[
    ('activity_level', OrdinalEncoder(categories=activity_order), ['Activity Level']),
    ('disease', OrdinalEncoder(), ['Disease']),
    ('scale', StandardScaler(), ['Ages', 'Height', 'Weight'])
], remainder='passthrough')  # Keep Gender

# Models
models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(probability=True)
}

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

best_score = 0
best_model = None
best_model_name = ""

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessing', preprocessor),
        ('classifier', model)
    ])
    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    print(f"{name} Accuracy: {score:.2f}")
    if score > best_score:
        best_score = score
        best_model = pipe
        best_model_name = name

print(f"\n✅ Best Model: {best_model_name} ({best_score:.2f})")
with open("best_cvd_model.pkl", "wb") as f:
    pickle.dump(best_model, f)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Gender'] = X['Gender'].map({'Male': 0, 'Female': 1})


LogisticRegression Accuracy: 0.86
DecisionTree Accuracy: 1.00
RandomForest Accuracy: 1.00
SVM Accuracy: 0.86

✅ Best Model: DecisionTree (1.00)


In [4]:
df.iloc[0]

Ages                                                            25
Gender                                                        Male
Height                                                         180
Weight                                                          80
Activity Level                                   Moderately Active
Dietary Preference                                        Omnivore
Daily Calorie Target                                          2000
Protein                                                        120
Sugar                                                        125.0
Sodium                                                        24.0
Calories                                                      2020
Carbohydrates                                                  250
Fiber                                                         30.0
Fat                                                             60
Breakfast Suggestion                 Oatmeal with berries and 

In [None]:
# # import streamlit as st
# # import pickle
# # import numpy as np
# # import pandas as pd

# # key = "AIzaSyATtxQh_spw-yMHzIA-gyWdUxp5bfqry0s"
# # import google.generativeai as genai

# # genai.configure(api_key=key)  # Replace with your actual Gemini API key
# # gen_model = genai.GenerativeModel("gemini-2.5-flash")

# # # Load model and metadata
# # model = pickle.load(open("best_cvd_model.pkl", "rb"))
# # metadata = pickle.load(open("cvd_metadata.pkl", "rb"))

# # # Page setup
# # st.set_page_config(page_title="CVD Prediction App", page_icon="❤️")
# # st.title("🩺 CVD Risk Prediction")
# # st.markdown("Enter your details to check your risk for cardiovascular disease:")

# # # Input form
# # with st.form("cvd_form"):
# #     col1, col2 = st.columns(2)
    
# #     with col1:
# #         age = st.slider("Age", 10, 100, 30)
# #         height = st.number_input("Height (cm)", min_value=100, max_value=250, value=170)
# #         weight = st.number_input("Weight (kg)", min_value=30, max_value=200, value=70)

# #     with col2:
# #         gender = st.selectbox("Gender", metadata['Gender'])
# #         activity = st.selectbox("Activity Level", metadata['Activity Level'])
# #         disease = st.selectbox("Disease", metadata['Disease'])
# #         diet_prefrence = st.selectbox("Diet", ["Vegetarian", "Non-Vegetarian"])
# #     submitted = st.form_submit_button("🔍 Predict")

# # if submitted:
# #     gender_num = 0 if gender == "Male" else 1
# #     input_data = pd.DataFrame([{
# #             "Ages": age,
# #             "Gender": gender_num,
# #             "Height": height,
# #             "Weight": weight,
# #             "Activity Level": activity,
# #             "Disease": disease
            
# #         }])

# #     prediction = model.predict(input_data)[0]

# #     prob = model.predict_proba(input_data)[0][prediction] if hasattr(model, 'predict_proba') else None

# #     prompt_template = f"""
# #                             I want you to act as a professional nutritionist and fitness expert.

# #                             Based on the following user profile:
# #                             {input_data} 
# #                             and {diet_prefrence}
# #                             if {prediction} equal to 1 then he/she suffer from High Risk of CVD!
# #                             and if {prediction} equal to 0 then he/she has no CVD! risk 
# #                             so based of prediction value create a diet plan.
# #                             Give a full-day personalized plan including:
# #                             1. Breakfast suggestion
# #                             2. Lunch suggestion
# #                             3. Dinner suggestion
# #                             4. Healthy snack ideas
# #                             5. Simple home-based workout routine

# #                             Ensure recommendations align with dietary and health restrictions.
# #                             Give the plan in bullet points.
# #                             """
        
# #     st.subheader("🔎 Prediction Result:")
# #     if prediction == 1:
        
        
# #         st.error(f"⚠️ High Risk of CVD! (Confidence: {prob:.2f})" if prob else "⚠️ High Risk of CVD!")
        
# #         if st.button("🍽️ Generate Personalized Diet Plan"):
# #             with st.spinner("Generating your personalized diet plan..."):
# #                 gemini_response = gen_model.generate_content(prompt_template)

# #             st.markdown("### 💡 Personalized Daily Plan")
# #             st.markdown(gemini_response.text)
        
# #     else:
# #         st.success(f"✅ Low Risk of CVD (Confidence: {prob:.2f})" if prob else "✅ Low Risk of CVD")
# #         st.button("Not risk but you want a diet plan")
# #         response = gen_model.generate_content(prompt_template)

# #         st.title("💡 Daily Plan:")
# #         st.markdown(response.text)
        

import streamlit as st
import pickle
import numpy as np
import pandas as pd
import google.generativeai as genai

# 🔐 Gemini setup
genai.configure(api_key="AIzaSyATtxQh_spw-yMHzIA-gyWdUxp5bfqry0s")
gemini_model = genai.GenerativeModel("gemini-2.5-flash")

# 📦 Load model and metadata
ml_model = pickle.load(open("best_cvd_model.pkl", "rb"))
metadata = pickle.load(open("cvd_metadata.pkl", "rb"))

# 🩺 Page setup
st.set_page_config(page_title="CVD Prediction App", page_icon="❤️")
st.title("🩺 CVD Risk Prediction & Diet Recommendation")

# Initialize session state if not present
if "prediction" not in st.session_state:
    st.session_state.prediction = None
if "show_diet" not in st.session_state:
    st.session_state.show_diet = False
if "input_data" not in st.session_state:
    st.session_state.input_data = {}

# --- Form for input ---
with st.form("cvd_form"):
    col1, col2 = st.columns(2)
    with col1:
        age = st.slider("Age", 10, 100, 30)
        height = st.number_input("Height (cm)", min_value=100, max_value=250, value=170)
        weight = st.number_input("Weight (kg)", min_value=30, max_value=200, value=70)
    with col2:
        gender = st.selectbox("Gender", metadata['Gender'])
        activity = st.selectbox("Activity Level", metadata['Activity Level'])
        disease = st.selectbox("Disease", metadata['Disease'])
        diet_pref = st.selectbox("Dietary Preference", ["Vegetarian", "Non-Vegetarian"])
    submitted = st.form_submit_button("🔍 Predict")

# --- On form submit ---
if submitted:
    gender_num = 0 if gender == "Male" else 1
    input_df = pd.DataFrame([{
        "Ages": age,
        "Gender": gender_num,
        "Height": height,
        "Weight": weight,
        "Activity Level": activity,
        "Disease": disease
    }])
    prediction = ml_model.predict(input_df)[0]
    prob = ml_model.predict_proba(input_df)[0][prediction] if hasattr(ml_model, 'predict_proba') else None

    # Save state
    st.session_state.prediction = prediction
    st.session_state.input_data = {
        "age": age, "gender": gender, "height": height, "weight": weight,
        "activity": activity, "disease": disease, "diet_pref": diet_pref
    }
    st.session_state.show_diet = False  # Reset if previously clicked

# --- Show Prediction Result ---
if st.session_state.prediction is not None:
    prediction = st.session_state.prediction
    data = st.session_state.input_data

    st.subheader("🔎 Prediction Result:")
    if prediction == 1:
        st.error("⚠️ High Risk of CVD")

        # Show button to generate plan
        if st.button("🍽️ Generate Personalized Diet Plan"):
            st.session_state.show_diet = True

    else:
        st.success("✅ Low Risk of CVD")

# --- Show Diet Plan if button clicked ---
if st.session_state.show_diet:
    data = st.session_state.input_data
    prompt = f"""
    I want you to act as a professional nutritionist and fitness expert.

    Based on this profile:
    - Age: {data['age']}
    - Gender: {data['gender']}
    - Height: {data['height']} cm
    - Weight: {data['weight']} kg
    - Activity Level: {data['activity']}
    - Disease: {data['disease']}
    - Dietary Preference: {data['diet_pref']}
    - Health Condition: High Risk of CVD

    Please provide a full-day personalized diet and workout plan:
    - Breakfast
    - Lunch
    - Dinner
    - Healthy snacks
    - Simple home-based workout

    Use bullet points. Avoid excess salt, sugar, and red meat.
    """

    with st.spinner("Generating diet plan..."):
        response = gemini_model.generate_content(prompt)

    st.markdown("### 💡 Personalized Diet Plan")
    st.markdown(response.text)


In [25]:
import numpy as np
import pandas as pd

In [None]:
df= pd.read_csv('cvd_classified_dataset.csv')

In [27]:
data1.head()

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Dietary Preference,Daily Calorie Target,Protein,Sugar,Sodium,...,Dinner Calories,Dinner Protein.1,Dinner Carbohydrates.1,Dinner Fats,Snack Suggestion,Snacks Calories,Snacks Protein,Snacks Carbohydrates,Snacks Fats,Disease
0,25,Male,180,80,Moderately Active,Omnivore,2000,120,125.0,24.0,...,2020.0,60.0,250.0,60.0,Greek yogurt with fruit,150,10,20,5,Weight Gain
1,32,Female,165,65,Lightly Active,Vegetarian,1600,80,100.0,16.0,...,10.0,5.0,31.0,27.0,Apple with almond butter,180,4,30,8,"Weight Gain, Hypertension, Heart Disease"
2,48,Male,175,95,Sedentary,Vegan,2200,100,150.0,20.0,...,326.0,55.0,150.0,15.0,Trail mix,300,8,34,20,Weight Gain
3,55,Female,160,70,Very Active,Omnivore,2500,140,175.0,28.0,...,278.0,20.0,163.0,27.0,Banana with peanut butter,210,5,30,9,Weight Gain
4,62,Male,170,85,Sedentary,Vegetarian,2000,80,125.0,16.0,...,186.0,6.3,186.0,22.0,Fruit and nut mix,250,6,28,15,Weight Gain


In [29]:
data3.head()

Unnamed: 0,Gender,Activity Level,Dietary Preference,Breakfast Suggestion,Lunch Suggestion,Dinner Suggestion,Snack Suggestion,Disease,Ages,Height,Weight,Daily Calorie Target,Protein,Sugar,Sodium,Calories,Carbohydrates,Fiber,Fat
0,Male,Moderately Active,Omnivore,Oatmeal with berries and nuts,Grilled chicken salad with mixed greens,Salmon with roasted vegetables,Greek yogurt with fruit,Weight Gain,25,180,80,2000,120,125.0,24.0,2020,250,30.0,60
1,Female,Lightly Active,Vegetarian,Tofu scramble with veggies,Lentil soup with whole wheat bread,Vegetable stir-fry with brown rice,Apple with almond butter,"Weight Gain, Hypertension, Heart Disease",32,165,65,1600,80,100.0,16.0,1480,200,24.0,40
2,Male,Sedentary,Vegan,Tofu and veggie breakfast burrito,Black bean burger on a whole wheat bun,Lentil and vegetable curry,Trail mix,Weight Gain,48,175,95,2200,100,150.0,20.0,2185,300,36.0,65
3,Female,Very Active,Omnivore,Greek yogurt with granola and fruit,Chicken and vegetable stir-fry,Turkey chili with brown rice,Banana with peanut butter,Weight Gain,55,160,70,2500,140,175.0,28.0,2680,350,42.0,80
4,Male,Sedentary,Vegetarian,Scrambled eggs with whole wheat toast and avocado,Quinoa salad with chickpeas and vegetables,Vegetarian chili with cornbread,Fruit and nut mix,Weight Gain,62,170,85,2000,80,125.0,16.0,1815,250,30.0,55


In [32]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1698 entries, 0 to 1697
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Ages                     1698 non-null   int64  
 1   Gender                   1698 non-null   object 
 2   Height                   1698 non-null   int64  
 3   Weight                   1698 non-null   int64  
 4   Activity Level           1698 non-null   object 
 5   Dietary Preference       1698 non-null   object 
 6   Daily Calorie Target     1698 non-null   int64  
 7   Protein                  1698 non-null   int64  
 8   Sugar                    1698 non-null   float64
 9   Sodium                   1698 non-null   float64
 10  Calories                 1698 non-null   int64  
 11  Carbohydrates            1698 non-null   int64  
 12  Fiber                    1698 non-null   float64
 13  Fat                      1698 non-null   int64  
 14  Breakfast Suggestion    

In [33]:
data1.isnull().sum()

Ages                       0
Gender                     0
Height                     0
Weight                     0
Activity Level             0
Dietary Preference         0
Daily Calorie Target       0
Protein                    0
Sugar                      0
Sodium                     0
Calories                   0
Carbohydrates              0
Fiber                      0
Fat                        0
Breakfast Suggestion       0
Breakfast Calories         0
Breakfast Protein          0
Breakfast Carbohydrates    1
Breakfast Fats             0
Lunch Suggestion           0
Lunch Calories             0
Lunch Protein              0
Lunch Carbohydrates        0
Lunch Fats                 0
Dinner Suggestion          0
Dinner Calories            0
Dinner Protein.1           0
Dinner Carbohydrates.1     0
Dinner Fats                0
Snack Suggestion           0
Snacks Calories            0
Snacks Protein             0
Snacks Carbohydrates       0
Snacks Fats                0
Disease       

In [34]:
data1.dropna(inplace=True)

In [35]:
data2.shape

(1000, 20)

In [36]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Patient_ID                        1000 non-null   object 
 1   Age                               1000 non-null   int64  
 2   Gender                            1000 non-null   object 
 3   Weight_kg                         1000 non-null   float64
 4   Height_cm                         1000 non-null   int64  
 5   BMI                               1000 non-null   float64
 6   Disease_Type                      796 non-null    object 
 7   Severity                          1000 non-null   object 
 8   Physical_Activity_Level           1000 non-null   object 
 9   Daily_Caloric_Intake              1000 non-null   int64  
 10  Cholesterol_mg/dL                 1000 non-null   float64
 11  Blood_Pressure_mmHg               1000 non-null   int64  
 12  Glucose

In [37]:
data2.isnull().sum()

Patient_ID                            0
Age                                   0
Gender                                0
Weight_kg                             0
Height_cm                             0
BMI                                   0
Disease_Type                        204
Severity                              0
Physical_Activity_Level               0
Daily_Caloric_Intake                  0
Cholesterol_mg/dL                     0
Blood_Pressure_mmHg                   0
Glucose_mg/dL                         0
Dietary_Restrictions                334
Allergies                           323
Preferred_Cuisine                     0
Weekly_Exercise_Hours                 0
Adherence_to_Diet_Plan                0
Dietary_Nutrient_Imbalance_Score      0
Diet_Recommendation                   0
dtype: int64

In [38]:
data2.dropna(inplace=True)

In [39]:
data3.shape

(1698, 19)

In [40]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1698 entries, 0 to 1697
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Gender                1698 non-null   object 
 1   Activity Level        1698 non-null   object 
 2   Dietary Preference    1698 non-null   object 
 3   Breakfast Suggestion  1698 non-null   object 
 4   Lunch Suggestion      1698 non-null   object 
 5   Dinner Suggestion     1698 non-null   object 
 6   Snack Suggestion      1698 non-null   object 
 7   Disease               1698 non-null   object 
 8   Ages                  1698 non-null   int64  
 9   Height                1698 non-null   int64  
 10  Weight                1698 non-null   int64  
 11  Daily Calorie Target  1698 non-null   int64  
 12  Protein               1698 non-null   int64  
 13  Sugar                 1698 non-null   float64
 14  Sodium                1698 non-null   float64
 15  Calories             

In [41]:
data3.isnull().sum()

Gender                  0
Activity Level          0
Dietary Preference      0
Breakfast Suggestion    0
Lunch Suggestion        0
Dinner Suggestion       0
Snack Suggestion        0
Disease                 0
Ages                    0
Height                  0
Weight                  0
Daily Calorie Target    0
Protein                 0
Sugar                   0
Sodium                  0
Calories                0
Carbohydrates           0
Fiber                   0
Fat                     0
dtype: int64

In [42]:
data4.shape

(1698, 36)

In [43]:
data4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1698 entries, 0 to 1697
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Ages                     1698 non-null   int64  
 1   Gender                   1698 non-null   object 
 2   Height                   1698 non-null   int64  
 3   Weight                   1698 non-null   int64  
 4   Activity Level           1698 non-null   object 
 5   Dietary Preference       1698 non-null   object 
 6   Daily Calorie Target     1698 non-null   int64  
 7   Protein                  1698 non-null   int64  
 8   Sugar                    1698 non-null   float64
 9   Sodium                   1698 non-null   float64
 10  Calories                 1698 non-null   int64  
 11  Carbohydrates            1698 non-null   int64  
 12  Fiber                    1698 non-null   float64
 13  Fat                      1698 non-null   int64  
 14  Breakfast Suggestion    

In [44]:
data4.isnull().sum()

Ages                       0
Gender                     0
Height                     0
Weight                     0
Activity Level             0
Dietary Preference         0
Daily Calorie Target       0
Protein                    0
Sugar                      0
Sodium                     0
Calories                   0
Carbohydrates              0
Fiber                      0
Fat                        0
Breakfast Suggestion       0
Breakfast Calories         0
Breakfast Protein          0
Breakfast Carbohydrates    1
Breakfast Fats             0
Lunch Suggestion           0
Lunch Calories             0
Lunch Protein              0
Lunch Carbohydrates        0
Lunch Fats                 0
Dinner Suggestion          0
Dinner Calories            0
Dinner Protein.1           0
Dinner Carbohydrates.1     0
Dinner Fats                0
Snack Suggestion           0
Snacks Calories            0
Snacks Protein             0
Snacks Carbohydrates       0
Snacks Fats                0
Disease       

In [45]:
data4.dropna(inplace=True)

In [57]:
import pandas as pd
d = pd.read_csv("cvd_classified_dataset.csv")
d.head()

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Dietary Preference,Daily Calorie Target,Protein,Sugar,Sodium,...,Dinner Protein.1,Dinner Carbohydrates.1,Dinner Fats,Snack Suggestion,Snacks Calories,Snacks Protein,Snacks Carbohydrates,Snacks Fats,Disease,CVD_Status
0,25,Male,180,80,Moderately Active,Omnivore,2000,120,125.0,24.0,...,60.0,250.0,60.0,Greek yogurt with fruit,150,10,20,5,Weight Gain,Non-CVD
1,32,Female,165,65,Lightly Active,Vegetarian,1600,80,100.0,16.0,...,5.0,31.0,27.0,Apple with almond butter,180,4,30,8,"Weight Gain, Hypertension, Heart Disease",CVD
2,48,Male,175,95,Sedentary,Vegan,2200,100,150.0,20.0,...,55.0,150.0,15.0,Trail mix,300,8,34,20,Weight Gain,Non-CVD
3,55,Female,160,70,Very Active,Omnivore,2500,140,175.0,28.0,...,20.0,163.0,27.0,Banana with peanut butter,210,5,30,9,Weight Gain,Non-CVD
4,62,Male,170,85,Sedentary,Vegetarian,2000,80,125.0,16.0,...,6.3,186.0,22.0,Fruit and nut mix,250,6,28,15,Weight Gain,Non-CVD


In [58]:
d.columns

Index(['Ages', 'Gender', 'Height', 'Weight', 'Activity Level',
       'Dietary Preference', 'Daily Calorie Target', 'Protein', 'Sugar',
       'Sodium', 'Calories', 'Carbohydrates', 'Fiber', 'Fat',
       'Breakfast Suggestion', 'Breakfast Calories', 'Breakfast Protein',
       'Breakfast Carbohydrates', 'Breakfast Fats', 'Lunch Suggestion',
       'Lunch Calories', 'Lunch Protein', 'Lunch Carbohydrates', 'Lunch Fats',
       'Dinner Suggestion', 'Dinner Calories', 'Dinner Protein.1',
       'Dinner Carbohydrates.1', 'Dinner Fats', 'Snack Suggestion',
       'Snacks Calories', 'Snacks Protein', 'Snacks Carbohydrates',
       'Snacks Fats', 'Disease', 'CVD_Status'],
      dtype='object')

In [None]:
# label
'Ages', 'Gender', 'Height', 'Weight', 'Activity Level','Disease'

# target
'CVD_Status'

In [155]:
x1 = d[['Ages', 'Gender', 'Height', 'Weight', 'Activity Level','Disease','CVD_Status']]
x = x1.drop("CVD_Status",axis=1)
x

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Disease
0,25,Male,180,80,Moderately Active,Weight Gain
1,32,Female,165,65,Lightly Active,"Weight Gain, Hypertension, Heart Disease"
2,48,Male,175,95,Sedentary,Weight Gain
3,55,Female,160,70,Very Active,Weight Gain
4,62,Male,170,85,Sedentary,Weight Gain
...,...,...,...,...,...,...
1693,53,Female,182,76,Sedentary,"Diabetes, Acne, Weight Gain, Hypertension, Hea..."
1694,38,Male,150,76,Very Active,Weight Gain
1695,57,Male,165,73,Very Active,Weight Gain
1696,40,Male,166,91,Extremely Active,Weight Gain


In [156]:
y= x1["CVD_Status"]
y

0       Non-CVD
1           CVD
2       Non-CVD
3       Non-CVD
4       Non-CVD
         ...   
1693        CVD
1694    Non-CVD
1695    Non-CVD
1696    Non-CVD
1697    Non-CVD
Name: CVD_Status, Length: 1698, dtype: object

In [157]:
# for i in x.columns:
#     if x[i].dtypes=="object":
#         print(i, x[i].unique())
#         print("================")

In [158]:
x["Gender"] =x["Gender"].map({'Male' :0, "Female":1})
x

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Disease
0,25,0,180,80,Moderately Active,Weight Gain
1,32,1,165,65,Lightly Active,"Weight Gain, Hypertension, Heart Disease"
2,48,0,175,95,Sedentary,Weight Gain
3,55,1,160,70,Very Active,Weight Gain
4,62,0,170,85,Sedentary,Weight Gain
...,...,...,...,...,...,...
1693,53,1,182,76,Sedentary,"Diabetes, Acne, Weight Gain, Hypertension, Hea..."
1694,38,0,150,76,Very Active,Weight Gain
1695,57,0,165,73,Very Active,Weight Gain
1696,40,0,166,91,Extremely Active,Weight Gain


In [159]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# Define the specific order as a list of lists
cat = [['Sedentary', 'Lightly Active', 'Moderately Active', 'Very Active', 'Extremely Active']]

# Pass it to OrdinalEncoder
ord = OrdinalEncoder(categories=cat)

# Transform the data
x[["Activity Level"]] = ord.fit_transform(x[["Activity Level"]])


In [160]:
x[["Activity Level"]] = x[["Activity Level"]].astype("i")

In [161]:
ord2 =OrdinalEncoder()
x[["Disease"]]= ord2.fit_transform(x[["Disease"]])


In [162]:
y.nunique()

2

In [163]:
y = y.map({"Non-CVD":0,"CVD":1})

In [164]:
# model 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x , y , test_size=.2, random_state=42)

In [165]:
clf = LogisticRegression()
clf.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [166]:
clf.score(x_train,y_train)

0.8578792341678939

In [167]:
# predicttion 
input = x.sample(1)
input

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Disease
1152,33,0,167,86,3,9.0


In [168]:
clf.predict(input)

array([0])

In [174]:
y

0       0
1       1
2       0
3       0
4       0
       ..
1693    1
1694    0
1695    0
1696    0
1697    0
Name: CVD_Status, Length: 1698, dtype: int64

In [175]:
(pd.DataFrame(x.iloc[1]).T)

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Disease
1,32.0,1.0,165.0,65.0,1.0,10.0


In [176]:
clf.predict(pd.DataFrame(x.iloc[1]).T)

array([0])

In [177]:
import pickle

pickle.dump(clf,open("clf.pkl","wb"))

### pipline

In [None]:


# Define the specific order as a list of lists
cat = [['Sedentary', 'Lightly Active', 'Moderately Active', 'Very Active', 'Extremely Active']]
x["Gender"] =x["Gender"].map({'Male' :0, "Female":1})

# Pass it to OrdinalEncoder
ord = OrdinalEncoder(categories=cat)

ord2 =OrdinalEncoder()
x[["Disease"]]= ord2.fit_transform(x[["Disease"]])

Pipeline([('ord', OrdinalEncoder(categories=cat)), ("ord2", ),('LR', LogisticRegression())])

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Disease,CVD_Status
0,25,Male,180,80,Moderately Active,Weight Gain,Non-CVD
1,32,Female,165,65,Lightly Active,"Weight Gain, Hypertension, Heart Disease",CVD
2,48,Male,175,95,Sedentary,Weight Gain,Non-CVD
3,55,Female,160,70,Very Active,Weight Gain,Non-CVD
4,62,Male,170,85,Sedentary,Weight Gain,Non-CVD
...,...,...,...,...,...,...,...
1693,53,Female,182,76,Sedentary,"Diabetes, Acne, Weight Gain, Hypertension, Hea...",CVD
1694,38,Male,150,76,Very Active,Weight Gain,Non-CVD
1695,57,Male,165,73,Very Active,Weight Gain,Non-CVD
1696,40,Male,166,91,Extremely Active,Weight Gain,Non-CVD


In [135]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#  Custom order for Activity Level
activity_order = [['Sedentary', 'Lightly Active', 'Moderately Active', 'Very Active', 'Extremely Active']]

# Manually map Gender to numeric (already done)
x["Gender"] = x["Gender"].map({'Male': 0, 'Female': 1})

# ColumnTransformer: apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('activity_ord', OrdinalEncoder(categories=activity_order), ['Activity Level']),
        ('disease_ord', OrdinalEncoder(), ['Disease'])
        # You can add more transformers here for other categorical columns
    ],
    remainder='passthrough'  # keeps other columns like 'Gender'
)

# Pipeline: preprocessing + model
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression())
])

# Example: model_pipeline.fit(X_train, y_train)

In [137]:
x_train

Unnamed: 0,Ages,Gender,Height,Weight,Activity Level,Disease
820,28,1,156,101,2,9.0
497,68,1,160,70,0,10.0
462,35,0,185,95,2,9.0
974,59,0,179,59,0,12.0
631,65,0,192,84,2,12.0
...,...,...,...,...,...,...
1130,60,0,174,77,3,9.0
1294,53,0,189,58,4,9.0
860,57,1,187,103,2,9.0
1459,31,0,172,67,0,9.0


In [136]:
m2 = model_pipeline.fit(x_train, y_train)


ValueError: invalid literal for int() with base 10: 'Sedentary'