In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

# Generating synthetic data
n_samples = 1000

# Generating features
age = np.random.randint(18, 80, size=n_samples)
gender = np.random.choice(['Male', 'Female'], size=n_samples)
bmi = np.random.uniform(18, 40, size=n_samples)
blood_pressure = np.random.randint(90, 180, size=n_samples)
cholesterol = np.random.randint(120, 300, size=n_samples)
family_history = np.random.choice([0, 1], size=n_samples)
exercise_hours = np.random.randint(0, 24, size=n_samples)
smoking_status = np.random.choice(['Never Smoked', 'Former Smoker', 'Current Smoker'], size=n_samples)
alcohol_consumption = np.random.choice(['None', 'Moderate', 'Heavy'], size=n_samples)
stress_level = np.random.randint(0, 11, size=n_samples)  # Assume stress level ranges from 0 to 10
sleep_duration = np.random.randint(4, 12, size=n_samples)  # Assume sleep duration in hours
fast_food_intake = np.random.randint(0, 4, size=n_samples)  # Assume frequency of fast food intake per week

# Generating target variable (disease presence)
# For simplicity, let's generate a binary target where 1 indicates presence and 0 indicates absence of disease
# You can replace this with your own logic for generating target variable based on other features
X, y = make_classification(n_samples=n_samples, n_features=11, n_classes=2, random_state=42)

# Creating a DataFrame
data = pd.DataFrame({
    'age': age,
    'gender': gender,
    'bmi': bmi,
    'blood_pressure': blood_pressure,
    'cholesterol': cholesterol,
    'family_history': family_history,
    'exercise_hours': exercise_hours,
    'smoking_status': smoking_status,
    'alcohol_consumption': alcohol_consumption,
    'stress_level': stress_level,
    'sleep_duration': sleep_duration,
    'fast_food_intake': fast_food_intake,
    'disease': y
})

# Encoding categorical variables
data = pd.get_dummies(data, columns=['gender', 'smoking_status', 'alcohol_consumption'])

# Saving the dataset to a CSV file
data.to_csv('health_data_extended.csv', index=False)
print(data.head())


   age        bmi  blood_pressure  cholesterol  family_history  \
0   43  31.089262             171          202               1   
1   72  19.243039             111          246               1   
2   77  36.322424             149          161               0   
3   18  32.782892             115          120               1   
4   74  30.318941             130          270               0   

   exercise_hours  stress_level  sleep_duration  fast_food_intake  disease  \
0               9             6               7                 3        0   
1              19            10               6                 1        0   
2               9             2               5                 3        1   
3              17             1               8                 0        0   
4              16             9               4                 2        1   

   gender_Female  gender_Male  smoking_status_Current Smoker  \
0              1            0                              1   
1     

In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv('health_data_extended.csv')

# Perform data cleaning and preprocessing
# Handle missing values
data.dropna(inplace=True)
print(data)

# Handle outliers (if necessary)

# Normalize or standardize features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop(columns=['disease']))
data[data.columns[:-1]] = scaled_features



     age        bmi  blood_pressure  cholesterol  family_history  \
0     43  31.089262             171          202               1   
1     72  19.243039             111          246               1   
2     77  36.322424             149          161               0   
3     18  32.782892             115          120               1   
4     74  30.318941             130          270               0   
..   ...        ...             ...          ...             ...   
995   64  31.806099             148          248               1   
996   24  36.605539             151          216               0   
997   29  33.355212             120          263               0   
998   52  24.241424             155          159               1   
999   70  20.177857             167          126               0   

     exercise_hours  stress_level  sleep_duration  fast_food_intake  disease  \
0                 9             6               7                 3        0   
1                19    

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



data = pd.read_csv('health_data_extended.csv')

# Perform data cleaning and preprocessing
# Handle missing values
data.dropna(inplace=True)

# Normalize or standardize features
scaler = MinMaxScaler()  # Using MinMaxScaler to ensure all features are non-negative
scaled_features = scaler.fit_transform(data.drop(columns=['disease']))
data[data.columns[:-1]] = scaled_features

# Feature selection using chi-squared test
X = data.drop(columns=['disease'])
y = data['disease']
selector = SelectKBest(score_func=chi2, k=5)
selected_features = selector.fit(X, y)
selected_features_indices = selected_features.get_support(indices=True)
selected_features_names = X.columns[selected_features_indices]
X_selected = data[selected_features_names]



In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [5]:
logistic_regression = LogisticRegression()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
svm = SVC()


In [6]:
# Fit the models
logistic_regression.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
svm.fit(X_train, y_train)

In [7]:
# Predictions
lr_predictions = logistic_regression.predict(X_test)
dt_predictions = decision_tree.predict(X_test)
rf_predictions = random_forest.predict(X_test)
svm_predictions = svm.predict(X_test)

# Evaluate model performance
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, lr_predictions))
print("Precision:", precision_score(y_test, lr_predictions))
print("Recall:", recall_score(y_test, lr_predictions))
print("F1 Score:", f1_score(y_test, lr_predictions))
# Evaluate Decision Tree model
print("\nDecision Tree:")
print("Accuracy:", accuracy_score(y_test, dt_predictions))
print("Precision:", precision_score(y_test, dt_predictions))
print("Recall:", recall_score(y_test, dt_predictions))
print("F1 Score:", f1_score(y_test, dt_predictions))

# Evaluate Random Forest model
print("\nRandom Forest:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Precision:", precision_score(y_test, rf_predictions))
print("Recall:", recall_score(y_test, rf_predictions))
print("F1 Score:", f1_score(y_test, rf_predictions))

# Evaluate SVM model
print("\nSVM:")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print("Precision:", precision_score(y_test, svm_predictions))
print("Recall:", recall_score(y_test, svm_predictions))
print("F1 Score:", f1_score(y_test, svm_predictions))


Logistic Regression:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Decision Tree:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

SVM:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [8]:
cv_scores = cross_val_score(logistic_regression, X_selected, y, cv=5)
print("Cross-Validation Scores for Logistic Regression:", cv_scores)
print("Mean CV Score for Logistic Regression:", cv_scores.mean())

Cross-Validation Scores for Logistic Regression: [1. 1. 1. 1. 1.]
Mean CV Score for Logistic Regression: 1.0


In [9]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(random_forest, param_distributions, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)


Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}


In [10]:
# Predictions using the best model
best_model = random_search.best_estimator_
best_model_predictions = best_model.predict(X_test)

# Evaluate the best model
print("Best Model Performance:")
print("Accuracy:", accuracy_score(y_test, best_model_predictions))
print("Precision:", precision_score(y_test, best_model_predictions))
print("Recall:", recall_score(y_test, best_model_predictions))
print("F1 Score:", f1_score(y_test, best_model_predictions))

Best Model Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


User Interface:(optional)
A user-friendly interface that allows users to input their health-related
data and receive predictions about the likelihood of having a particular disease.

In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



data = pd.read_csv('health_data_extended.csv')

# Perform data cleaning and preprocessing
# Handle missing values
data.dropna(inplace=True)

# Normalize or standardize features
scaler = MinMaxScaler()  # Using MinMaxScaler to ensure all features are non-negative
scaled_features = scaler.fit_transform(data.drop(columns=['disease']))
data[data.columns[:-1]] = scaled_features

# Feature selection using chi-squared test
X = data.drop(columns=['disease'])
y = data['disease']
selector = SelectKBest(score_func=chi2, k=5)
selected_features = selector.fit(X, y)
selected_features_indices = selected_features.get_support(indices=True)
selected_features_names = X.columns[selected_features_indices]
X_selected = data[selected_features_names]



# Train a Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

# Define a function to make predictions
def predict_disease(features):
    scaled_features = scaler.transform([features])
    selected_features = scaled_features[:, selected_features_indices]
    prediction = random_forest.predict(selected_features)[0]
    return prediction

# User Interface
print("Welcome to the Disease Prediction System!")
print("Please enter your health-related data:")

age = int(input("Enter your age: "))
gender = input("Enter your gender (Male/Female): ")
bmi = float(input("Enter your BMI: "))
blood_pressure = int(input("Enter your blood pressure: "))
cholesterol = int(input("Enter your cholesterol level: "))
family_history = int(input("Do you have a family history of the disease? (0 for No, 1 for Yes): "))
exercise_hours = int(input("Enter your weekly exercise hours: "))
smoking_status = input("Enter your smoking status (Never Smoked/Former Smoker/Current Smoker): ")
alcohol_consumption = input("Enter your alcohol consumption level (None/Moderate/Heavy): ")
stress_level = int(input("Enter your stress level (0-10): "))
sleep_duration = int(input("Enter your average sleep duration (hours): "))
fast_food_intake = int(input("Enter your weekly frequency of fast food intake: "))

features = [age, bmi, blood_pressure, cholesterol, family_history, exercise_hours, stress_level, sleep_duration, fast_food_intake]

# Convert categorical inputs to one-hot encoded format
gender_male = 1 if gender.lower() == 'male' else 0
gender_female = 1 if gender.lower() == 'female' else 0

smoking_status_never = 1 if smoking_status.lower() == 'never smoked' else 0
smoking_status_former = 1 if smoking_status.lower() == 'former smoker' else 0
smoking_status_current = 1 if smoking_status.lower() == 'current smoker' else 0

alcohol_none = 1 if alcohol_consumption.lower() == 'none' else 0
alcohol_moderate = 1 if alcohol_consumption.lower() == 'moderate' else 0
alcohol_heavy = 1 if alcohol_consumption.lower() == 'heavy' else 0

features += [gender_male, gender_female, smoking_status_never, smoking_status_former, smoking_status_current, alcohol_none, alcohol_moderate, alcohol_heavy]

# Make prediction
prediction = predict_disease(features)

# Output prediction
if prediction == 1:
    print("\nBased on the provided data, you are predicted to have the disease.")
else:
    print("\nBased on the provided data, you are predicted to not have the disease.")


Welcome to the Disease Prediction System!
Please enter your health-related data:

Based on the provided data, you are predicted to not have the disease.


