In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset from the CSV file located in the working directory
data = pd.read_csv('diabetes.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line after
# the report.
print("Dataset Information:")
data.info()

# Display the first few rows of the dataset
print("\nSample Data:")
print(data.head())

# Handle missing values
# SimpleImputer only fills NaN entries by default. When a dataset has no NaN
# values but stores unrecorded measurements as literal zeros, mean-imputing NaN
# alone changes nothing, so the zeros are converted to NaN first (see below).
imputer = SimpleImputer(strategy='mean')

# Specify the columns for imputation
selected_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Measurements in which a value of 0 is assumed to be a placeholder for
# "not recorded" rather than a genuine reading.
# NOTE(review): confirm this convention against the dataset's documentation.
zero_as_missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Split the data into features (X) and target variable (y)
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and testing sets *before* any statistics are
# learned, so that the imputation means (and the scaler below) are estimated
# from the training rows only and nothing about the test rows leaks into them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check if the specified columns exist in the dataset
if all(column in X.columns for column in selected_columns):
    # Work on explicit copies so the assignments below never write through to `data`.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Turn the zero placeholders into NaN so the imputer can actually see them.
    for features in (X_train, X_test):
        features[zero_as_missing_columns] = features[zero_as_missing_columns].replace(0, np.nan)

    # Learn the column means on the training split, then apply them to both splits.
    X_train[selected_columns] = imputer.fit_transform(X_train[selected_columns])
    X_test[selected_columns] = imputer.transform(X_test[selected_columns])
else:
    # Best-effort behaviour: report the problem and continue without imputation.
    print("\nSpecified columns not found in the dataset.")

# Standardize features (fit on the training split only; returns NumPy arrays)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a Random Forest Classifier (fixed seed so repeated runs give the same forest)
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the evaluation results
print("\nModel Evaluation:")
print(f'Accuracy: {accuracy:.4f}\n')
# The confusion matrix was computed but never shown; print it so the
# per-class error counts are visible alongside the aggregate metrics.
# Rows are the true classes, columns are the predicted classes.
print('Confusion Matrix:')
print(conf_matrix)
print()
print('Classification Report:')
print(classification_rep)

# Interpretation: fixed narrative describing what this analysis does.
summary_text = "The provided Python code uses a Random Forest classifier to build a machine learning model for classifying patients based on their risk of developing diabetes. The dataset includes features such as pregnancies, glucose levels, blood pressure, skin thickness, insulin, BMI, diabetes pedigree function, and age. The model is trained, evaluated, and its performance is assessed using metrics such as accuracy, confusion matrix, and classification report. The goal is to assist healthcare professionals in identifying patients at elevated risk for targeted intervention and preventive measures, ultimately improving outcomes in diabetes management."
# Heading and body are emitted on consecutive lines, preceded by one blank line.
print("\nSummary:-", summary_text, sep="\n")

# Prediction: fixed narrative on how the trained model could be used.
forecast_text = "For future patient data, the model can be utilized to predict the likelihood of diabetes, thereby aiding healthcare professionals in targeted intervention strategies and preventive measures."
print("\nForecast:", forecast_text, sep="\n")


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

Sample Data:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29   