In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


# Read the cleaned dataset from disk into a DataFrame
df = pd.read_csv('cleaned_data.csv')

# Preview the leading rows as a quick sanity check of columns and values.
# NOTE(review): BMI values in the preview are in the thousands (e.g. 2389.0),
# presumably BMI x 100 -- confirm the unit before interpreting this column.
first_rows = df.head()
print(first_rows)



   Has_Diabetes  High_BP  Checked_Cho  Told_High_Cho     BMI  Smoker  Stroke  \
0         False     True         True           True  2389.0   False   False   
1         False     True         True           True  2500.0    True   False   
2          True    False         True          False  2468.0    True   False   
3         False    False         True          False  2569.0    True   False   
4          True     True         True           True  2850.0   False   False   

   Heart_Attack  Physical_Health  Recent_Exercise  Avg_Drink  \
0         False              0.0             True        1.0   
1         False              0.0             True        2.0   
2         False              0.0             True        1.0   
3         False             15.0            False        1.0   
4         False              0.0            False        1.0   

   5_or_More_Drinks  Gen_Health  Mental_Health  Diff_Walking    Sex   Age  \
0               0.0         2.0            3.0         Fa

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Read the cleaned dataset
df = pd.read_csv('cleaned_data.csv')

# Separate the label column (y) from the predictor columns (X)
y = df['Has_Diabetes']
X = df.drop(columns='Has_Diabetes')

# Hold out 20% of the rows for testing; the split is stratified on y and
# seeded so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to the test rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression with default settings
model = LogisticRegression().fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test_scaled)

# Report accuracy, then the confusion matrix, then per-class metrics
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(conf_mat)
print(class_report)


Accuracy: 0.9042844444444444
[[25308   140]
 [ 2552   125]]
              precision    recall  f1-score   support

       False       0.91      0.99      0.95     25448
        True       0.47      0.05      0.08      2677

    accuracy                           0.90     28125
   macro avg       0.69      0.52      0.52     28125
weighted avg       0.87      0.90      0.87     28125



In [4]:
import pandas as pd

# Read the cleaned dataset
df = pd.read_csv('cleaned_data.csv')

# Count the rows whose Has_Diabetes flag equals True and those where it equals
# False by summing the element-wise comparison results
diabetes_flag = df['Has_Diabetes']
num_diabetes_true = int(diabetes_flag.eq(True).sum())
num_diabetes_false = int(diabetes_flag.eq(False).sum())

print("Number of people who have stated true for having diabetes:", num_diabetes_true)
print("Number of people who have stated false for having diabetes:", num_diabetes_false)



Number of people who have stated true for having diabetes: 13386
Number of people who have stated false for having diabetes: 127237


In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('cleaned_data.csv')

# Define features and target
X = df.drop('Has_Diabetes', axis=1)
y = df['Has_Diabetes']

# Split the data. stratify=y keeps the True/False ratio of Has_Diabetes the
# same in the training and test partitions, and matches the stratified split
# used by the modelling cells of this notebook so results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample with SMOTE on the TRAINING partition only; X_test / y_test are
# left untouched so evaluation still reflects the original class distribution.
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours, so
# boolean / categorical columns may end up with in-between values; SMOTENC is
# the variant meant for mixed feature types -- confirm which one fits here.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# X_train_resampled and y_train_resampled now hold the oversampled training
# data. Fit the scaler and the model on these, then evaluate on the untouched
# X_test / y_test.


In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Read the cleaned dataset
df = pd.read_csv('cleaned_data.csv')

# Separate the label column (y) from the predictor columns (X)
y = df['Has_Diabetes']
X = df.drop(columns='Has_Diabetes')

# Stratified, seeded 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features: fit the scaler on the training rows only, then
# apply that fitted scaler to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression with default settings
model = LogisticRegression().fit(X_train_scaled, y_train)

# Predict the held-out labels and report plain accuracy
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Bar chart of how many rows fall into each Has_Diabetes class
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Has_Diabetes', data=df, ax=ax)
ax.set_title('Distribution of Diabetes')
ax.set_xlabel('Has Diabetes')
ax.set_ylabel('Count')
plt.show()




# One histogram (20 bins, KDE overlay) per selected column, side by side
numerical_features = ['BMI', 'Age', 'Income']
fig, axes = plt.subplots(1, len(numerical_features), figsize=(12, 6), squeeze=False)
# squeeze=False always yields a 2-D grid, so axes[0] is the single row of panels
for ax, feature in zip(axes[0], numerical_features):
    sns.histplot(df[feature], bins=20, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# ROC curve of the fitted classifier on the held-out rows
from sklearn.metrics import roc_curve, auc

# Score each test row with the probability in column 1 of predict_proba,
# i.e. the second entry of model.classes_
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

# Confusion Matrix Heatmap
# Pass the label order explicitly so the matrix rows (true class) and columns
# (predicted class) are guaranteed to follow class_labels, and show those
# labels on the ticks instead of bare 0/1 positions.
class_labels = model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,   # write the count inside each cell
    cmap='Blues',
    fmt='g',      # general number format, avoids scientific notation for counts
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


ModuleNotFoundError: No module named 'matplotlib'