In [None]:
# Answer1.
# Exploratory look at the diabetes data: preview, summary statistics,
# correlation heatmap, per-column histograms and a pairplot split by Outcome.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV that sits next to the notebook.
dataset = pd.read_csv('diabetes.csv')

# Text preview: leading rows first, then count/mean/std/quantiles per column.
print(dataset.head())
print(dataset.describe())

# Pairwise correlation between every pair of columns.
correlation_matrix = dataset.corr()

# Annotated heatmap of the correlations on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# One histogram per column; tight_layout keeps the subplot titles from colliding.
dataset.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()

# Scatter matrix of all columns, coloured by the 'Outcome' column.
sns.pairplot(dataset, hue='Outcome')
plt.show()

In [None]:
# Answer2.
# Clean the data: drop rows with missing values, then drop rows that are
# extreme outliers (|z-score| >= 3) in any of the listed numeric columns.

import pandas as pd
from scipy.stats import zscore

# Import the dataset
dataset = pd.read_csv('diabetes.csv')

# Handling missing values
dataset = dataset.dropna()  # Remove rows with missing values
# Alternatively, you can impute missing values
# dataset = dataset.fillna(dataset.mean())  # Replace with mean

# Handling outliers using z-scores
outlier_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
z_scores = zscore(dataset[outlier_columns])
# Compare the absolute z-score so that both tails are filtered: a plain
# `z_scores < 3` keeps every row that is far *below* the mean (z <= -3).
dataset = dataset[(abs(z_scores) < 3).all(axis=1)]  # Keep rows within 3 standard deviations on every column

# Handling categorical variables (if any)
# dataset = pd.get_dummies(dataset, columns=['CategoricalVariable'])  # Convert categorical variables to dummy variables

# Confirm the changes
print(dataset.head())

In [None]:
# Answer3.
# Partition the diabetes data into an 80/20 train/test split with a fixed seed.

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the records from disk.
dataset = pd.read_csv('diabetes.csv')

# NOTE: no cleaning happens in this cell; the rows are split exactly as loaded.

# 'Outcome' is the label column; everything else is treated as a predictor.
y = dataset['Outcome']
X = dataset.drop(columns='Outcome')

# Fixed seed so the same rows land in each partition on every run.
random_seed = 42

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# Report the dimensions of each partition as a sanity check.
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

In [None]:
# Answer4.
# Tune a decision tree with 5-fold grid search on the training split and
# report the accuracy of the best estimator on the held-out test split.

import pandas as pd
from sklearn.metrics import accuracy_score  # required by the evaluation step below
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the decision tree classifier.
# Seeded so that tie-breaking between equally good splits is deterministic and
# the selected hyperparameters / test accuracy are reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)

# Define the hyperparameters to be tuned
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3]
}

# Perform grid search cross-validation
grid_search = GridSearchCV(clf, params, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

In [None]:
# Answer5.
# Tune a decision tree, then evaluate the best estimator on the test split:
# accuracy / precision / recall / F1, a confusion-matrix heatmap and a ROC curve.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the decision tree classifier.
# Seeded so the fitted tree (and every metric below) is reproducible.
clf = DecisionTreeClassifier(random_state=42)

# Define the hyperparameters to be tuned
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3]
}

# Perform grid search cross-validation
grid_search = GridSearchCV(clf, params, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Hard class predictions on the test set (used for the threshold-based metrics)
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Create confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Calculate false positive rate, true positive rate, and area under the ROC curve.
# The ROC curve sweeps a decision threshold over a continuous score, so it must be
# fed the predicted probability of the positive class; passing the 0/1 labels from
# predict() collapses the curve to a single operating point and distorts the AUC.
y_score = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Answer6.
# Fit an untuned decision tree on the training split and render it with Graphviz.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the decision tree classifier.
# Seeded so the rendered tree is the same on every run.
clf = DecisionTreeClassifier(random_state=42)

# Fit the decision tree classifier on the training data
clf.fit(X_train, y_train)

# Visualize the decision tree.
# feature_names is passed as a plain list: some scikit-learn releases validate this
# parameter strictly and reject a pandas Index.
dot_data = export_graphviz(clf, out_file=None, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'],
                           filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # writes the rendered graph to disk under this base name

# Show the decision tree
graph

In [None]:
# Answer7.
# Validate the tuned model on "new" data, then probe it with a one-feature
# sensitivity sweep and two hypothetical what-if scenarios.
# Relies on `best_model` having been created by the Answer5 cell above.

import numpy as np  # needed for np.linspace in the sensitivity sweep
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load new data for testing.
# NOTE: this currently re-reads the file the model was trained on, so the metrics
# below include training rows and will be optimistic.
new_data = pd.read_csv('diabetes.csv')  # Replace 'diabetes.csv' with the actual filename of your new data

# Separate features and target variable
X_new = new_data.drop('Outcome', axis=1)
y_new = new_data['Outcome']

# Predict on new data
y_pred_new = best_model.predict(X_new)

# Calculate evaluation metrics on new data
accuracy_new = accuracy_score(y_new, y_pred_new)
precision_new = precision_score(y_new, y_pred_new)
recall_new = recall_score(y_new, y_pred_new)
f1_new = f1_score(y_new, y_pred_new)

# Print evaluation metrics on new data
print("Accuracy on new data:", accuracy_new)
print("Precision on new data:", precision_new)
print("Recall on new data:", recall_new)
print("F1 Score on new data:", f1_new)

# Create a range of values to vary the input feature:
# 10 evenly spaced values between the observed minimum and maximum.
feature_to_vary = 'Glucose'  # Replace 'Glucose' with the actual feature you want to perform sensitivity analysis on
varying_values = np.linspace(X_new[feature_to_vary].min(), X_new[feature_to_vary].max(), num=10)

# Evaluate the model's predictions with varying feature values
for value in varying_values:
    # Work on a copy so X_new itself is never modified; every row gets the same
    # value for the swept feature while all other features keep their real values.
    X_sensitivity = X_new.copy()
    X_sensitivity[feature_to_vary] = value
    y_pred_sensitivity = best_model.predict(X_sensitivity)
    # Perform desired analysis or print the results
    # For example:
    print("Predicted outcome for {} {}: {}".format(feature_to_vary, value, y_pred_sensitivity))

# Hypothetical scenarios: shift one feature for every row and re-predict.
scenario_1 = X_new.copy()
scenario_1['BMI'] += 5  # Increase BMI by 5 units
scenario_2 = X_new.copy()
scenario_2['Age'] -= 10  # Decrease Age by 10 years

# Evaluate the model's predictions on hypothetical scenarios
y_pred_scenario_1 = best_model.predict(scenario_1)
y_pred_scenario_2 = best_model.predict(scenario_2)
# Perform desired analysis or print the results
# For example:
print("Predicted outcome for scenario 1:", y_pred_scenario_1)
print("Predicted outcome for scenario 2:", y_pred_scenario_2)

