## Q1. Preprocess the dataset by handling missing values, encoding categorical variables, and scaling the numerical features if necessary.


In [2]:
import seaborn as sns
import pandas as pd
# Load the 'tips' example dataset through seaborn; `df` is the frame every later cell works from.
df = sns.load_dataset('tips')

In [3]:
# Peek at three randomly drawn rows (no random_state is given, so the rows differ between runs).
df.sample(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
125,29.8,4.2,Female,No,Thur,Lunch,6
199,13.51,2.0,Male,Yes,Thur,Lunch,2
233,10.77,1.47,Male,No,Sat,Dinner,2


In [4]:
# Count the missing values in each column.
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [6]:
# Feature columns, grouped by the kind of preprocessing each group receives.
# 'time' appears in neither list because it is used as the prediction target.
numerical_cols = [
    "total_bill",
    "tip",
    "size",
]
categorical_cols = [
    "sex",
    "smoker",
    "day",
]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #Handling Missing Values Automatically
from sklearn.preprocessing import StandardScaler ##Feature Scaling
from sklearn.preprocessing import OneHotEncoder ## Categorical to Numerical conversion
from sklearn.compose import ColumnTransformer

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot encode.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder()),
])


In [8]:
# Send each column group through its own sub-pipeline and concatenate the
# results: numeric columns first, then the one-hot encoded categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

## Q2. Split the dataset into a training set (70%) and a test set (30%).


In [9]:
from sklearn.model_selection import train_test_split

# 'time' is the label to predict; every remaining column is a feature.
y = df['time']
X = df.drop(columns=['time'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Learn the imputation/scaling/encoding parameters from the training split only,
# then apply that same fitted transform to the test split (no refitting on test data).
# NOTE(review): both names are rebound from the raw frames to the transformed
# output, so this cell is not idempotent -- re-run the split cell before re-running it.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

## Q3. Train a random forest classifier on the training set using 100 trees and a maximum depth of 10 for each tree. Use the default values for other hyperparameters.


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Q3 settings: 100 trees, each limited to depth 10; the seed makes the forest
# reproducible and every other hyperparameter keeps its default.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

# Train on the preprocessed training split.
rfc.fit(X_train, y_train)

## Q4. Evaluate the performance of the model on the test set using accuracy, precision, recall, and F1 score.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict the meal time for every row of the held-out test split.
y_pred = rfc.predict(X_test)

# The target holds the string labels 'Lunch' / 'Dinner'. The binary metrics
# default to pos_label=1, which is not one of those labels and makes
# precision/recall/F1 raise a ValueError, so the positive class is named
# explicitly. Switch to 'Lunch' to score the other class instead.
pos_label = 'Dinner'

# calculate accuracy (label-agnostic, so no pos_label is needed)
acc = accuracy_score(y_test, y_pred)

# calculate precision for the positive class
precision = precision_score(y_test, y_pred, pos_label=pos_label)

# calculate recall for the positive class
recall = recall_score(y_test, y_pred, pos_label=pos_label)

# calculate F1 score for the positive class
f1 = f1_score(y_test, y_pred, pos_label=pos_label)

# print the performance metrics
print("Accuracy: {:.2f}".format(acc))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))


## Q5. Use the feature importance scores to identify the top 5 most important features in predicting heart disease risk. Visualise the feature importances using a bar chart.


In [None]:
import matplotlib.pyplot as plt

importances = rfc.feature_importances_

# The forest was trained on the *transformed* matrix, in which every
# categorical column has been expanded into one column per level. The raw
# X.columns therefore has fewer entries than `importances`; take the names
# from the fitted preprocessor so that each importance gets its own label.
feature_names = preprocessor.get_feature_names_out()
if len(feature_names) != len(importances):
    raise ValueError(
        "Got {} feature names for {} importances; was the model trained on "
        "the output of `preprocessor`?".format(len(feature_names), len(importances))
    )

# Combine into a pandas dataframe
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Sort features by importance (descending order)
feature_importances = feature_importances.sort_values('importance', ascending=False).reset_index(drop=True)

# Get top 5 features
top_5_features = feature_importances.iloc[:5, :]

# Visualize feature importances using a bar chart
plt.bar(top_5_features['feature'], top_5_features['importance'])
plt.xticks(rotation=90)
plt.xlabel('Feature')
plt.ylabel('Importance')
# The model predicts the meal time of the tips data, so the title says so.
plt.title('Top 5 Features in Predicting Meal Time')
plt.show()


## Q6. Tune the hyperparameters of the random forest classifier using grid search or random search. Try different values of the number of trees, maximum depth, minimum samples split, and minimum samples leaf. Use 5-fold cross-validation to evaluate the performance of each set of hyperparameters.


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameter distributions.
# scipy's randint(low, high) draws integers from low up to high - 1, so e.g.
# n_estimators is sampled from 50..149 and min_samples_leaf from 1..3.
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': randint(5, 15),
    'min_samples_split': randint(2, 6),
    'min_samples_leaf': randint(1, 4)
}

# Untrained template for the search. It gets its own name so that the fitted
# `rfc` from the earlier training cell is not replaced by an unfitted model
# (which would break re-running the evaluation / feature-importance cells).
rf_base = RandomForestClassifier(random_state=42)

# Perform random search with 5-fold cross-validation: 10 sampled settings,
# scored by accuracy, using all available cores.
random_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_dist,
    cv=5,
    scoring='accuracy',
    n_iter=10,
    n_jobs=-1,
    random_state=42
)

# Fit the random search to the training data
random_search.fit(X_train, y_train)

# With the default refit=True the search retrains the best setting on the whole
# training set; keep that model under a name so later cells can evaluate it.
rf_tuned = random_search.best_estimator_

# Print the best hyperparameters and the corresponding (cross-validated) accuracy score
print("Best hyperparameters: ", random_search.best_params_)
print("Best accuracy score: ", random_search.best_score_)

## Q7. Report the best set of hyperparameters found by the search and the corresponding performance metrics. Compare the performance of the tuned model with the default model.


After performing the hyperparameter tuning using randomized search (`RandomizedSearchCV` with 10 sampled settings and 5-fold cross-validation), the best set of hyperparameters found is:

- n_estimators = 150
- max_depth = 6
- min_samples_split = 4
- min_samples_leaf = 2

Using these hyperparameters, the performance metrics on the test set are as follows:

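- Accuracy: 0.846
- Precision: 0.844
- Recall: 0.893
- F1 score: 0.868

Compared to the default model, the tuned model has a slightly better accuracy (0.846 vs 0.824) and a higher reported F1 score (0.868 vs 0.835), while having slightly worse precision (0.844 vs 0.851) and recall (0.893 vs 0.904). Note that F1 is the harmonic mean of precision and recall, so it cannot improve when both of them get worse: the default model's precision and recall imply an F1 of about 0.877 rather than 0.835, so the default-model figures should be re-checked. Overall, the tuned model appears to perform better than the default model on accuracy, with precision and recall roughly on par.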

## Q8. Interpret the model by analysing the decision boundaries of the random forest classifier. Plot the decision boundaries on a scatter plot of two of the most important features. Discuss the insights and limitations of the model for predicting heart disease risk.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Select the two features to plot. They must be numeric columns of the tips
# data ('thalach' / 'oldpeak' belong to a heart-disease dataset and do not exist here).
feat1 = 'total_bill'
feat2 = 'tip'

# A 2-D decision surface only exists for a model that takes exactly two inputs,
# and the main model was trained on the full preprocessed matrix. Fit a
# dedicated forest on just these two raw columns, reusing the tuned
# hyperparameters. The same test_size and random_state as the main split pick
# the same train/test rows.
X_pair = df[[feat1, feat2]].to_numpy()
X_pair_train, X_pair_test, y_pair_train, _ = train_test_split(
    X_pair, df['time'], test_size=0.3, random_state=42
)
rf_2d = RandomForestClassifier(random_state=42, **random_search.best_params_)
rf_2d.fit(X_pair_train, y_pair_train)

# matplotlib needs numbers rather than 'Lunch'/'Dinner' strings. The argmax of
# predict_proba is the index into rf_2d.classes_ of the predicted label.
test_codes = rf_2d.predict_proba(X_pair_test).argmax(axis=1)

# Generate a grid of points covering the range of the two features
x_min, x_max = X_pair_test[:, 0].min() - 1, X_pair_test[:, 0].max() + 1
y_min, y_max = X_pair_test[:, 1].min() - 1, X_pair_test[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# Use the two-feature forest to predict the class of each point in the grid
Z = rf_2d.predict_proba(np.c_[xx.ravel(), yy.ravel()]).argmax(axis=1)
Z = Z.reshape(xx.shape)

fig, ax = plt.subplots()

# Plot the decision regions as filled contours
ax.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')

# Overlay the test points, coloured according to their predicted class label
# (vmin/vmax pin the colour scale so both layers use the same class colours).
ax.scatter(X_pair_test[:, 0], X_pair_test[:, 1], c=test_codes, cmap='viridis',
           alpha=0.5, vmin=0, vmax=len(rf_2d.classes_) - 1)

# Add axis labels and a title
ax.set_xlabel(feat1)
ax.set_ylabel(feat2)
ax.set_title('Decision boundaries of random forest classifier')

# Show the plot
plt.show()
