# ***Quiz-2 Problem Statement***

Objective: Implement a machine learning pipeline that involves loading a dataset, preprocessing the data, performing feature selection (optional), tuning hyperparameters, applying k-fold cross-validation, training a model, and evaluating its performance using either a Naive Bayes or K-Nearest Neighbors (KNN) classifier.

# Dataset Loading:
a .Load the Pima Indians Diabetes dataset from google classroom.

In [None]:
import pandas as pd
# load data from file
df = pd.read_csv("/content/diabetes.csv")

df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# Data Preprocessing:
a. Split the data into training and test sets (80% training, 20% testing).


In [None]:
from sklearn.model_selection import train_test_split

X=df
y=df['Outcome']
data = df.iloc[:, :-1]
x = data

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

b. Standardize the features to have zero mean and unit variance.


In [None]:
from sklearn.preprocessing import StandardScaler

# Assume X is your data with multiple features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Now X_standardized will have features with zero mean and unit variance

In [None]:
# normalization
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler

data = df.iloc[:, :-1]
scaler = zscore(data)


normalized_data = pd.DataFrame(scaler)
normalized_data[:10]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496
5,0.342981,-0.153185,0.253036,-1.288212,-0.692891,-0.811341,-0.818079,-0.27576
6,-0.250952,-1.342476,-0.98771,0.719086,0.071204,-0.125977,-0.676133,-0.616111
7,1.827813,-0.184482,-3.572597,-1.288212,-0.692891,0.419775,-1.020427,-0.360847
8,-0.547919,2.381884,0.046245,1.534551,4.021922,-0.189437,-0.947944,1.681259
9,1.23388,0.128489,1.390387,-1.288212,-0.692891,-4.060474,-0.724455,1.766346


# Feature Selection (Optional):
a. Optionally, apply feature selection techniques such as chi-square/SelectKBest to retain the top features.


In [None]:
# select k best feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from numpy import array

select = SelectKBest(score_func=chi2, k=3)

z = select.fit_transform(x,y)

filter = select.get_support()

feat_labels=x.columns

features = array(feat_labels)
print("All features:")
print(features)

print("Selected best 3:")
print(features[filter])

All features:
['Pregnancies' 'Glucose' 'BloodPressure' 'SkinThickness' 'Insulin' 'BMI'
 'DiabetesPedigreeFunction' 'Age']
Selected best 3:
['Glucose' 'Insulin' 'Age']


# Hyperparameter Tuning:
a. For KNN, tune the number of neighbors (n_neighbors) and the distance metric (metric).

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define the KNN model
knn = KNeighborsClassifier()

# Define the parameter grid
param_grid_knn = {
    'n_neighbors': [3,4,5,6,7,8]
}

# Perform grid search cross-validation
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=5)
grid_search_knn.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters for KNN:", grid_search_knn.best_params_)
print("Best Cross-validation Score for KNN:", grid_search_knn.best_score_)


Best Parameters for KNN: {'n_neighbors': 8}
Best Cross-validation Score for KNN: 0.7444089031054245


b. For Naive Bayes, tune the smoothing parameter (alpha).


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB

knn = KNeighborsClassifier()
nb=MultinomialNB()

param_grid = {
    'n_neighbors': [3,4,5,6,7,8]
} # for KNN
param_nb={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]} # for Naive Bayes

grid_search = GridSearchCV(estimator=nb, param_grid=param_nb, cv=5, n_jobs=1)

grid_search.fit(X_train, y_train)
print("Grid Search Best Parameters:", grid_search.best_params_)
print("Grid Search Best Score:", grid_search.best_score_)

Grid Search Best Parameters: {'alpha': 1e-05}
Grid Search Best Score: 0.806330801012928


# K-Fold Cross-Validation:
a. Use Stratified K-Fold Cross-Validation with 5 folds to evaluate model performance.


In [None]:
from sklearn.model_selection import StratifiedKFold


X=X.values
y=y.values


knn = KNeighborsClassifier(n_neighbors=8)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []

# Split and train model
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    #print(train_index)
    y_train, y_test = y[train_index], y[test_index]
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

average_score = np.mean(scores)
print(f"Average Accuracy: {average_score:.2f}")

Average Accuracy: 0.74


# Training and Testing:
a. Train the selected classifier (Naive Bayes or KNN) using the best hyperparameters identified.

b. Evaluate the model on the test set and report the accuracy.


# Cross Validation


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score



# Convert text data to count features
vectorizer = CountVectorizer()


# Initialize and train Multinomial Naive Bayes model
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Make predictions
y_pred = mnb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 64.05%


In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Make predictions
y_pred = gnb.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 100.00%


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score



# Initialize and train Bernoulli Naive Bayes model
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

# Make predictions
y_pred = bnb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 100.00%


In [None]:
# using k-fold

import numpy as np
from sklearn.model_selection import cross_val_score



# gausian
scores = cross_val_score(gnb, X, y, cv=10)
print(f'Cross Validation Scores: {scores}')
print(f'Mean CV Score: {np.mean(scores)}\n\n')

# multinomial
scores = cross_val_score(mnb, X, y, cv=10)
print(f'Cross Validation Scores: {scores}')
print(f'Mean CV Score: {np.mean(scores)}\n\n')

# burnoli
scores = cross_val_score(bnb, X, y, cv=10)
print(f'Cross Validation Scores: {scores}')
print(f'Mean CV Score: {np.mean(scores)}')

Cross Validation Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean CV Score: 1.0


Cross Validation Scores: [0.67532468 0.62337662 0.72727273 0.51948052 0.66233766 0.64935065
 0.67532468 0.64935065 0.59210526 0.56578947]
Mean CV Score: 0.6339712918660287


Cross Validation Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean CV Score: 1.0
