<a href="https://colab.research.google.com/github/codehacker4655/ML-with-theory-inform-of-comments-/blob/main/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import make_classification

# Synthetic two-class dataset: 1000 samples with 3 features, one of which is
# redundant. The fixed random_state makes the generated data reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=3,
    n_redundant=1,
    n_classes=2,
    random_state=999,
)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the samples for evaluation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# KNN classifier that votes among the 5 nearest training points.
# algorithm="auto" leaves the choice of neighbour-search method to scikit-learn.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    algorithm="auto",
)

# Kept as the last expression so the notebook displays the fitted estimator.
classifier.fit(x_train, y_train)

In [12]:
# Predicted class labels for the held-out classification test set.
y_pred = classifier.predict(x_test)

In [13]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [14]:
# Report the confusion matrix, overall accuracy, and the per-class
# precision/recall/F1 table, one after another (each on its own line).
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[98  5]
 [12 85]]
0.915
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       103
           1       0.94      0.88      0.91        97

    accuracy                           0.92       200
   macro avg       0.92      0.91      0.91       200
weighted avg       0.92      0.92      0.91       200



# Regression

In [15]:
from sklearn.datasets import make_regression

# Synthetic regression dataset: 1000 samples, 2 features, noisy target
# (noise=10), reproducible via random_state.
# NOTE: this rebinds X and y, replacing the classification dataset above.
X, y = make_regression(
    n_samples=1000,
    n_features=2,
    noise=10,
    random_state=42,
)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the regression samples for evaluation.
# NOTE: y_train / y_test are rebound here and no longer hold the
# classification labels from the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [17]:
from sklearn.neighbors import KNeighborsRegressor

In [18]:
# KNN regressor that uses the 6 nearest training points for each prediction.
# algorithm="auto" leaves the choice of neighbour-search method to scikit-learn.
regressor = KNeighborsRegressor(
    n_neighbors=6,
    algorithm="auto",
)

# Kept as the last expression so the notebook displays the fitted estimator.
regressor.fit(X_train, y_train)

In [19]:
# Predicted target values for the held-out regression test set.
# NOTE: y_pred is rebound here, replacing the classifier's predictions.
y_pred = regressor.predict(X_test)

In [20]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [21]:
# Report R^2, mean absolute error, and mean squared error on the test set,
# each on its own line.
print(
    r2_score(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    sep="\n",
)

0.9189275159979495
9.009462452972217
127.45860414317289


In [6]:
#In KNeighborsClassifier, p=2 selects Euclidean distance; setting p=1 selects Manhattan distance.
#The algorithm parameter can be "ball_tree", "kd_tree", or "brute"; if it is set to "auto", the estimator chooses the algorithm based on the dataset passed to fit.

In [7]:
#weights="uniform": uniform weights. All points in each neighborhood are weighted equally.
#weights="distance": weight points by the inverse of their distance. In this case, closer neighbors of a query point have a greater influence than neighbors that are farther away.

In [8]:
# ================================================================
# 📌 MACHINE LEARNING ALGORITHMS – QUICK NOTES
# ================================================================

# ------------------------------------------------
# 1. LINEAR REGRESSION
# ------------------------------------------------
# Type: Supervised Learning (Regression)
# Goal: Predict continuous output (y) from input features (X).
# Cost Function: Mean Squared Error (MSE)
# Training: Gradient Descent updates coefficients (w, b).
# Regularization: Can use Ridge (L2), Lasso (L1), ElasticNet.
# ------------------------------------------------


# ------------------------------------------------
# 2. LOGISTIC REGRESSION
# ------------------------------------------------
# Type: Supervised Learning (Classification)
# Goal: Classify data into categories (binary/multiclass).
# Hypothesis: Sigmoid(w·x + b) → gives probability.
# Cost Function: Log Loss (Cross-Entropy).
# Training: Gradient Descent.
# Weakness: Sensitive to outliers, overlapping classes.
# ------------------------------------------------


# ------------------------------------------------
# 3. SUPPORT VECTOR MACHINE (SVM / SVC)
# ------------------------------------------------
# Type: Supervised Learning (Classification)
# Goal: Find hyperplane that maximizes margin between classes.
# Key Idea: Support vectors = closest points that decide boundary.
# Cost Function: Hinge Loss + Regularization
#     J(w,b) = 1/2 ||w||^2 + C Σ max(0, 1 - y_i(w·x_i+b))
# Training:
#     - For small data → Quadratic Programming (QP) solves exactly.
#     - For large data → SGD on hinge loss (like logistic regression).
# Why Margin? → Larger margin → better generalization, less overfitting.
# ------------------------------------------------


# ------------------------------------------------
# 4. NAÏVE BAYES
# ------------------------------------------------
# Type: Supervised Learning (Classification, probabilistic).
# Based on Bayes’ Theorem:
#     P(Y|X1,X2,...,Xn) = [P(Y) * Π P(Xi|Y)] / P(X1,...,Xn)
# Assumption: Features are conditionally independent given Y.
# Advantage: Simple, fast, works well for text classification.
# Limitation: Performs poorly if features are strongly correlated
#             (double counting evidence).
# Variants:
#     - Gaussian NB: when features are continuous (assume Gaussian dist).
#     - Multinomial NB: when features are counts (text, word freq).
#     - Bernoulli NB: when features are binary (text presence/absence).
# Training Process:
#     - Estimate P(Y) from frequency of labels.
#     - Estimate P(Xi|Y) from frequency/likelihood of features per class.
#     - Use Bayes’ Rule to predict class with max probability.
# ------------------------------------------------


# ------------------------------------------------
# 5. K-NEAREST NEIGHBORS (KNN)
# ------------------------------------------------
# Type: Supervised Learning (Classification/Regression).
# Goal: Predict label based on majority vote (classification)
#       or average (regression) of K nearest neighbors.
# Distance Metric: Usually Euclidean distance.
# Training: No explicit training (lazy learner).
# Prediction:
#     - Compute distance from query point to all training points.
#     - Select K nearest neighbors.
#     - Majority vote (classification) or mean (regression).
# Weakness:
#     - Slow for large datasets (computes distance to all points).
#     - Sensitive to scaling (need normalization).
# ------------------------------------------------


# ================================================================
# 📊 WHEN TO USE WHICH?
# ================================================================
# - Linear Regression → Predict continuous outcomes.
# - Logistic Regression → Classification with linearly separable classes.
# - SVM → When classes overlap slightly but margin helps generalization.
# - Naïve Bayes → Text classification (spam, sentiment), high-dimensional sparse data.
# - KNN → Small datasets, non-linear boundaries, easy to implement.
#
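# Regularization (Ridge, Lasso, ElasticNet):
#     → Only applicable when algorithm has coefficients/weights (like LR, Logistic, SVM).
#     → Not used in lazy learners like KNN or in Naïve Bayes (probabilistic).
# GridSearchCV / RandomizedSearchCV are hyperparameter-search tools, not regularizers:
#     → They work with any estimator, e.g. tuning K for KNN or C for SVM.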
# ================================================================

In [9]:
# ==============================
# 🔑 Do Naïve Bayes models have parameters?
# ==============================
# - In Logistic Regression, Linear Regression, SVM → we learn parameters (weights w and bias b).
# - In Naïve Bayes → we DO NOT learn weights.
# - Instead, we estimate probabilities from training data:
#     1. Class Prior:
#        P(Y=c) = (# samples with class c) / (total samples)
#     2. Likelihood of features given class:
#        - For categorical features:
#          P(X_i = x | Y=c) = count(X_i=x and Y=c) / count(Y=c)
#        - For continuous features (Gaussian NB):
#          Estimate mean (μ_c) and variance (σ_c²) per class.
# - So in Naïve Bayes, the "parameters" are probability tables (categorical)
#   or mean & variance (continuous).
# - ✅ There are no learned weights like (w,b).

# ==============================
# 🟨 Naïve Bayes Training Process
# ==============================
# 1. Bayes’ Theorem:
#    P(Y|X_1, X_2, …, X_n) = [ P(Y) * P(X_1, X_2, …, X_n | Y) ] / P(X_1, X_2, …, X_n)
#
# 2. Naïve Independence Assumption:
#    P(X_1, X_2, …, X_n | Y) ≈ Π P(X_i | Y)
#
# 3. Training = Probability Estimation (NO cost function, NO gradient descent):
#    - Class Prior:
#        P(Y=c) = (# samples in class c) / (total samples)
#    - Feature Likelihood:
#        P(X_i=x | Y=c) = count(X_i=x in class c) / (# samples in class c)
#
# 4. Prediction:
#    For a new sample (x1, x2, …, xn):
#    Compute Posterior Probability:
#        P(Y=c | X) ∝ P(Y=c) * Π P(X_i=xi | Y=c)
#    Choose class with highest posterior.

# ✅ Key Notes:
# - No cost function
# - No gradient descent
# - Just frequency/probability estimation
# - Very fast compared to regression/SVM

# ==============================
# 📌 Algorithm Comparison
# ==============================
# 1. Linear Regression
#    - Type: Supervised (Regression)
#    - Goal: Predict continuous values
#    - Parameters: Coefficients (w,b)
#    - Training: Minimize MSE via Gradient Descent
#    - Regularization: Ridge, Lasso, Elastic Net
#
# 2. Logistic Regression
#    - Type: Supervised (Classification)
#    - Goal: Predict probability of class
#    - Parameters: Coefficients (w,b)
#    - Training: Minimize Log Loss via Gradient Descent
#    - Weakness: Sensitive to outliers, overlapping data
#
# 3. Support Vector Machine (SVM / SVC)
#    - Type: Supervised (Classification)
#    - Goal: Find hyperplane that maximizes margin
#    - Parameters: (w,b)
#    - Training: Optimize Hinge Loss + Regularization
#    - Key Idea: Only support vectors influence the boundary
#
# 4. Naïve Bayes
#    - Type: Supervised (Classification, Probabilistic)
#    - Goal: Classify using Bayes’ theorem
#    - Parameters: Class priors, conditional probabilities
#    - Training: Estimate probabilities from data
#    - Strengths: Fast, effective in text classification
#    - Weaknesses: Assumes feature independence
#
# 5. K-Nearest Neighbors (KNN)
#    - Type: Supervised (Classification + Regression)
#    - Goal: Predict label from majority vote (Cls) / mean (Reg)
#    - Parameters: None (only hyperparameter K)
#    - Training: No explicit training (lazy learner)
#    - Prediction: Compute distance → pick K nearest → vote/average
#    - Weakness: Slow for large data, sensitive to scaling

# ==============================
# 📝 Summary Table
# ==============================
# Algorithm            | Learning Type         | Parameters Learned        | Training Style
# ----------------------------------------------------------------------------------------
# Linear Regression    | Supervised (Reg)      | Coefficients (w,b)        | Gradient Descent (MSE)
# Logistic Regression  | Supervised (Cls)      | Coefficients (w,b)        | Gradient Descent (Log Loss)
# SVM (SVC)            | Supervised (Cls)      | Coefficients (w,b)        | Hinge Loss Optimization
# Naïve Bayes          | Supervised (Cls)      | Probabilities (priors,cond)| Frequency/Likelihood Estimation
# KNN                  | Supervised (Cls/Reg)  | None (only K)             | Lazy learning, distance-based

In [10]:
# -------------------------------
# 1️⃣ Linear/Logistic Regression
# -------------------------------
#from sklearn.linear_model import LogisticRegression

# Model initialization
#model_lr = LogisticRegression()

# Training (fit)
#model_lr.fit(X_train, y_train)
# Internally:
# 1. Initialize weights w and bias b (zeros or random)
# 2. For each iteration:
#       - Compute predictions using current weights: y_hat = sigmoid(X*w + b)
#       - Calculate loss (log loss)
#       - Compute gradient w.r.t weights and bias
#       - Update weights and bias to reduce loss
# 3. Stop when loss converges or max iterations reached
# Result: model_lr has optimized weights (w) and bias (b)

# -------------------------------
# 2️⃣ Naïve Bayes
# -------------------------------
#from sklearn.naive_bayes import MultinomialNB

# Model initialization
#model_nb = MultinomialNB()

# Training (fit)
#model_nb.fit(X_train, y_train)
# Internally:
# 1. Count how many samples belong to each class
#       => P(class) = count_class / total_samples
# 2. Count how many times each feature occurs in each class
#       => P(feature=value | class) = count(feature=value in class) / count_class
# 3. Store these probabilities (or mean/variance if GaussianNB)
# No gradient descent, no optimization
# Prediction uses Bayes theorem: P(class|X) ∝ P(class) * Π P(feature_i|class)

# -------------------------------
# 3️⃣ K-Nearest Neighbors (KNN)
# -------------------------------
#from sklearn.neighbors import KNeighborsClassifier

# Model initialization
#model_knn = KNeighborsClassifier(n_neighbors=3)

# Training (fit)
#model_knn.fit(X_train, y_train)
# Internally:
# 1. Just store the training data X_train and labels y_train in memory
# 2. No parameters learned, no optimization
# Prediction:
#    - Compute distance from new sample to all training points
#    - Find K nearest neighbors
#    - Take majority vote (classification) or average (regression)

In [11]:
#A step-by-step look at how Naïve Bayes “training” works, contrasted with Linear/Logistic Regression and KNN.


#---

#1️⃣ Linear/Logistic Regression / SVM (Parametric Models)

#Training goal: Find the best parameters (weights w and bias b) that minimize a loss/cost function.

#How it works internally in .fit():

#Initialize weights and bias (maybe zeros or random).

#Iteratively update weights using gradient descent to reduce the loss.

#Stop when convergence is reached (loss change is very small) or max iterations.


#Result: A set of numbers (w, b) that define the model.


#Example:

# Logistic Regression
#from sklearn.linear_model import LogisticRegression

#model = LogisticRegression()
#model.fit(X_train, y_train)  # finds the optimal weights (w) and bias (b)

#After .fit(), the model has learned coefficients, and predictions use:

#\hat{y} = \sigma(w^T X + b)


#---

#2️⃣ Naïve Bayes (Probabilistic; parameters are estimated from counts/statistics, not optimized)

#Training goal: Estimate probabilities from the training data. No weights to optimize.

#Internal process of .fit():

#1. Compute prior probabilities of each class:




#P(\text{class}) = \frac{\text{count of samples in class}}{\text{total samples}}

#2. Compute conditional probabilities of features given the class:

#Categorical features: frequency counts → probability table




#P(\text{feature=value} \mid \text{class}) = \frac{\text{count(feature=value in class)}}{\text{count of class samples}}

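#Continuous features (Gaussian NB): estimate the mean \mu_c and variance \sigma_c^2 of each feature per class, then use the Gaussian density

#P(x_i \mid y=c) = \frac{1}{\sqrt{2\pi\sigma_c^2}} \exp\Big(-\frac{(x_i-\mu_c)^2}{2\sigma_c^2}\Big)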

#Result: Tables of probabilities or statistics (mean, variance).

#Prediction: Use Bayes theorem:


#P(class \mid X) \propto P(class) \prod_i P(feature_i \mid class)

#✅ So training is literally just counting frequencies and calculating stats. No gradient descent, no optimization.

#Example (categorical):

#from sklearn.naive_bayes import MultinomialNB

#model = MultinomialNB()
#model.fit(X_train, y_train)  # counts how many times each word appears in each class

#After .fit(), it has stored:

#P(class=spam) = 0.6
#P(word="offer" | class=spam) = 0.2
#...


#---

#3️⃣ KNN (Lazy learner)

#Training goal: None.

#Internal process of .fit(): Just store all the training data.

#Prediction: Compute distances from the new point to all stored points, pick K nearest neighbors, vote/average.


#from sklearn.neighbors import KNeighborsClassifier

#model = KNeighborsClassifier(n_neighbors=3)
#model.fit(X_train, y_train)  # literally just stores X_train and y_train

#Prediction can be slow because distances are computed at prediction time, not during training.



#---

#✅ Summary Table

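#Model Type          | .fit() action                             | Parameters learned? | Optimization?
#-----------------------------------------------------------------------------------------------------
#Linear/Logistic/SVM | Optimize weights w and bias b             | Yes                 | Yes
#Naïve Bayes         | Count frequencies / compute probabilities | Yes (probabilities) | No
#KNN                 | Store data                                | No                  | No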





#So for Naïve Bayes, training just calculates the probabilities from the training data and stores them. At prediction time, it uses those stored probabilities to classify new samples.


#---

#TODO: add a small illustrative code snippet showing what .fit() does internally for all three models, so the difference can be seen line by line in Python.