<a href="https://colab.research.google.com/github/gunabalan-0411/Machine-Learning/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## MNIST Dataset

In [None]:
from sklearn.datasets import fetch_openml
# real_life, toy, synthetic datasets
# fetch_*, load_*, make_*

# Fetch the 'mnist_784' dataset from OpenML with as_frame=False, then keep
# the feature matrix and the label vector under short names.
mnist = fetch_openml('mnist_784', as_frame=False)
X = mnist.data
y = mnist.target

print(X.shape)



In [None]:
# printing the image
import matplotlib.pyplot as plt

def show_img(image_2d):
  """Draw one sample as a 28x28 picture on the current matplotlib axes.

  `image_2d` must hold exactly 784 values (it is reshaped to 28x28).
  Nothing is returned; the image is drawn with the 'binary' colormap.
  """
  pixels = image_2d.reshape(28, 28)
  plt.imshow(pixels, cmap='binary')

# Print the label of one sample and draw it, so the two can be compared by eye.
sample_index = 7
print("It is", y[sample_index])
show_img(X[sample_index])

In [None]:
# Simple classification
from sklearn.linear_model import  SGDClassifier

# Split features and labels at row 60000 into training and test parts.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# Binary targets: True where the label equals the string '5'.
y_train_5 = (y_train == '5')
y_test_5 = (y_test == '5')

# Seed the classifier so that the fitted model, and every score computed from
# it in later cells, is the same on each Restart & Run All. This matches the
# seeded SGDClassifier used in the multiclass section.
model = SGDClassifier(random_state=42)
model.fit(X_train, y_train_5)



In [None]:
# Compare the binary model's answer for the first sample with its true label.
first_prediction = model.predict([X[0]])
print("Is 5?", first_prediction)
print(f"actual: {y[0]}")

In [None]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the "is it a 5?" model on the training set.
cvs = cross_val_score(
    model,
    X_train,
    y_train_5,
    scoring='accuracy',
    cv=3,
)
print(cvs)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline classifier, to put the accuracy of the real model in context.
dummy_model = DummyClassifier()
dummy_model.fit(X_train, y_train_5)

# True only if the baseline predicts at least one sample as a 5.
dummy_predictions = dummy_model.predict(X_train)
print(any(dummy_predictions))

cvs = cross_val_score(
    dummy_model,
    X_train,
    y_train_5,
    scoring='accuracy',
    cv=3,
)
print(cvs)

In [None]:
from sklearn.model_selection import cross_val_predict

# Cross-validated decision-function scores (3 folds) instead of class
# predictions; the curve cells below consume these scores.
y_scores = cross_val_predict(
    model,
    X_train,
    y_train_5,
    cv=3,
    method="decision_function",
)
print(y_scores)

In [None]:
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

# Single decision threshold to highlight on the plot. The previous code used
# the whole `thresholds` array here, which drew one vertical line per
# threshold and always marked index 0. Adjust this value as needed.
threshold = 3000

plt.figure(figsize=(8, 4))  # extra code – it's not needed, just formatting
# precisions / recalls are sliced with [:-1] so they line up with `thresholds`.
plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
plt.vlines(threshold, 0, 1.0, "k", "dotted", label="threshold")

# extra code – this section just beautifies the figure
# Mark the precision and recall at the first threshold ≥ the chosen one.
# (argmax returns 0 if no threshold is that large.)
idx = (thresholds >= threshold).argmax()
plt.plot(thresholds[idx], precisions[idx], "bo")
plt.plot(thresholds[idx], recalls[idx], "go")
plt.axis([-50000, 50000, 0, 1])
plt.grid()
plt.xlabel("Threshold")
plt.legend(loc="center right")

plt.show()

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# roc_curve returns (fpr, tpr, thresholds) in that order; the names were
# previously swapped, so `tpr` held the false-positive rate and vice versa.
# Note: this rebinds `thresholds`, which the precision/recall cell also uses.
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

# Area under the ROC curve, shown as the cell output.
roc_auc_score(y_train_5, y_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Cross-validated class probabilities (3 folds) from a seeded random forest.
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(
    forest_clf,
    X_train,
    y_train_5,
    method="predict_proba",
    cv=3,
)


In [None]:
# Column 1 of the probability matrix is used as the forest's score
# (presumably the probability of the True class – confirm via classes_).
y_scores_forest = y_probas_forest[:, 1]
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(
    y_train_5,
    y_scores_forest,
)

# Precision against recall for both models on one set of axes.
plt.figure(figsize=(6, 5))
plt.plot(
    recalls_forest,
    precisions_forest,
    "b-",
    label="Random Forest",
    linewidth=2,
)
plt.plot(recalls, precisions, "--", label="SGD", linewidth=2)

# Cosmetics only: axis limits, labels, grid and legend.
plt.axis([0, 1, 0, 1])
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid()
plt.legend(loc="lower left")

plt.show()

## Multiclass Classification

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier on the first 2000 training samples only,
# using the original multi-digit labels.
n_svm_samples = 2000
svm_clf = SVC(random_state=42)
svm_clf.fit(X_train[:n_svm_samples], y_train[:n_svm_samples])

In [None]:
# Predicted class and decision-function scores for training sample 2.
some_digit = [X_train[2]]
print(svm_clf.predict(some_digit))
some_digit_scores = svm_clf.decision_function(some_digit)
some_digit_scores.round(2)

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Same experiment with the SVC wrapped in an explicit one-vs-rest strategy,
# again fitted on the first 2000 training samples.
ovr = OneVsRestClassifier(SVC(random_state=42))
ovr.fit(X_train[:2000], y_train[:2000])

ovr_sample = [X_train[2]]
print(ovr.predict(ovr_sample))
ovr_digit_scores = ovr.decision_function(ovr_sample)
ovr_digit_scores.round(2)


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Multiclass SGD classifier trained on the full training set.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

# Prediction and decision-function scores for training sample 2.
digit_batch = [X_train[2]]
print(sgd_clf.predict(digit_batch))
digit_scores = sgd_clf.decision_function(digit_batch)
print(digit_scores.round(2))

# 3-fold cross-validated accuracy.
cvs = cross_val_score(
    sgd_clf,
    X_train,
    y_train,
    scoring="accuracy",
    cv=3,
)
print(cvs)



In [None]:
# evaluate the result
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions (3 folds) for every training sample.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

# Both matrices share the same normalization and percentage formatting.
cm_options = dict(normalize="true", values_format=".0%")

ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, **cm_options)
plt.show()

# Weight correct predictions 0 and wrong ones 1, so only the errors
# contribute to the second matrix and stand out.
sample_weight = (y_train != y_train_pred)
ConfusionMatrixDisplay.from_predictions(
    y_train,
    y_train_pred,
    sample_weight=sample_weight,
    **cm_options,
)
plt.show()

In [None]:
# Split the training images of two classes by (true label, predicted label).
# Naming: X_<true><pred>, e.g. X_ab holds true cl_a predicted as cl_b.
cl_a, cl_b = '3', '5'

true_a, true_b = (y_train == cl_a), (y_train == cl_b)
pred_a, pred_b = (y_train_pred == cl_a), (y_train_pred == cl_b)

X_aa = X_train[true_a & pred_a]
X_ab = X_train[true_a & pred_b]
X_ba = X_train[true_b & pred_a]
X_bb = X_train[true_b & pred_b]

In [None]:
# Draw a 2x2 arrangement of digit thumbnails: each quadrant shows up to
# size*size images from one of the four (true, predicted) subsets.
size = 5
pad = 0.2
plt.figure(figsize=(size, size))
for images, (label_col, label_row) in [(X_ba, (0, 0)), (X_bb, (1, 0)),
                                       (X_aa, (0, 1)), (X_ab, (1, 1))]:
    for idx, image_data in enumerate(images[:size*size]):
        # Named left/bottom rather than x/y so the notebook-level label
        # array `y` is not overwritten by a loop variable.
        left = idx % size + label_col * (size + pad)
        bottom = idx // size + label_row * (size + pad)
        plt.imshow(image_data.reshape(28, 28), cmap="binary",
                   extent=(left, left + 1, bottom, bottom + 1))
# Tick labels sit at the centre of each quadrant.
plt.xticks([size / 2, size + pad + size / 2], [str(cl_a), str(cl_b)])
plt.yticks([size / 2, size + pad + size / 2], [str(cl_b), str(cl_a)])
# Dotted separators between the quadrants.
plt.plot([size + pad / 2, size + pad / 2], [0, 2 * size + pad], "k:")
plt.plot([0, 2 * size + pad], [size + pad / 2, size + pad / 2], "k:")
plt.axis([0, 2 * size + pad, 0, 2 * size + pad])
plt.xlabel("Predicted label")
plt.ylabel("True label")

plt.show()

In [None]:
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC

# Classifier chain built on SVC; it is only constructed here, not fitted.
chain_clf = ClassifierChain(
    SVC(),
    cv=3,
    random_state=42,
)

In [None]:
import numpy as np

# Build a denoising task: inputs are images plus random integer noise,
# targets are the original clean images.
rng = np.random.default_rng(seed=42)  # to make this code example reproducible
n_pixels = 784

# Draw order matters for reproducibility: training noise first, then test.
noise_train = rng.integers(0, 100, (len(X_train), n_pixels))
noise_test = rng.integers(0, 100, (len(X_test), n_pixels))

X_train_mod = X_train + noise_train
X_test_mod = X_test + noise_test
y_train_mod = X_train
y_test_mod = X_test

# Side by side: the noise, the clean image, and their sum.
for position, picture in ((131, noise_train[0]),
                          (132, X_train[0]),
                          (133, X_train_mod[0])):
    plt.subplot(position)
    show_img(picture)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train k-NN to map noisy images to clean ones, then clean the first
# noisy training image and draw the result.
kn_clf = KNeighborsClassifier()
kn_clf.fit(X_train_mod, y_train_mod)

noisy_digit = X_train_mod[0]
clean_img = kn_clf.predict([noisy_digit])
show_img(clean_img)

## 1. An MNIST Classifier With Over 97% Accuracy

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: default k-NN fitted on the full training set, scored on the test set.
# fit() returns the estimator itself, so construction and fitting are chained.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
base_line_acc = knn_clf.score(X_test, y_test)
base_line_acc

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over the weighting scheme and k = 1..6 with 3-fold cross-validation.
param_grid = dict(
    weights=['uniform', 'distance'],
    n_neighbors=list(range(1, 7)),
)

# Only the first 10000 training samples are used for the search.
grid_cv = GridSearchCV(knn_clf, param_grid=param_grid, cv=3)
grid_cv.fit(X_train[:10000], y_train[:10000])

In [None]:
# Hyperparameter combination selected by the grid search.
grid_cv.best_params_

In [None]:
# Cross-validated score achieved by that best combination.
grid_cv.best_score_

In [None]:
# Refit the winning estimator on the full training set (the search itself
# only saw the first 10000 samples), then score it on the test set.
best_knn = grid_cv.best_estimator_
best_knn.fit(X_train, y_train)
tuned_accuracy = grid_cv.score(X_test, y_test)
tuned_accuracy

## Data Augmented Training

In [None]:
from scipy.ndimage import shift

def shift_image(img, dy, dx):
  """Return a flattened copy of a 28x28 image moved by (dy, dx) pixels.

  `img` must hold 784 values. It is reshaped to 28x28, passed to
  scipy.ndimage.shift with the offset (dy, dx) (first axis, second axis),
  and the result is flattened back to one dimension.
  """
  grid = img.reshape((28, 28))
  return shift(grid, (dy, dx)).reshape(-1)

# Visual check of shift_image: one training image before and after a
# (2, 2) shift.
Random_Image = X_train[1000]
Random_Image_Moved = shift_image(Random_Image, 2, 2)

plt.figure(figsize=(6, 4))
panels = (
    (121, "Original Image", Random_Image),
    (122, "shifted to down right", Random_Image_Moved),
)
for position, title, pixels in panels:
    plt.subplot(position)
    plt.title(title)
    plt.imshow(pixels.reshape([28, 28]), cmap="Greys")

plt.show()



In [None]:
# Start from the original samples, then append one shifted copy of every
# image for each offset below (labels are repeated in the same order).
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

# Each tuple is passed positionally to shift_image(img, dy, dx).
directions = [(-1, 0), (0, -1), (0, 1), (1, 0)]

for dy, dx in directions:
  X_train_augmented.extend(shift_image(image, dy, dx) for image in X_train)
  y_train_augmented.extend(y_train)



In [None]:
import numpy as np
# Convert the Python lists to arrays.
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Shuffle images and labels with one shared permutation so that every
# image keeps its own label.
rng = np.random.default_rng(seed=42)
shuffled_indices = rng.permutation(len(X_train_augmented))

X_train_augmented, y_train_augmented = (
    X_train_augmented[shuffled_indices],
    y_train_augmented[shuffled_indices],
)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train k-NN with the grid search's best hyperparameters on the augmented
# set and score it on the untouched test set.
best_params = grid_cv.best_params_
knn_clf = KNeighborsClassifier(**best_params)
knn_clf.fit(X_train_augmented, y_train_augmented)
augmented_accuracy = knn_clf.score(X_test, y_test)
augmented_accuracy


In [None]:
# Relative change in error rate from the tuned model to the augmented one:
# (augmented error / tuned error) - 1. A negative value means fewer errors.
# The variable keeps its original name, but the printed label now says
# "change" because the value is a ratio of error rates, not an error rate.
error_rate = ((1- augmented_accuracy) / (1 - tuned_accuracy)) - 1
print(f"error rate change {error_rate:.0%}")

## Tackle the Titanic dataset

In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_titanic_data():
    tarball_path = Path("datasets/titanic.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/titanic.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as titanic_tarball:
            titanic_tarball.extractall(path="datasets", filter="data")
    return [pd.read_csv(Path("datasets/titanic") / filename)
            for filename in ("train.csv", "test.csv")]

# Load the Titanic training and test sets as two DataFrames.
train_data, test_data = load_titanic_data()

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Use PassengerId as the row index of both frames.
# Not idempotent: a second run fails because the column no longer exists.
train_data, test_data = (
    frame.set_index('PassengerId') for frame in (train_data, test_data)
)


In [None]:
# Column summary of the training set (useful for spotting missing values).
train_data.info()

In [None]:
# Median age by sex and overall.
is_male = train_data.Sex == "male"
is_female = train_data.Sex == "female"

print("Male median age: ", train_data.loc[is_male, "Age"].median())
print("Female median age: ", train_data.loc[is_female, "Age"].median())
print("Overall Median age: ", train_data["Age"].median())


In [None]:
# Summary statistics of the training set.
train_data.describe()

In [None]:
# How many rows there are per value of the Survived column.
train_data.Survived.value_counts()

## Titanic Prediction

In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_titanic_data():
    tarball_path = Path("datasets/titanic.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/titanic.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as titanic_tarball:
            titanic_tarball.extractall(path="datasets", filter="data")
    return [pd.read_csv(Path("datasets/titanic") / filename)
            for filename in ("train.csv", "test.csv")]
# Reload both frames from disk; this rebinds train_data / test_data to
# freshly read copies (PassengerId is an ordinary column again).
train_data, test_data = load_titanic_data()

In [None]:
# Preview the first rows of the reloaded training set.
train_data.head()

In [None]:
# Column summary first, then summary statistics (shown as the cell output).
train_data.info()
train_data.describe()

In [None]:
# Value counts of the target and of each categorical attribute.
for column in ("Survived", "Pclass", "Embarked", "Sex"):
    display(train_data[column].value_counts())

### Preparing the Preprocessing Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Columns handled by each branch of the preprocessing.
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

# Numerical branch: fill missing values with the median, then standardize.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
preprocess_numerical = Pipeline(numerical_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse_output=False)),
]
preprocess_categorical = Pipeline(categorical_steps)

# Route each group of columns to its branch.
preprocessing_pipeline = ColumnTransformer([
    ("Numerical Transformer", preprocess_numerical, num_attribs),
    ("Categorical Transformer", preprocess_categorical, cat_attribs),
])

# NOTE: this rebinds X_train / y_train, which the MNIST cells above also use.
X_train = preprocessing_pipeline.fit_transform(train_data)
y_train = train_data['Survived']
X_train, y_train

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest on the preprocessed Titanic features.
forest_clf = RandomForestClassifier(random_state=42, n_estimators=100)
forest_clf.fit(X_train, y_train)

# Predictions for the test set, using the transformer fitted on the
# training data.
X_test = preprocessing_pipeline.transform(test_data)
y_pred = forest_clf.predict(X_test)

# 10-fold cross-validated scores on the training set; the mean is the
# cell output.
forest_scores = cross_val_score(
    forest_clf,
    X_train,
    y_train,
    cv=10,
)
forest_scores.mean()

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier on the same features.
sv_clf = SVC(kernel='rbf', gamma='scale')
sv_clf.fit(X_train, y_train)

# 10-fold cross-validated scores; the mean is the cell output.
sv_scores = cross_val_score(
    sv_clf,
    X_train,
    y_train,
    cv=10,
)
sv_scores.mean()

In [None]:
import matplotlib.pyplot as plt
# Compare the two models' cross-validation scores: the individual fold
# scores as dots, with a box plot per model drawn on the same axes.
plt.figure(figsize=(8, 4))
for position, scores in enumerate((sv_scores, forest_scores), start=1):
    plt.plot([position] * 10, scores, ".")
plt.boxplot(
    [sv_scores, forest_scores],
    labels=("SVM", "Random Forest"),
)
plt.ylabel("Accuracy")
plt.show()

## Spam Classifier