# **1. IMPORTING ESSENTIAL LIBRARIES FOR THE CLASSIFICATION.**


In [22]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# **2. LOADING THE DATASET AND HANDLING MISSING VALUES.**

In [7]:
# Load the bundled breast-cancer dataset and expose it as pandas objects.
df = load_breast_cancer()
X = pd.DataFrame(
    data=df.data,
    columns=df.feature_names,
)  # one column per named feature
y = pd.Series(data=df.target)  # class labels, one per row of X

In [10]:
# Count missing values per feature column. The result gets its own name:
# binding it to `df` (as `df = print(...)` did) replaced the loaded dataset
# with None, because print() always returns None.
missing_counts = X.isnull().sum()
print(missing_counts)

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64


# **3. PERFORMING FEATURE SCALING**




In [12]:
# Unfitted standardizer; it is fitted further down, on the training split.
scaler = StandardScaler()

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits so the test set never influences it.
scaler.fit(X=X_train)
X_train_scaled = scaler.transform(X=X_train)
X_test_scaled = scaler.transform(X=X_test)

# -  Explain the preprocessing steps you performed and justify why they are necessary for this dataset.
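* Missing-value check: `X.isnull().sum()` confirms that every feature has zero missing values, so no imputation or row removal is needed before training.
* Feature scaling: It is important to bring features to a similar scale, especially for algorithms like SVM and k-NN that rely on distance metrics. The scaler is fitted on the training split only and then applied to the test split, so no test-set information leaks into preprocessing.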

# **4. CLASSIFICATION AND ALGORITHM IMPLEMENTATION**

1. **Logistic Regression**
   - A linear model for binary classification. It works well when the classes are close to linearly separable, which makes it a natural baseline for this two-class dataset.


In [14]:
# Linear baseline; the generous iteration cap gives the solver ample room
# to converge.
log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X=X_train_scaled, y=y_train)

In [15]:
# Predict class labels for the held-out (scaled) test rows.
y_pred_log_reg = log_reg.predict(X=X_test_scaled)

In [23]:
# Held-out performance: overall accuracy first, then per-class metrics.
print("Logistic Regression Performance:")
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_log_reg),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_log_reg),
)

Logistic Regression Performance:
Accuracy: 0.9736842105263158
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



2. **Decision Tree Classifier**
- A non-linear model that splits the data based on feature values to make decisions. Good for capturing complex relationships and interactions in the data.

In [19]:
# Seed the tree (same seed as the train/test split). Without it, ties
# between equally good splits are broken at random, so the fitted tree and
# its score can change on every run. Scores recorded from an unseeded run
# may therefore differ slightly after re-execution.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_scaled, y_train)

In [20]:
# Predict class labels for the held-out (scaled) test rows.
y_pred_decision_tree = decision_tree.predict(X=X_test_scaled)

In [24]:
# Held-out performance: overall accuracy first, then per-class metrics.
print("Decision Tree Classifier Performance:")
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_decision_tree),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_decision_tree),
)

Decision Tree Classifier Performance:
Accuracy: 0.9473684210526315
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



**3. Random Forest Classifier**

- An ensemble method that builds multiple decision trees and combines their predictions. It reduces overfitting and improves accuracy.

In [25]:
# Seed the forest (same seed as the train/test split). Bootstrap sampling
# and per-split feature selection are random, so an unseeded forest gives
# a different model and score on every run. Scores recorded from an
# unseeded run may therefore differ slightly after re-execution.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train_scaled, y_train)


In [26]:
# Predict class labels for the held-out (scaled) test rows.
y_pred_random_forest = random_forest.predict(X=X_test_scaled)

In [27]:
# Held-out performance: overall accuracy first, then per-class metrics.
print("Random Forest Classifier Performance:")
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_random_forest),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_random_forest),
)

Random Forest Classifier Performance:
Accuracy: 0.9649122807017544
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



**4. k-Nearest Neighbors (k-NN)**
- A non-parametric method that classifies an instance based on the majority vote of its nearest neighbors. Simple and effective for small datasets.

In [28]:
# Distance-based classifier with library-default settings, trained on the
# standardized features.
knn = KNeighborsClassifier()
knn.fit(X=X_train_scaled, y=y_train)

In [29]:
# Predict class labels for the held-out (scaled) test rows.
y_pred_knn = knn.predict(X=X_test_scaled)

In [30]:
# Held-out performance: overall accuracy first, then per-class metrics.
print("k-Nearest Neighbors Performance:")
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_knn),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_knn),
)

k-Nearest Neighbors Performance:
Accuracy: 0.9473684210526315
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



# **5. MODEL COMPARISON**

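-- Best performing algorithm is **Logistic Regression** (test accuracy ≈ 0.974), narrowly ahead of the Random Forest Classifier (≈ 0.965). This suggests that, after standardization, the two classes are close to linearly separable, so a simple linear model is sufficient.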

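-- Worst performing algorithms are the **Decision Tree Classifier** and **k-NN**, tied at ≈ 0.947 test accuracy. Scaling is not the cause for k-NN: the features were standardized with `StandardScaler` before fitting. More likely explanations are that a single unpruned tree overfits the training data and that k-NN was run with its default settings (no tuning of `k`). With only 114 test samples, the gap to the best model amounts to about three predictions.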