In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Read the raw dataset; the path is relative to the notebook's working directory.
csv_path = "BreastCancer.csv"
data = pd.read_csv(csv_path)


In [None]:
# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

In [None]:
# Remove the two columns that are not used as model inputs.
# NOTE(review): 'id' is presumably a record identifier and 'Unnamed: 32' an
# empty trailing column created by the CSV header -- confirm against the file.
# errors='ignore' keeps this cell re-runnable (the columns are already gone on
# a second run) and tolerant of CSV exports that lack the trailing column.
data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')


In [None]:
# Encode the target as integers: 'M' -> 1, 'B' -> 0.
# A plain .map() turns every value it does not know into NaN, so re-running
# the cell (when the column already holds 0/1) would silently wipe the target.
# The guard below maps only raw labels, leaves an already-encoded column
# untouched, and fails loudly on anything else.
diagnosis_codes = {'M': 1, 'B': 0}
if data['diagnosis'].isin(list(diagnosis_codes.keys())).all():
    data['diagnosis'] = data['diagnosis'].map(diagnosis_codes)
elif not data['diagnosis'].isin(list(diagnosis_codes.values())).all():
    unexpected = sorted(
        set(data['diagnosis'].dropna().unique())
        - set(diagnosis_codes.keys())
        - set(diagnosis_codes.values()),
        key=str,
    )
    raise ValueError(f"Unexpected or missing values in 'diagnosis': {unexpected}")


In [None]:
# Number of rows that are exact copies of an earlier row.
duplicate_rows_before = data.duplicated().sum()
print("Duplicate values before dropping features:", duplicate_rows_before)


In [None]:
# Pairwise Pearson correlation between all columns (pandas' default method,
# spelled out here for readability).
corr_matrix = data.corr(method="pearson")


In [None]:
# Visualise the correlation matrix; annotations are off because the grid is
# too dense for per-cell numbers to be readable.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# Prune redundant features: for every pair whose correlation magnitude exceeds
# `threshold`, drop the later of the two columns.
threshold = 0.9

# Redundancy is a feature-vs-feature question, so the target is excluded, and
# the magnitude is used so that strong negative correlations are caught too.
feature_corr = corr_matrix.drop(
    index='diagnosis', columns='diagnosis', errors='ignore'
).abs()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (always 1.0) never triggers a drop.
upper_mask = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
upper = feature_corr.where(upper_mask)

to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

# errors='ignore' keeps the cell re-runnable once the columns are gone.
data = data.drop(columns=to_drop, errors='ignore')
print("Dropped Features:", to_drop)



In [None]:
# Re-check data quality now that the feature set has shrunk.
duplicate_rows_after = data.duplicated().sum()
missing_per_column = data.isna().sum()
print("Duplicate values after dropping features:", duplicate_rows_after)
print("Missing values:\n", missing_per_column)

In [None]:
# Standardise every feature column and rebuild a labelled frame that also
# carries the target.
feature_frame = data.drop(columns=['diagnosis'])

# NOTE(review): the scaler is fitted on every row, including rows that the
# later train/test split assigns to the test set, so test-set statistics
# influence the training features. Fitting on the training rows only would
# require splitting before this cell.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Take column names and index from the frame that was actually scaled instead
# of assuming 'diagnosis' is the first column of `data`; keeping the index
# guarantees the target re-attaches to the correct rows.
df_scaled = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=feature_frame.index,
)
df_scaled['diagnosis'] = data['diagnosis']

In [None]:
# Separate features from the target, then hold out 20% of the rows for
# testing. Stratifying on `y` keeps the class ratio equal in both splits.
target_column = 'diagnosis'
x = df_scaled.drop(columns=[target_column])
y = df_scaled[target_column]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Per-class row counts in the training split prior to oversampling.
class_counts_before = y_train.value_counts()
print("Class distribution before SMOTE:\n", class_counts_before)


In [None]:
# Oversample the training split only; the test split is left untouched.
# The fixed seed makes the resampled set reproducible.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(x_train, y_train)
x_train_resampled, y_train_resampled = resampled_pair

In [None]:
# Per-class row counts in the training split after oversampling.
class_counts_after = y_train_resampled.value_counts()
print("New class distribution after SMOTE:\n", class_counts_after)


In [None]:
# Side-by-side class counts: original training labels vs. resampled labels.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
panels = [
    ("Before SMOTE", y_train),
    ("After SMOTE", y_train_resampled),
]
for axis, (panel_title, labels) in zip(axes, panels):
    sns.countplot(x=labels, ax=axis)
    axis.set_title(panel_title)
plt.show()

In [None]:
# Persist the scaled features plus target; the row index is not written.
output_path = "Preprocessed_BreastCancer.csv"
df_scaled.to_csv(output_path, index=False)


In [None]:
# Baseline classifier: K fixed at 5, trained on the oversampled training set.
baseline_k = 5
knn = KNeighborsClassifier(n_neighbors=baseline_k)
knn.fit(X=x_train_resampled, y=y_train_resampled)

In [None]:
# Predict labels for the held-out test rows with the baseline model.
y_pred_knn = knn.predict(X=x_test)

In [None]:
# Overall accuracy followed by per-class precision / recall / F1.
baseline_accuracy = accuracy_score(y_test, y_pred_knn)
baseline_report = classification_report(y_test, y_pred_knn)
print("KNN Accuracy:", baseline_accuracy)
print(baseline_report)

In [None]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

# Tune K with 5-fold cross-validation.
# Oversampling is placed inside the pipeline so that, in every CV fold, SMOTE
# is fitted on that fold's training part only. Cross-validating on data that
# was oversampled up front lets synthetic points derived from validation rows
# leak into the training folds and inflates the CV score.
knn_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

# Candidate values for K (1..19), addressed through the pipeline step name.
param_grid = {'knn__n_neighbors': list(range(1, 20))}

# Fit on the un-resampled training split; the pipeline resamples internally.
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Best K
print("Best K:", grid_search.best_params_['knn__n_neighbors'])

# best_estimator_ is the whole pipeline refitted on the full training split.
# The sampler step is applied during fitting only, so predict() scales out to
# untouched test rows exactly like a plain classifier.
best_knn = grid_search.best_estimator_
y_pred_best_knn = best_knn.predict(x_test)

print("Optimized KNN Accuracy:", accuracy_score(y_test, y_pred_best_knn))
print(classification_report(y_test, y_pred_best_knn))


In [None]:
# Classify one hand-entered sample with the tuned model.
# NOTE(review): the values are assumed to be raw (unscaled) measurements
# listed in the same order as `x.columns` -- confirm, since several entries
# look out of range for the features that survive the correlation filter.
new_input = np.array([[10.5, 14.2, 67.4, 0.12, 0.09, 0.2, 0.5, 0.03, 0.04, 0.1,
                        12.3, 15.6, 70.1, 0.13, 0.07, 0.18, 0.6, 0.02, 0.05, 0.09]])

# Label the sample with the training feature names. This raises immediately
# if the number of values does not match the number of features, and avoids
# the "X does not have valid feature names" warning from estimators that were
# fitted on DataFrames.
new_input_frame = pd.DataFrame(new_input, columns=x.columns)

# Apply the already-fitted scaler (transform only, no refit) and keep the
# column labels for the model and for later cells.
new_input_scaled = pd.DataFrame(
    scaler.transform(new_input_frame),
    columns=x.columns,
)

prediction = best_knn.predict(new_input_scaled)
print("Prediction:", "Malignant" if prediction[0] == 1 else "Benign")



In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the tuned model on the test split.
# Rows are actual classes, columns are predicted classes; labels follow the
# 0 -> Benign, 1 -> Malignant encoding.
class_names = ["Benign", "Malignant"]
cm = confusion_matrix(y_test, y_pred_best_knn)

cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Convert prediction to text
prediction_text = "Malignant" if prediction[0] == 1 else "Benign"

# Project the scaled features onto their first two principal components so
# the dataset can be drawn in 2-D.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x)

# Project the new sample once and reuse the result for both coordinates.
# Labelling it with the training feature names matches how PCA was fitted and
# avoids the feature-name warning; np.asarray makes this work whether the
# scaled sample is an array or a DataFrame.
new_input_pca = pca.transform(
    pd.DataFrame(np.asarray(new_input_scaled), columns=x.columns)
)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=x_pca[:, 0], y=x_pca[:, 1], hue=y, palette="coolwarm", alpha=0.6)
plt.scatter(new_input_pca[:, 0], new_input_pca[:, 1], color="black", marker="X", s=200, label="New Input")

plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title(f"New Input Classification: {prediction_text}")
plt.legend()
plt.show()
