In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the Wisconsin Breast Cancer dataset
file_path = r"C:\Users\Shaurya\Downloads\data.csv"
df = pd.read_csv(file_path)

In [3]:
# Drop unnecessary columns
df = df.drop(columns=['id', 'Unnamed: 32'])

In [4]:
# Encode the 'diagnosis' column (M=1, B=0)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

In [5]:
# Handle missing values by dropping rows with null values
df = df.dropna()

In [6]:
# Split the dataset into features (X) and target variable (y)
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Apply PCA
pca = PCA(n_components=10)  # You can adjust the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [10]:
# Build and train the CatBoost model
catboost_model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.05, loss_function='Logloss')
catboost_model.fit(X_train_pca, y_train)

0:	learn: 0.6344323	total: 162ms	remaining: 1m 20s
1:	learn: 0.5802576	total: 193ms	remaining: 48s
2:	learn: 0.5360972	total: 225ms	remaining: 37.2s
3:	learn: 0.4978628	total: 255ms	remaining: 31.6s
4:	learn: 0.4630067	total: 286ms	remaining: 28.4s
5:	learn: 0.4266153	total: 317ms	remaining: 26.1s
6:	learn: 0.3977107	total: 350ms	remaining: 24.6s
7:	learn: 0.3745172	total: 381ms	remaining: 23.5s
8:	learn: 0.3505920	total: 415ms	remaining: 22.6s
9:	learn: 0.3290338	total: 446ms	remaining: 21.8s
10:	learn: 0.3107909	total: 478ms	remaining: 21.2s
11:	learn: 0.2928086	total: 509ms	remaining: 20.7s
12:	learn: 0.2749744	total: 541ms	remaining: 20.3s
13:	learn: 0.2609369	total: 573ms	remaining: 19.9s
14:	learn: 0.2456329	total: 605ms	remaining: 19.6s
15:	learn: 0.2341635	total: 638ms	remaining: 19.3s
16:	learn: 0.2222524	total: 670ms	remaining: 19s
17:	learn: 0.2115170	total: 701ms	remaining: 18.8s
18:	learn: 0.2015454	total: 733ms	remaining: 18.6s
19:	learn: 0.1906142	total: 765ms	remaining:

<catboost.core.CatBoostClassifier at 0x278ff79abd0>

In [11]:
# Make predictions on the test set
y_pred = catboost_model.predict(X_test_pca)

In [12]:
# Print accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Accuracy: 0.9649
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97        71
           1       0.95      0.95      0.95        43

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114

