# Cancer Classification using XGBoost

This notebook demonstrates a machine learning pipeline for cancer type classification using XGBoost. The process includes:
- Data loading and preprocessing
- Handling class imbalance with SMOTE
- Feature selection using Random Forest
- Hyperparameter tuning with GridSearchCV
- Model evaluation with various metrics

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, \
    confusion_matrix

## 2. Load and Explore the Dataset

In [None]:
# Read the raw CSV into a DataFrame
print("Loading dataset...")
file_path = "cancer_classification_dataset.csv"
df = pd.read_csv(file_path)

# Report the frame's dimensions, then preview the leading rows as the cell output
print(f"\nDataset shape: {df.shape}")
print("\nFirst 5 rows:")
df.head(5)

## 3. Prepare Features and Target Variable

In [None]:
# Split the frame into the label column and every remaining column
target_column = 'cancer_type'
y = df[target_column]
X = df.drop(columns=[target_column])

# Report missing labels before encoding (this only reports; nothing is dropped)
print("Checking target variable for NaN values...")
missing_labels = y.isna().sum()
print(f"NaN values in target: {missing_labels}")

# Map each cancer-type label to an integer code
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Summarise how many samples fall under each encoded class
class_counts = Counter(y_encoded)
print(f"\nClass distribution: {class_counts}")
print(f"Number of classes: {len(class_counts)}")

# List which integer code stands for which original label
print("\nClass mapping:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"Class {class_id}: {class_name}")

## 4. Visualize Original Class Distribution

In [None]:
# Bar chart of the per-class sample counts before any resampling
original_counts = Counter(y_encoded)
encoded_classes, frequencies = zip(*original_counts.items())
class_positions = range(len(label_encoder.classes_))

plt.figure(figsize=(10, 6))
plt.bar(encoded_classes, frequencies)
plt.title("Original Class Distribution")
plt.xlabel("Class Labels")
plt.ylabel("Frequency")
# Bars sit at the encoded integers; label those positions with the original names
plt.xticks(class_positions, label_encoder.classes_, rotation=90)
plt.tight_layout()
plt.show()

## 5. Feature Scaling and SMOTE for Class Imbalance

In [None]:
# Standardise the feature matrix; the scaler is fit on all rows of X
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Oversample the scaled data with SMOTE (seeded for repeatability)
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_encoded)

# Tally the classes again on the resampled labels
resampled_counts = Counter(y_resampled)
print(f"\nNew class distribution after SMOTE: {resampled_counts}")

# Bar chart of the per-class sample counts after resampling
resampled_classes, resampled_frequencies = zip(*resampled_counts.items())
class_positions = range(len(label_encoder.classes_))

plt.figure(figsize=(10, 6))
plt.bar(resampled_classes, resampled_frequencies)
plt.title("Class Distribution After SMOTE")
plt.xlabel("Class Labels")
plt.ylabel("Frequency")
# Bars sit at the encoded integers; label those positions with the original names
plt.xticks(class_positions, label_encoder.classes_, rotation=90)
plt.tight_layout()
plt.show()

## 6. Feature Selection with Random Forest

In [None]:
# Fit a Random Forest on the resampled data purely to rank the features
print("Training Random Forest model for feature selection...")
rf_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_resampled, y_resampled)

# Pair every column name with its importance score and rank them, highest first
print("Selecting top 100 features...")
feature_names = X.columns
feature_importances = rf.feature_importances_
important_features = (
    pd.DataFrame({'Feature': feature_names,
                  'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Keep the names of the 100 best-ranked features (all of them if there are fewer)
top_features = important_features['Feature'].head(100).values

# Cell output: the 20 best-ranked features
print("\nTop 20 most important features:")
important_features.head(20)

In [None]:
# Horizontal bar chart of the 20 best-ranked features
top_20 = important_features.head(20)

plt.figure(figsize=(12, 8))
plt.barh(top_20['Feature'], top_20['Importance'])
plt.title('Top 20 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
# barh draws the first row at the bottom; flip so the best feature is on top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 7. Reduce Dataset to Selected Features

In [None]:
# Keep only the selected feature columns.
# NOTE(review): this slices the original X, not X_scaled / X_resampled, so the
# rows used for modelling below are neither standardised nor SMOTE-resampled.
X_reduced = X.loc[:, top_features]
print(f"Reduced feature set shape: {X_reduced.shape}")

## 8. Train-Test Split

In [None]:
# Train-test split (80% train / 20% test, seeded for repeatability).
# The split is stratified on the encoded labels so every cancer type keeps the
# same proportion in both partitions. With an imbalanced target, an unstratified
# split can leave a rare class out of the training or test set entirely, which
# breaks the classifier's contiguous-label expectation and the per-class
# evaluation further down.
# NOTE(review): the feature ranking behind X_reduced was fit on all rows before
# this split, so the held-out rows influenced which features were kept.
print("Performing train-test split...")
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_encoded,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y_encoded)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

## 9. Option 1: XGBoost Model with Best Parameters (Skip GridSearchCV)

Use this cell if you want to skip the time-consuming GridSearchCV process and use the best parameters that were previously determined.

In [None]:
# Hyperparameters carried over from an earlier GridSearchCV run
best_params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 300,
    'subsample': 0.8
}

print("Training XGBoost model with best parameters...")
print(f"Using parameters: {best_params}")

# Build the classifier straight from the parameter dict, then fit it
best_xgb_model = XGBClassifier(random_state=42, **best_params)
best_xgb_model.fit(X_train, y_train)

# Predict the encoded class of every held-out sample
print("Making predictions on test data...")
y_pred = best_xgb_model.predict(X_test)

## 9. Option 2: Hyperparameter Tuning with GridSearchCV

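**Note:** This process can be time-consuming. Run this cell if you want to perform your own hyperparameter search. If you run it after Option 1, the model and predictions produced here replace the ones from Option 1.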

In [None]:
# Candidate values for each hyperparameter; the search tries every combination
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Base estimator that the search clones for every candidate
xgb_model = XGBClassifier(random_state=42)

# 5-fold cross-validated grid search scored on accuracy, using all CPU cores
print("Performing hyperparameter tuning with cross-validation...")
search_settings = {
    'estimator': xgb_model,
    'param_grid': param_grid,
    'cv': 5,
    'scoring': 'accuracy',
    'verbose': 2,
    'n_jobs': -1,
}
grid_search = GridSearchCV(**search_settings)

# Run the search on the training partition only
grid_search.fit(X_train, y_train)

# Report the winning combination
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# Take the estimator exposed by the search as the final model
print("Training best XGB model...")
best_xgb_model = grid_search.best_estimator_

# Predict the encoded class of every held-out sample
print("Making predictions on test data...")
y_pred = best_xgb_model.predict(X_test)

## 10. Model Evaluation

In [None]:
# Overall accuracy on the held-out set, followed by the per-class report
xgb_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of XGBoost model: {xgb_accuracy:.4f}")

xgb_report = classification_report(y_test, y_pred)
print("\nClassification Report for XGBoost:\n")
print(xgb_report)

# Create a function to calculate top-k accuracy
def top_k_accuracy(y_true, y_pred_proba, k=2):
    """Return the fraction of samples whose true class is among the k top-scored classes.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class of each sample, expressed as the column index of that class
        in ``y_pred_proba``. Lists, NumPy arrays and pandas Series are accepted;
        a Series is read positionally, so its index labels do not matter.
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Per-class scores for each sample.
    k : int, default=2
        How many of the highest-scoring classes count as a hit. Values larger
        than the number of classes behave like ``n_classes``.

    Returns
    -------
    float
        Mean of the per-sample hit indicators.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, if ``y_pred_proba`` is not 2-D, or if the
        two inputs do not have the same number of samples.
    """
    # A slice of [-0:] would select every column and report a perfect score.
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Convert up front so indexing is positional even for a pandas Series.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_proba)
    if scores.ndim != 2 or scores.shape[0] != labels.shape[0]:
        raise ValueError(
            "y_pred_proba must be 2-D with one row per entry of y_true"
        )

    # argsort is ascending, so the last k columns hold the k highest scores.
    top_k = np.argsort(scores, axis=1)[:, -k:]
    correct = (top_k == labels[:, None]).any(axis=1)
    return np.mean(correct)

# Per-class scores for the held-out set, shared by both top-k metrics
y_pred_proba = best_xgb_model.predict_proba(X_test)

# Share of samples whose true class is among the 2, then the 3, best-scored classes
top_2_accuracy = top_k_accuracy(y_test, y_pred_proba, k=2)
top_3_accuracy = top_k_accuracy(y_test, y_pred_proba, k=3)

print(f"\nTop-2 Accuracy: {top_2_accuracy:.4f}")
print(f"Top-3 Accuracy: {top_3_accuracy:.4f}")

## 11. Cancer-Type Specific Accuracy Analysis

In [None]:
# For each cancer type, the share of its test samples that were predicted correctly
print("Cancer-type specific accuracy:")
class_accuracies = {}
for class_id, class_label in enumerate(label_encoder.classes_):
    # Boolean mask over the test set marking the samples of this class
    is_this_class = y_test == class_id
    if not is_this_class.any():
        print(f"No test samples for cancer type '{class_label}'")
        continue
    hits = y_pred[is_this_class] == y_test[is_this_class]
    class_accuracies[class_label] = np.mean(hits)
    print(f"Accuracy for cancer type '{class_label}': {class_accuracies[class_label]:.4f}")

# Bar chart with one bar per cancer type that has test samples
plt.figure(figsize=(12, 8))
plt.bar(class_accuracies.keys(), class_accuracies.values())
plt.title('Class-Specific Accuracies')
plt.xlabel('Cancer Type')
plt.ylabel('Accuracy')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## 12. Confusion Matrix Analysis

In [None]:
# Generate and visualize confusion matrix.
# `labels` is pinned to every encoded class so the matrix always has one
# row/column per entry of label_encoder.classes_. Without it, a class that is
# absent from both y_test and y_pred is dropped, which shifts the tick labels
# and makes later indexing by class position go out of bounds.
class_names = label_encoder.classes_
tick_marks = np.arange(len(class_names))
conf_matrix = confusion_matrix(y_test, y_pred, labels=tick_marks)

plt.figure(figsize=(12, 10))
plt.imshow(conf_matrix, cmap='Blues')
plt.colorbar()

# Add labels: rows are true classes, columns are predicted classes
plt.xticks(tick_marks, class_names, rotation=90)
plt.yticks(tick_marks, class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Add text annotations to non-empty cells only; switch to white text on the
# darker half of the colour scale so the count stays readable
thresh = conf_matrix.max() / 2
for i, j in np.argwhere(conf_matrix > 0):
    plt.text(j, i, format(conf_matrix[i, j], 'd'),
             horizontalalignment="center",
             color="white" if conf_matrix[i, j] > thresh else "black")

plt.tight_layout()
plt.show()

## 13. Analyze Most Confused Cancer Types

In [None]:
# Analyze which cancer types are most confused with each other.
# Every off-diagonal confusion-matrix cell with a non-zero count is one
# (true type, predicted type) misclassification pair.
print("Cancer Types Most Confused with Each Other:")
confusion_pairs = []

n_classes = len(label_encoder.classes_)
for i in range(n_classes):
    for j in range(n_classes):
        if i == j or conf_matrix[i, j] <= 0:
            continue
        true_type = label_encoder.classes_[i]
        predicted_type = label_encoder.classes_[j]
        confusion_pairs.append({
            'true_type': true_type,
            'predicted_type': predicted_type,
            'count': conf_matrix[i, j]
        })
        print(f"{true_type} is confused with {predicted_type} with count {conf_matrix[i, j]}")

# Create a DataFrame of confusion pairs and sort by count.
# The columns are declared explicitly so the frame still has a 'count' column
# when there are no misclassifications; otherwise sort_values raises KeyError.
confusion_df = pd.DataFrame(confusion_pairs,
                            columns=['true_type', 'predicted_type', 'count'])
most_confused = confusion_df.sort_values('count', ascending=False).head(10)
print("\nTop 10 Most Confused Cancer Type Pairs:")
most_confused

## 14. Visualize Most Confused Pairs

In [None]:
# Visualize top confused pairs as a horizontal bar chart
plt.figure(figsize=(12, 8))
top_confused = most_confused.head(10)

# One "true → predicted" label per row (the separator was previously a
# mis-decoded arrow character)
pair_labels = [
    f"{true_type} → {predicted_type}"
    for true_type, predicted_type in zip(top_confused['true_type'],
                                         top_confused['predicted_type'])
]

plt.barh(pair_labels, top_confused['count'])
plt.xlabel('Count')
plt.title('Top 10 Most Confused Cancer Type Pairs')
# barh draws the first row at the bottom; flip so the most frequent confusion
# is on top, matching the feature-importance chart
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()