## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score,roc_curve, auc


## Loading Dataset

In [None]:
# Load the thyroid cancer records from the local CSV file into a DataFrame.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
csv_path = r"C:\Users\lakshita\Desktop\datasets\thyroid_cancer.csv"
df = pd.read_csv(csv_path)

## Data Processing

In [None]:
# Show the first five rows to get a feel for the columns and their values.
df.head(n=5)

In [None]:
# NOTE(review): this repeats the preview shown by the previous cell; one of
# the two can be removed.
df.head()

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

## Checking NULL values

In [None]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()

## Exploratory Data Analysis (EDA) 

In [None]:
# Pairwise relationship grid of the dataset, coloured by recurrence status.
pair_grid = sns.pairplot(data=df, hue="Recurred", palette="husl")
plt.show()

In [None]:
# One histogram per numeric column (20 bins each) under a shared figure title.
hist_options = {"figsize": (8, 4), "bins": 20}
df.hist(**hist_options)
plt.suptitle("Distribution of Numerical Features", fontsize=15)
plt.show()

## Recurrence vs Risk Level

In [None]:
# Count of records per risk level, split by whether the cancer recurred.
plt.figure(figsize=(8, 5))
risk_ax = sns.countplot(data=df, x='Risk', hue='Recurred', palette='coolwarm')
risk_ax.set_title('Thyroid Cancer Recurrence by Risk Level')
plt.show()

In [None]:
# Correct the misspelled column header before it is referenced below.
df.rename(columns={"Hx Radiothreapy": "Hx Radiotherapy"}, inplace=True)

# Columns to summarise with one count plot each.
categorical_cols = [
    'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiotherapy',
    'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
    'Focality', 'Risk', 'Response', 'Recurred',
]

# 4 x 3 grid: exactly one panel per column listed above.
grid_rows, grid_cols = 4, 3
plt.figure(figsize=(11, 11))
for panel_index, column_name in enumerate(categorical_cols, start=1):
    plt.subplot(grid_rows, grid_cols, panel_index)
    column_values = df[column_name]
    # hue mirrors x so the palette colours each bar; the legend would be redundant.
    panel_ax = sns.countplot(
        x=column_values,
        hue=column_values,
        palette="coolwarm",
        edgecolor="black",
        legend=False,
    )
    panel_ax.set_title(column_name)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


## Boxplots for Outlier Detection

In [None]:
# Columns inspected for outliers.
# NOTE(review): T, N, M and Stage look like staging categories rather than
# continuous measurements -- confirm they are numeric before reading these plots.
numerical_cols = ['Age', 'T', 'N', 'M', 'Stage']

# 2 x 3 grid; only the first five panels are used.
plt.figure(figsize=(10, 6))
for panel_index, column_name in enumerate(numerical_cols, start=1):
    plt.subplot(2, 3, panel_index)
    box_ax = sns.boxplot(x=df[column_name], color="skyblue")
    box_ax.set_title(column_name)

plt.suptitle("Boxplots for Outlier Detection", fontsize=15)
plt.tight_layout()
plt.show()

## Age vs Cancer Recurrence

In [None]:
# Age distribution for each recurrence outcome, one box per outcome.
plt.figure(figsize=(8, 5))
age_ax = sns.boxplot(
    data=df,
    x='Recurred',
    y='Age',
    hue='Recurred',  # same variable as x, used only to apply the palette
    palette='pastel',
    legend=False,
)
age_ax.set_title('Age vs Cancer Recurrence')
plt.show()

## Feature Engineering {Encode Categorical Features}

In [None]:
# Columns listed as numerical for this notebook.
numerical_cols = ['Age', 'T', 'N', 'M', 'Stage']
# Every object-dtype (string) column is treated as categorical.
categorical_cols = df.select_dtypes(include=['object']).columns

# One-Hot Encoding (for nominal categories).
# This must run BEFORE `df` is overwritten with integer codes below; otherwise
# the dummies are built from label codes (e.g. "Gender_1") instead of the
# real category values.
df_one_hot_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Label Encoding (for ordinal categories).
# Each column is fitted once and the same codes are written to both the
# labelled copy and `df`, so the two can never disagree. Later cells
# (correlation, scaling, modelling) rely on `df` being fully numeric.
df_label_encoded = df.copy()
for col in categorical_cols:
    le = LabelEncoder()  # fresh encoder per column; codes follow sorted category order
    # astype(str) keeps the encoder from failing on mixed or missing values.
    encoded_values = le.fit_transform(df[col].astype(str))
    df_label_encoded[col] = encoded_values
    df[col] = encoded_values

# Print results
print("Label Encoded DataFrame:")
print(df_label_encoded.head())

print("\nOne-Hot Encoded DataFrame:")
print(df_one_hot_encoded.head())


## Correlation Matrix & Heatmap

In [None]:
# Pairwise correlation between all columns of the (already numeric) DataFrame.
corr_matrix = df.corr()

# Annotated heatmap of the correlation coefficients, two decimals per cell.
plt.figure(figsize=(12, 6))
corr_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)
corr_ax.set_title("Feature Correlation Heatmap")
plt.show()


## Feature Scaling (Standardization)

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Standardize the int64/float64 columns, but never the target.
# After label encoding, 'Recurred' can itself be an int64 column; scaling it
# would turn the class labels into continuous floats, which the classifier
# fitted later rejects as an unknown (continuous) label type.
numerical_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('Recurred', errors='ignore')  # no-op when the target was not selected
)
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features. Consider fitting on
# the training rows only.
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Display the first few rows after scaling
df.head()

## ML MODEL IMPLEMENTATION

In [None]:
# Separate the target column ('Recurred') from the feature columns.
target_column = 'Recurred'
print(df[target_column].value_counts())
X = df.drop(columns=[target_column])
y = df[target_column]

# Hold out 20% of the rows for testing; stratify so both splits keep the
# same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


## Model RandomForest Classifier

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = clf.predict(X_test)
# Per-class probabilities; the second column is kept as the score used for ROC AUC.
class_probabilities = clf.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

## Evaluate the Model

In [None]:
# Compute every headline metric first, then print them in a fixed order.
metric_summary = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("ROC AUC Score:", roc_auc_score(y_test, y_prob)),
]
for metric_label, metric_value in metric_summary:
    print(metric_label, metric_value)

## Classification Report

In [None]:
# Classification report as a nested dict so it can be tabulated.
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Transpose so each report entry becomes a row and each metric a column.
report_df = pd.DataFrame(report_dict).transpose()

# Heatmap of the report without its final row and final column.
# NOTE(review): presumably these are the "weighted avg" row and the "support"
# column -- confirm against the printed report.
heatmap_values = report_df.iloc[:-1, :-1]
plt.figure(figsize=(8, 5))
report_ax = sns.heatmap(heatmap_values, annot=True, cmap="coolwarm", linewidths=0.5)
report_ax.set_title("Classification Report Heatmap")
plt.show()

## Feature Importance

In [None]:
# Importance score of each feature as reported by the fitted forest,
# paired with the feature names in the same column order as X.
importances = clf.feature_importances_
features = X.columns

# Horizontal bar chart, one bar per feature.
plt.figure(figsize=(8, 5))
# `hue` mirrors `y` so the palette colours each bar without triggering
# seaborn's "palette without hue" deprecation warning. This is the same
# hue/legend=False pattern the count plots in this notebook use.
sns.barplot(
    x=importances,
    y=features,
    hue=features,
    palette='viridis',
    legend=False,
)
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.title('Feature Importance in Random Forest')
plt.show()


## Confusion Matrix Heatmap

In [None]:
# Confusion matrix of actual vs. predicted classes on the test split.
cm = confusion_matrix(y_test, y_pred)

# Tick labels for the two classes, in matrix order.
# NOTE(review): assumes encoded class 0 means "no recurrence" -- confirm
# against the label encoding of 'Recurred'.
class_names = ['No Recurrence', 'Recurrence']

# Annotated heatmap with integer cell counts.
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()
# Also print the raw counts; the matrix computed above is reused.
print("Confusion Matrix:\n", cm)

## ROC Curve

In [None]:
# False/true positive rates across thresholds, and the area under that curve.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the chance-level diagonal for reference.
plt.figure(figsize=(6, 4))
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='lower right')
plt.show()

## Distribution of Predictions

In [None]:
# Histogram (with KDE overlay) of the predicted probabilities on the test split.
prob_ax = sns.histplot(y_prob, bins=30, kde=True, color='purple')
# Dashed line at 0.5, the decision boundary.
prob_ax.axvline(0.5, color='red', linestyle='--')
prob_ax.set_xlabel('Prediction Probability')
prob_ax.set_ylabel('Count')
prob_ax.set_title('Distribution of Prediction Probabilities')
plt.show()
