# Loading the data

In [None]:
!pip install imbalanced-learn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score,roc_curve,f1_score
from sklearn.model_selection import KFold

# Shared palette of hex colours, passed as `color=` to the bar charts in this notebook.
# NOTE(review): "#FF69B4" appears twice (2nd and 11th entries) — confirm whether that is intended.
colors = ["#89CFF0", "#FF69B4", "#FFD700", "#7B68EE", "#FF4500",
          "#9370DB", "#32CD32", "#8A2BE2", "#FF6347", "#20B2AA",
          "#FF69B4", "#00CED1", "#FF7F50", "#7FFF00", "#DA70D6"]

def evaluate_model(y_test, y_pred, y_score=None):
    """Print accuracy, precision, recall, ROC AUC and F1 as percentages.

    Args:
        y_test: True binary labels.
        y_pred: Predicted hard class labels.
        y_score: Optional continuous scores (e.g. positive-class
            probabilities). When given, ROC AUC is computed from them.
            When omitted, ROC AUC falls back to ``y_pred`` as before, which
            only reflects a single decision threshold.

    Returns:
        None. The five metrics are written to stdout.
    """
    # AUC is a ranking metric: prefer real scores whenever the caller has them.
    auc_input = y_pred if y_score is None else y_score

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc = roc_auc_score(y_test, auc_input)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy of the model is: {accuracy * 100:.2f} %")
    print(f"Precision of the model is: {precision * 100:.2f} %")
    print(f"Recall of the model is: {recall * 100:.2f} %")
    print(f"AUC value of the model is: {roc * 100:.2f} %")
    print(f"F1 score of the model is: {f1 * 100:.2f} %")

In [None]:
# Load the thyroid dataset from CSV into `df`.
# NOTE(review): absolute, user-specific Windows path — this cell only runs where that exact
# folder exists; consider a path relative to the notebook.
df = pd.read_csv(r'C:\Users\Faza Ulfath\Downloads\Thyroid-Cancer-Prediction-ML-Application--master\Thyroid_Diff.csv')  

In [None]:
df.head()  # preview the first rows

In [None]:
df.shape  # (number of rows, number of columns)

In [None]:
df.columns  # column labels

In [None]:
df.dtypes  # dtype of each column

In [None]:
# Frequency of each value in the radiotherapy-history column
# (the column label is spelled 'Radiothreapy' here and everywhere else in this notebook).
df['Hx Radiothreapy'].value_counts()

In [None]:
df.isnull().sum().sum()  # total number of missing cells across the whole frame

In [None]:
df[df.duplicated()]  # rows that exactly repeat an earlier row

In [None]:
df['Physical Examination'].value_counts()  # this column is one-hot encoded by the transformer cell

In [None]:
# Physical-examination value counts within each recurrence group.
df.groupby("Recurred")["Physical Examination"].value_counts()

<div style="text-align: center; font-size: 35px; font-weight: bold;">Exploratory Data Analysis (EDA)</div>


In [None]:
# Age distribution: normalised histogram with a kernel-density curve on the same axes.
fig, ax = plt.subplots(figsize = (20, 5),dpi=500)

# Each artist carries its own label so the legend cannot be mismatched; passing a bare
# list of labels to legend() pairs them with artists purely by their order on the axes.
ax.hist(df['Age'], bins = 25, edgecolor = 'black', alpha = 0.7, color = 'skyblue', density = True,
        label = 'Histogram')

df['Age'].plot(kind = 'kde', color = 'red', ax = ax, label = 'Density Curve')

ax.set_xlabel('Age')
# density=True normalises the histogram, so the y-axis shows density, not counts.
ax.set_ylabel('Density')
ax.set_title('Age Distribution Histogram with Density Curve')
ax.legend()
plt.show()

In [None]:
df['Stage'].value_counts().index  # distinct stage labels, in value_counts() order

In [None]:
# Bar chart of the number of patients per cancer stage, with the count printed above each bar.
stage_counts = df["Stage"].value_counts()  # computed once and reused for bars and tick labels

plt.figure(figsize = (8, 6))
ax = stage_counts.plot(kind = 'bar', color = colors, rot = 0)
ax.set_xticklabels(stage_counts.index)
ax.tick_params(axis = 'both', labelsize = 15)  # axis-wide setting: apply once, not per bar

for p in ax.patches:
    # Centre the label on the bar using its real width rather than a fixed offset.
    ax.annotate(int(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height() + 1), ha = 'center', va = 'bottom', color = 'black')
plt.xlabel('Cancer Stage', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20)
plt.ylabel('Number of Occurrences', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20);

In [None]:
# Bar chart of the number of patients per gender, with the count printed above each bar.
gender_counts = df["Gender"].value_counts()  # computed once and reused for bars and tick labels

plt.figure(figsize = (8, 6))
ax = gender_counts.plot(kind = 'bar', color = colors, rot = 0)
ax.set_xticklabels(gender_counts.index)
ax.tick_params(axis = 'both', labelsize = 15)  # axis-wide setting: apply once, not per bar

for p in ax.patches:
    # Centre the label on the bar using its real width rather than a fixed offset.
    ax.annotate(int(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height() + 1), ha = 'center', va = 'bottom', color = 'black')
plt.xlabel('Gender', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20)
plt.ylabel('Number of Occurrences', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20);

In [None]:
# Bar chart of the number of patients per risk category, with the count printed above each bar.
risk_counts = df["Risk"].value_counts()  # computed once and reused for bars and tick labels

plt.figure(figsize = (8, 6))
ax = risk_counts.plot(kind = 'bar', color = colors, rot = 0)
ax.set_xticklabels(risk_counts.index)
ax.tick_params(axis = 'both', labelsize = 15)  # axis-wide setting: apply once, not per bar

for p in ax.patches:
    # Centre the label on the bar using its real width rather than a fixed offset.
    ax.annotate(int(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height() + 1), ha = 'center', va = 'bottom', color = 'black')
plt.xlabel('Risk', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20)
plt.ylabel('Number of Occurrences', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20);

In [None]:
# Bar chart of the target distribution (recurred vs. not), with the count printed above each bar.
recurred_counts = df["Recurred"].value_counts()  # computed once and reused for bars and tick labels

plt.figure(figsize = (8, 6))
ax = recurred_counts.plot(kind = 'bar', color = colors, rot = 0)
ax.set_xticklabels(recurred_counts.index)
ax.tick_params(axis = 'both', labelsize = 15)  # axis-wide setting: apply once, not per bar

for p in ax.patches:
    # Centre the label on the bar using its real width rather than a fixed offset.
    ax.annotate(int(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height() + 1), ha = 'center', va = 'bottom', color = 'black')
plt.xlabel('Recurrence', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20)
plt.ylabel('Number of Occurrences', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20);

In [None]:
# Bar chart of smokers vs. non-smokers, with the count printed above each bar.
smoking_counts = df["Smoking"].value_counts()  # computed once and reused for bars and tick labels

plt.figure(figsize = (8, 6))
ax = smoking_counts.plot(kind = 'bar', color = colors, rot = 0)
ax.set_xticklabels(smoking_counts.index)
ax.tick_params(axis = 'both', labelsize = 15)  # axis-wide setting: apply once, not per bar

for p in ax.patches:
    # Centre the label on the bar using its real width rather than a fixed offset.
    ax.annotate(int(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height() + 1), ha = 'center', va = 'bottom', color = 'black')
plt.xlabel('Does the patient smoke?', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20)
plt.ylabel('Number of Occurrences', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20);

In [None]:
# Bar chart of treatment-response categories, with the count printed above each bar.
response_counts = df["Response"].value_counts()  # computed once and reused for bars and tick labels

plt.figure(figsize = (8, 6))
ax = response_counts.plot(kind = 'bar', color = colors, rot = 90)  # long labels: rotate ticks
ax.set_xticklabels(response_counts.index)
ax.tick_params(axis = 'both', labelsize = 15)  # axis-wide setting: apply once, not per bar

for p in ax.patches:
    # Centre the label on the bar using its real width rather than a fixed offset.
    ax.annotate(int(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height() + 1), ha = 'center', va = 'bottom', color = 'black')
plt.xlabel('Response', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20)
plt.ylabel('Number of Occurrences', weight = "bold", color = "#D71313", fontsize = 14, labelpad = 20);

In [None]:
df['Recurred'].value_counts()  # class balance of the target column

In [None]:
df['Stage'].value_counts()  # number of patients per stage

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Stage', hue='Recurred', data=df, palette='bone')

plt.title('Recurrence Count for Each Stage', fontsize=16, weight='bold')
plt.xlabel('Stage', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Gender', hue='Recurred', data=df, palette='pink')

plt.title('Recurrence Count for Each Gender', fontsize=16, weight='bold')
plt.xlabel('Gender', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Smoking', hue='Recurred', data=df, palette='icefire')

plt.title('Recurrence Count for Smoking Status', fontsize=16, weight='bold')
plt.xlabel('Smoker', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Hx Radiothreapy', hue='Recurred', data=df, palette='flare')

plt.title('Recurrence Count for Radiotherapy history', fontsize=16, weight='bold')
plt.xlabel('Radiotherapy', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Thyroid Function', hue='Recurred', data=df, palette='flare')

plt.title('Recurrence Count for Thyroid Function', fontsize=16, weight='bold')
plt.xlabel('Thyroid Function', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')
plt.xticks(rotation=90, ha='right')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Risk', hue='Recurred', data=df, palette='flare')

plt.title('Recurrence Count for Risk', fontsize=16, weight='bold')
plt.xlabel('Risk', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Adenopathy', hue='Recurred', data=df, palette='flare')

plt.title('Recurrence Count for Adenopathy', fontsize=16, weight='bold')
plt.xlabel('Adenopathy', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Focality', hue='Recurred', data=df, palette='flare')

plt.title('Recurrence Count for Focality', fontsize=16, weight='bold')
plt.xlabel('Focality', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Physical Examination', hue='Recurred', data=df, palette='flare')

plt.title('Recurrence Count for Examination', fontsize=16, weight='bold')
plt.xlabel('Physical Examination Findings', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')
plt.xticks(rotation=20, ha='right')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='Pathology', hue='Recurred', data=df, palette='flare')

plt.title('Recurrence Count for Pathology', fontsize=16, weight='bold')
plt.xlabel('Pathology', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

In [None]:
# Recurrence counts split by treatment response.
plt.figure(figsize=(10, 6))
sns.countplot(x='Response', hue='Recurred', data=df, palette='flare')

# The plotted column is 'Response'; the title and x-label previously said 'Adenopathy'
# (left over from the cell this one was copied from).
plt.title('Recurrence Count for Response', fontsize=16, weight='bold')
plt.xlabel('Response', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Recurred', title_fontsize='12', fontsize='10')

plt.show()

# Transforming and Splitting the data

In [None]:
# Nominal features: one-hot encoded, dropping the first level of each to avoid a redundant column.
onehot_cols = ['Gender', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology']

# Ordered features mapped to their levels, lowest to highest. Keeping each column next to
# its levels prevents the two parallel lists from drifting out of alignment.
ordinal_levels = {
    'Smoking': ['No', 'Yes'],
    'Hx Smoking': ['No', 'Yes'],
    'Hx Radiothreapy': ['No', 'Yes'],
    'Focality': ['Uni-Focal', 'Multi-Focal'],
    'Risk': ['Low', 'Intermediate', 'High'],
    'T': ['T1a', 'T1b', 'T2', 'T3a', 'T3b', 'T4a', 'T4b'],
    'N': ['N0', 'N1a', 'N1b'],
    'M': ['M0', 'M1'],
    # Stages in increasing severity; 'IVB' was previously listed before 'III' and 'IVA',
    # which gave the ordinal codes a non-monotonic meaning.
    'Stage': ['I', 'II', 'III', 'IVA', 'IVB'],
    'Response': ['Excellent', 'Indeterminate', 'Biochemical Incomplete', 'Structural Incomplete'],
}

# Columns not named above are passed through unchanged and appended after the encoded ones.
transformer = ColumnTransformer(
    transformers=[
        ('oh', OneHotEncoder(sparse_output=False, drop='first'), onehot_cols),
        ('oe', OrdinalEncoder(categories=list(ordinal_levels.values())), list(ordinal_levels)),
    ],
    remainder='passthrough',
)

In [None]:
# Separate the feature columns (everything except 'Recurred') from the target column.
X = df.drop("Recurred",axis=1)
y = df['Recurred']

In [None]:
# Encode the target as integers: "No" -> 0, "Yes" -> 1.
# The lookup was previously stored in a variable called `map`, which shadowed the
# builtin map() for every later cell in the kernel.
recurred_map = {"No": 0, "Yes": 1}
y = df['Recurred'].map(recurred_map)

In [None]:
# Fit the encoders on the whole feature frame and encode it in the same step.
X_transformed = transformer.fit_transform(X)

In [None]:
# Column names generated by the fitted transformer, printed for reference.
transformed_feature_names = transformer.get_feature_names_out(input_features=X.columns)
print(transformed_feature_names)

In [None]:
# Hand-written, shorter labels for the encoded columns.
# NOTE(review): presumably listed in the same order as the names printed by the previous
# cell (one-hot columns, then ordinal columns, then the passed-through 'Age') — verify
# whenever the transformer or the data changes, since nothing enforces it.
transformed_cols = ['Gender', 'Thyroid Function_Clinical Hypothyroidism',
 'Thyroid Function_Euthyroid',
 'Thyroid Function_Subclinical Hyperthyroidism',
 'Thyroid Function_Subclinical Hypothyroidism',
 'Physical Examination_Multinodular goiter',
 'Physical Examination_Normal',
 'Physical Examination_Single nodular goiter-left',
 'Physical Examination_Single nodular goiter-right',
 'Adenopathy_Extensive', 'Adenopathy_Left', 'Adenopathy_No',
 'Adenopathy_Posterior', 'Adenopathy_Right',
 'Pathology_Hurthel cell', 'Pathology_Micropapillary',
 'Pathology_Papillary', 'Smoking', 'Hx Smoking',
 'Hx Radiothreapy', 'Focality', 'Risk', 'T', 'N', 'M',
 'Stage', 'Response', 'Age']

In [None]:
len(transformed_cols)  # number of hand-written labels; must equal X_transformed.shape[1] for the next cell

In [None]:
# Wrap the encoded matrix in a DataFrame and attach the 0/1 target as a 'Recurred' column.
transformed_df = pd.DataFrame(data=X_transformed,columns=transformed_cols)
transformed_df['Recurred'] = y
transformed_df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the encoded rows; the remaining 75% becomes the training set.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between splits.
X_train,X_test,y_train,y_test = train_test_split(X_transformed,y,test_size=0.25,random_state=1)

In [None]:
# Split the 25% hold-out in half: one half stays the test set, the other becomes the validation set.
X_test,X_val,y_test,y_val = train_test_split(X_test,y_test,test_size=0.5,random_state=1)

# Model Training & Evaluation

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to test and validation.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)


In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier()  # decision tree with default hyper-parameters

In [None]:
dt.fit(X_train_scaled,y_train)

In [None]:
y_pred_dt = dt.predict(X_test_scaled)  # hard class predictions on the test split

In [None]:
evaluate_model(y_test,y_pred_dt)

In [None]:
print(classification_report(y_test,y_pred_dt))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_dt)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()  # random forest with default hyper-parameters (baseline)

In [None]:
X_train_scaled.shape  # (training rows, encoded features)

In [None]:
rf.fit(X_train_scaled,y_train)

In [None]:
y_pred = rf.predict(X_test_scaled)

In [None]:
# Baseline test accuracy of the untuned forest (accuracy_score is also imported in the first cell).
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [None]:
# Candidate hyper-parameter values for the random forest.
n_estimators = [100, 300, 700, 1000]
criterion = ['gini','entropy','log_loss']
max_depth = [10,20,50,100]
bootstrap=[True,False]
grid = dict(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, bootstrap=bootstrap) # parameter grid handed to GridSearchCV


In [None]:
# Exhaustive 5-fold grid search for the random forest, scored by macro-averaged recall;
# parameter combinations that raise during fitting are scored 0 (error_score=0).
GC_RF = GridSearchCV(estimator=rf, param_grid=grid, n_jobs=-1, cv=5, scoring='recall_macro', error_score=0)
# NOTE(review): the search is fitted on the validation split (X_val_scaled, y_val), not on
# the training data — confirm that tuning on this smaller split is intended.
GC_RF_result = GC_RF.fit(X_val_scaled ,y_val)

In [None]:
rf.set_params(**GC_RF_result.best_params_)  # copy the winning hyper-parameters onto `rf`

In [None]:
rf.fit(X_train_scaled,y_train)  # refit the tuned forest on the training split

In [None]:
y_pred = rf.predict(X_test_scaled)

In [None]:
evaluate_model(y_test,y_pred)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:


from sklearn.metrics import auc

# A ROC curve sweeps the decision threshold, so it needs continuous scores. Hard 0/1
# predictions collapse it to a single operating point; use the positive-class
# probability (column 1 of predict_proba) instead.
rf_scores = rf.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, rf_scores)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('RF Model ROC Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)  # confusion matrix of the tuned random forest

In [None]:
xgb_clsfr = XGBClassifier() # XGBoost classifier with default hyper-parameters
n_estimators = [100, 300, 500, 700,1000] # candidate values for each hyper-parameter
subsample = [0.3,0.5, 0.7, 1.0]
max_depth = [2,4,6, 7, 9]
grid = dict(n_estimators=n_estimators, subsample=subsample, max_depth=max_depth) # parameter grid handed to GridSearchCV
grid_search = GridSearchCV(estimator=xgb_clsfr, param_grid=grid, n_jobs=-1, cv=10, scoring='roc_auc', error_score=0) # 10-fold grid search scored by ROC AUC
# NOTE(review): fitted on the validation split (X_val_scaled, y_val), not the training data — confirm intended.
grid_result = grid_search.fit(X_val_scaled, y_val)
print("Highest ROC AUC is achieved using the parameters : " , ( grid_result.best_params_))

In [None]:
xgb_clsfr.set_params(**grid_result.best_params_)  # copy the winning hyper-parameters onto the model

In [None]:
xgb_clsfr.fit(X_train_scaled,y_train)  # refit the tuned model on the training split

In [None]:
y_pred2 = xgb_clsfr.predict(X_test_scaled)

In [None]:
accuracy_score(y_test,y_pred2)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred2)

In [None]:
print(classification_report(y_test,y_pred2))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers

In [None]:
# Small fully connected network: three L2-regularised ReLU layers (20, 10, 5 units), each
# followed by 10% dropout, and a 2-unit linear output layer.
NN_cls = Sequential([
    Dense(20, activation='relu', name='L1', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.1),
    Dense(10, activation='relu', name='L2', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.1),
    Dense(5, activation='relu', name='L3', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.1),
    Dense(2, activation='linear', name='L4')
])

# The output layer is linear, so the loss is told to treat its outputs as logits
# (from_logits=True); labels are integer class ids (sparse categorical cross-entropy).
NN_cls.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

# Train for 150 epochs on the training split, reporting metrics on the validation split each epoch.
history_bal = NN_cls.fit(
    X_train_scaled, y_train,
    epochs=150, validation_data=(X_val_scaled, y_val))

In [None]:
# Raw two-column network outputs for the test split.
# NOTE(review): `predictions` is not referenced again in the visible notebook; the next cell calls predict() again.
predictions=NN_cls.predict(X_test_scaled)

In [None]:
# Predicted class = index of the larger of the two outputs.
y_pred = np.argmax(NN_cls.predict(X_test_scaled), axis=1)

In [None]:
evaluate_model(y_test,y_pred)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [None]:
n_splits = 5
# Initialize KFold
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

avg = []

# Iterate over folds
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index] 
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_transformed = transformer.fit_transform(X_train)
    X_test_transformed = transformer.transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train_transformed)
    X_test_scaled = scaler.transform(X_test_transformed)

    # Your training and evaluation steps for this fold
    rf.fit(X_train_scaled, y_train)
    preds = rf.predict(X_test_scaled)
    acc = accuracy_score(y_test, preds)
    avg.append(acc)
    
    print(f"Fold {fold + 1}: Accuracy score: {acc:.2f}")

print(f"Average accuracy score : {sum(avg)/len(avg):.2f}")


In [None]:
# Initialize KFold
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

avg = []

# Iterate over folds
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index] 
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_transformed = transformer.fit_transform(X_train)
    X_test_transformed = transformer.transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train_transformed)
    X_test_scaled = scaler.transform(X_test_transformed)

    # Your training and evaluation steps for this fold
    xgb_clsfr.fit(X_train_scaled, y_train)
    preds = xgb_clsfr.predict(X_test_scaled)
    acc = accuracy_score(y_test, preds)
    avg.append(acc)
    
    print(f"Fold {fold + 1}: Accuracy score: {acc:.2f}")

print(f"Average accuracy score : {sum(avg)/len(avg):.2f}")


In [None]:
n_splits = 5
# Initialize KFold
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

avg = []

# Iterate over folds
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index] 
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_transformed = transformer.fit_transform(X_train)
    X_test_transformed = transformer.transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train_transformed)
    X_test_scaled = scaler.transform(X_test_transformed)

    # Your training and evaluation steps for this fold
    NN_cls.fit(X_train_scaled, y_train,epochs=150,verbose=0)
    preds = np.argmax(NN_cls.predict(X_test_scaled),axis=1)
    acc = accuracy_score(y_test, preds)
    avg.append(acc)
    
    print(f"Fold {fold + 1}: Accuracy score: {acc:.2f}")

print(f"Average accuracy score : {sum(avg)/len(avg):.2f}")


In [None]:
# Pairwise correlations between all encoded features and the 'Recurred' target column.
corr_matrix = transformed_df.corr()

# Plot the heatmap, annotating each cell with its coefficient to two decimals
plt.figure(figsize=(20, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
def find_correlated_pairs(data, threshold):
    """Find column pairs whose absolute correlation exceeds ``threshold``.

    Prints one summary line per column that takes part in at least one such
    pair.

    Args:
        data: DataFrame whose pairwise column correlations are computed with
            ``data.corr()``.
        threshold: Cut-off; a pair is reported when the absolute value of its
            correlation is strictly greater than this.

    Returns:
        A tuple ``(pairs, related)``:
        ``pairs`` is a list of ``(column_a, column_b, correlation)`` tuples,
        each unordered pair appearing once, in column order;
        ``related`` maps each column in any pair to the list of columns it is
        paired with.
    """
    from itertools import combinations

    correlation_matrix = data.corr()
    columns = list(correlation_matrix.columns)

    # Visit each unordered pair of columns once (upper triangle of the matrix).
    highly_correlated_pairs = []
    for (i, first), (j, second) in combinations(enumerate(columns), 2):
        value = correlation_matrix.iloc[i, j]
        if abs(value) > threshold:
            highly_correlated_pairs.append((first, second, value))

    # Record the relation in both directions so every column lists all of its partners.
    correlated_features_dict = {}
    for first, second, _ in highly_correlated_pairs:
        correlated_features_dict.setdefault(first, []).append(second)
        correlated_features_dict.setdefault(second, []).append(first)

    # Display correlated features for each unique feature (count shown in bold via ANSI codes)
    for feature, correlated_features in correlated_features_dict.items():
        print(f"{feature} is strongly correlated to \033[1m{len(correlated_features)}\033[0m feature(s): {', '.join(correlated_features)}")

    return highly_correlated_pairs,correlated_features_dict


In [None]:
# Report every pair of columns (features and target) whose absolute correlation is above 0.7.
find_correlated_pairs(transformed_df,0.7)

# Feature Importance & SHAP Plots

In [None]:

# Extract feature importances from the fitted random forest
feature_importances = rf.feature_importances_

# Create a DataFrame with feature names and importances
# NOTE(review): assumes the model was fitted on columns in the same order as transformed_df — verify.
features_df = pd.DataFrame({'Feature': transformed_df.drop('Recurred',axis=1).columns, 'Importance': feature_importances})

# Sort the features by importance
features_df = features_df.sort_values(by='Importance', ascending=False)

# NOTE(review): custom_colors is not used by the plots in this notebook (they use `colors`).
custom_colors = ['blue', 'green', 'red', 'purple', 'orange', 'cyan', 'magenta', 'yellow', 'brown', 'gray']

# Plot the feature importances
plt.figure(figsize=(15, 10))
plt.barh(features_df['Feature'], features_df['Importance'],color=colors)
plt.xlabel('Importance')
plt.title('Random Forest Feature Importances')
plt.show()

In [None]:

# Extract feature importances from the fitted XGBoost model
feature_importances = xgb_clsfr.feature_importances_

# Create a DataFrame with feature names and importances
# NOTE(review): assumes the model was fitted on columns in the same order as transformed_df — verify.
features_df = pd.DataFrame({'Feature': transformed_df.drop('Recurred',axis=1).columns, 'Importance': feature_importances})

# Sort the features by importance
features_df = features_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(15, 10))
plt.barh(features_df['Feature'], features_df['Importance'],color=colors)
plt.xlabel('Importance')
plt.title('XGBoost Feature Importances')
plt.show()

In [None]:
# Absolute correlation of every column with the target, strongest first
# (the target's own entry is included).
corr = transformed_df.corr()['Recurred'].abs().sort_values(ascending=False)

corr.values


In [None]:
correlation_df = pd.DataFrame({'Feature': corr.index, 'Correlation with Target': corr.values})


# Plot each column's absolute correlation with the target
plt.figure(figsize=(15, 10))
plt.barh(correlation_df['Feature'], correlation_df['Correlation with Target'],color=colors)
plt.xlabel('Correlation')
plt.title('Correlation of Features with Target')
plt.show()

In [None]:
# Sanity check: the number of feature names must match the number of columns in the
# test matrix. `feature_names` is defined here so the cell does not depend on a later
# cell having been run first.
feature_names = transformed_df.drop('Recurred',axis=1).columns
print(len(feature_names))  # Length of the feature names
print(X_test_scaled.shape[1])  # Number of features in the test data


In [None]:
import shap

# SHAP summary plot for the random forest, explaining the positive class (index 1).
explainer = shap.TreeExplainer(rf)

shap_values = explainer.shap_values(X_test_scaled)

# The return format for classifiers differs between SHAP releases: either a list with
# one (n_samples, n_features) array per class, or a single array with the classes on
# the last axis. Select the positive class in both cases; indexing an array with [1]
# would pick the second *sample* instead.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    positive_class_shap = shap_values

feature_names = transformed_df.drop('Recurred',axis=1).columns

shap.summary_plot(positive_class_shap, X_test_scaled, feature_names=feature_names,show=False, plot_size=(15, 6))




In [None]:
# SHAP summary plot for the XGBoost model (uses `shap` as imported in the previous cell).
explainer = shap.TreeExplainer(xgb_clsfr)


shap_values = explainer.shap_values(X_test_scaled)


feature_names = transformed_df.drop('Recurred',axis=1).columns


# show=False leaves the figure open instead of calling plt.show() inside SHAP.
shap.summary_plot(shap_values, X_test_scaled, feature_names=feature_names, show=False, plot_size=(15, 6))

# Pipelines & Model Pickle Files

In [None]:
from sklearn.pipeline import Pipeline
# Two-step pipeline: standardise the features, then classify with the `rf` estimator object.
# NOTE(review): there is no encoding step here, so the pipeline presumably expects the
# already-encoded numeric matrix produced by `transformer` — verify.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('estimator', rf)

])

In [None]:
# NOTE(review): X_train / y_train are whatever was last bound to those names in the kernel —
# confirm they hold encoded numeric features before running this cell.
pipe.fit(X_train,y_train)

In [None]:
preds = pipe.predict(X_test)

In [None]:
import os

# Persist the fitted encoder and the fitted scaler+model pipeline for later reuse.
# Create the target folder first so the cell also works on a fresh checkout, and use
# context managers so both files are flushed and closed even if pickling fails.
os.makedirs('models', exist_ok=True)

with open('models/transformer.pkl', 'wb') as transformer_file:
    pickle.dump(transformer, transformer_file)

with open('models/pipe.pkl', 'wb') as pipe_file:
    pickle.dump(pipe, pipe_file)

In [None]:
X.iloc[0].values.shape  # number of raw feature values in a single row

In [None]:
# One hand-written raw patient record used to try out the saved artefacts.
# NOTE(review): the values appear to follow the order of X.columns — confirm.
arr = [27, 'F', 'No', 'No', 'No', 'Euthyroid',
       'Single nodular goiter-left', 'No', 'Micropapillary', 'Uni-Focal',
       'Low', 'T1a', 'N0', 'M0', 'I', 'Indeterminate']

In [None]:
# dtype=object keeps the mixed int/str values as they are; reshape makes it a single row.
arr = np.array(arr,dtype=object).reshape(1,-1)

In [None]:
# One-row frame carrying the same column labels as the feature frame.
test = pd.DataFrame(data=arr,columns=X.columns)

In [None]:
X.columns.shape

In [None]:
X.columns

In [None]:
import json

column_names = X.columns.tolist()

# Save column names to a JSON file
with open('column_names.json', 'w') as json_file:
    json.dump(column_names, json_file)

In [None]:
# Read the column names back from the JSON file written by the previous cell.
with open ('column_names.json','r') as json_file:
    column_names = json.load(json_file)


In [None]:
column_names

In [None]:
# Reload the saved encoder and pipeline. Context managers close the files again
# (the bare open(...) calls left both handles open).
# NOTE: unpickling executes code embedded in the file — only load pickles you created or trust.
with open("models/transformer.pkl", "rb") as transformer_file:
    transformer_1 = pickle.load(transformer_file)

with open("models/pipe.pkl", "rb") as pipe_file:
    pipeline = pickle.load(pipe_file)

In [None]:
# Encode the one-row sample with the reloaded transformer; `test` is rebound to the encoded
# result, so re-running this cell without rebuilding `test` would pass already-encoded data.
test = transformer_1.transform(test)

In [None]:
# NOTE(review): positional [0] works on a NumPy array but is a column lookup on a DataFrame —
# confirm which one X_test is at this point in the kernel.
X_test[0]

In [None]:
df['Response'].value_counts()  # number of patients per treatment-response category