**Importing required libraries**

In [None]:
# Standard library
import copy
import os
import pickle

# Basic libraries
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning libraries
from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Google Colab
from google.colab import drive



**Setup for Google Drive interface**

In [None]:
# Mount Google Drive first: without this, paths under /content/drive are plain
# local directories on the Colab VM and nothing is actually written to Drive.
drive.mount('/content/drive')

# Base Drive directory under which the pickle files and results are saved
drive_path = '/content/drive/MyDrive'

**Loading the dataset**

In [None]:
# Fetch the bundled breast-cancer data and assemble features + label into one frame
data = load_breast_cancer()
dataset = pd.DataFrame(data=data.data, columns=data.feature_names).assign(target=data.target)
print(dataset)


     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

**Analysing the data**

In [None]:
# Summarise every column: name, non-null count and dtype
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [None]:
# (rows, columns) of the frame; the column count includes the added 'target' column
dataset.shape

(569, 31)

In [None]:
# Count missing values per column
dataset.isnull().sum()

Unnamed: 0,0
mean radius,0
mean texture,0
mean perimeter,0
mean area,0
mean smoothness,0
mean compactness,0
mean concavity,0
mean concave points,0
mean symmetry,0
mean fractal dimension,0


In [None]:
# Number of samples in each target class (checks class balance)
dataset['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,357
0,212




Malignant--->0



Benign--->1








In [None]:
# Per-class mean of every feature column, one row per target label
dataset.groupby('target').mean()


Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
1,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442


In [None]:
# Split the frame into the label vector and the feature matrix
y = dataset['target']
x = dataset.drop(columns=['target'])
print(x)
print(y)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

**Data Standardization**

In [None]:
# Standardize every feature column with a single fitted scaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before any fold or
# train/test split performed later in the notebook.
feature_frame = dataset.drop(columns=['target'])
scaler = StandardScaler()
scaler.fit(feature_frame)
standardized_data = scaler.transform(feature_frame)
print(standardized_data)

[[ 1.09706398 -2.07333501  1.26993369 ...  2.29607613  2.75062224
   1.93701461]
 [ 1.82982061 -0.35363241  1.68595471 ...  1.0870843  -0.24388967
   0.28118999]
 [ 1.57988811  0.45618695  1.56650313 ...  1.95500035  1.152255
   0.20139121]
 ...
 [ 0.70228425  2.0455738   0.67267578 ...  0.41406869 -1.10454895
  -0.31840916]
 [ 1.83834103  2.33645719  1.98252415 ...  2.28998549  1.91908301
   2.21963528]
 [-1.80840125  1.22179204 -1.81438851 ... -1.74506282 -0.04813821
  -0.75120669]]


In [None]:
# Model input is the standardized matrix; labels come straight from the frame
x, y = standardized_data, dataset['target']
print(x)
print(y)

[[ 1.09706398 -2.07333501  1.26993369 ...  2.29607613  2.75062224
   1.93701461]
 [ 1.82982061 -0.35363241  1.68595471 ...  1.0870843  -0.24388967
   0.28118999]
 [ 1.57988811  0.45618695  1.56650313 ...  1.95500035  1.152255
   0.20139121]
 ...
 [ 0.70228425  2.0455738   0.67267578 ...  0.41406869 -1.10454895
  -0.31840916]
 [ 1.83834103  2.33645719  1.98252415 ...  2.28998549  1.91908301
   2.21963528]
 [-1.80840125  1.22179204 -1.81438851 ... -1.74506282 -0.04813821
  -0.75120669]]
0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int64


**Define Models**

In [None]:
# Candidate classifiers keyed by display name; all share the same fixed seed
models = dict([
    ("Logistic Regression", LogisticRegression(random_state=5)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=5)),
    ("Decision Tree", DecisionTreeClassifier(random_state=5)),
])
print(type(models))


<class 'dict'>


 **Initialize Stratified K-Fold**

In [None]:
# Stratified splitter; shuffling with a fixed random_state keeps the folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)  # 5 folds
# Collects one {"Model", "Fold", "Accuracy"} record per evaluated fold
results=[]

**Perform K-Fold Validation And Training the Model**

In [None]:
# Cross-validate every candidate model, record per-fold accuracy, and pickle
# each fitted estimator.
# NOTE(review): `x` was standardized on the full dataset before these splits, so
# the scaling statistics include each fold's test rows; wrap the scaler and the
# classifier in a Pipeline if strictly leak-free fold scores are required.
pickle_dir = os.path.join(drive_path, "cancer")
os.makedirs(pickle_dir, exist_ok=True)

# Start from empty containers so re-running this cell does not duplicate records
results = []
fold_data = {"models": {}, "accuracies": {}}

for model_name, model in models.items():
    print(f"\nEvaluating model: {model_name}")
    fold_accuracies = []
    best_fold_accuracy = 0
    best_fold_model = None
    best_fold = 0

    # Keep a separate per-fold mapping for each model so that later models do
    # not overwrite the entries of earlier ones
    fold_data["models"][model_name] = {}
    fold_data["accuracies"][model_name] = {}

    for fold, (train_index, test_index) in enumerate(kf.split(x, y), start=1):
        # kf.split yields positional indices: index the array directly and the
        # Series with .iloc so the lookup never depends on the Series' labels
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model on this fold's training split
        model.fit(X_train, y_train)

        # Evaluate the model on this fold's held-out split
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        fold_accuracies.append(acc)

        # `model` is refit in place on every fold, so store an independent
        # snapshot; otherwise every stored entry would alias the same object
        # and end up reflecting only the final fold
        fold_model = copy.deepcopy(model)
        fold_data["models"][model_name][f"Fold {fold}"] = fold_model
        fold_data["accuracies"][model_name][f"Fold {fold}"] = acc

        # Update the best fold for the current model
        if acc > best_fold_accuracy:
            best_fold_accuracy = acc
            best_fold_model = fold_model
            best_fold = fold

        print(f"  Fold {fold}: Accuracy = {acc:.2f}")
        results.append({
            "Model": model_name,
            "Fold": fold,
            "Accuracy": acc
        })

    if fold_accuracies:
        print(f"  Mean accuracy = {np.mean(fold_accuracies):.2f} (best: fold {best_fold})")

    # Persist the estimator exactly as it is left in `models`, i.e. fitted on the
    # final fold's training split. The file holds one estimator, not a per-fold
    # collection; the file name is kept so existing pickles remain loadable.
    pickle_file_path = os.path.join(pickle_dir, f"{model_name}_all_folds.pkl")
    with open(pickle_file_path, "wb") as file:
        pickle.dump(model, file)

print("\nModel evaluation complete and all models saved.")



Evaluating model: Logistic Regression
  Fold 1: Accuracy = 0.98
  Fold 2: Accuracy = 0.98
  Fold 3: Accuracy = 0.98
  Fold 4: Accuracy = 0.96
  Fold 5: Accuracy = 0.99

Evaluating model: Random Forest
  Fold 1: Accuracy = 0.95
  Fold 2: Accuracy = 0.96
  Fold 3: Accuracy = 0.96
  Fold 4: Accuracy = 0.96
  Fold 5: Accuracy = 0.97

Evaluating model: Decision Tree
  Fold 1: Accuracy = 0.93
  Fold 2: Accuracy = 0.96
  Fold 3: Accuracy = 0.91
  Fold 4: Accuracy = 0.90
  Fold 5: Accuracy = 0.92

Model evaluation complete and all models saved.


**Saving the results to a CSV file**

In [None]:
# Reshape the per-fold records into one row per model and write them to CSV.

# Derive the fold columns from the splitter instead of hard-coding five of them
n_folds = kf.get_n_splits()
fold_columns = [f"Fold {fold_number}" for fold_number in range(1, n_folds + 1)]

# Index accuracies by (model, fold) in a single pass. Looking up by the recorded
# fold number (rather than list position) keeps columns correct even if records
# are out of order; for repeated records the most recent one wins.
accuracy_by_model_fold = {
    (record["Model"], record["Fold"]): record["Accuracy"] for record in results
}

# One row per model, in the order the models were defined; a fold with no
# recorded accuracy is left empty (None)
formatted_rows = [
    {
        "Model Name": model_name,
        **{
            f"Fold {fold_number}": accuracy_by_model_fold.get((model_name, fold_number))
            for fold_number in range(1, n_folds + 1)
        },
    }
    for model_name in models
]

# Passing `columns` fixes the column order and keeps the header when there are no rows
formatted_results_df = pd.DataFrame(formatted_rows, columns=["Model Name", *fold_columns])

# Save the DataFrame to a CSV file
csv_file_path = "/content/drive/MyDrive/cancer/formatted_model_evaluation_results.csv"
formatted_results_df.to_csv(csv_file_path, index=False)

print(f"Formatted results saved to {csv_file_path}")
print(formatted_results_df)







Formatted results saved to /content/drive/MyDrive/cancer/formatted_model_evaluation_results.csv
            Model Name    Fold 1    Fold 2    Fold 3    Fold 4    Fold 5
0  Logistic Regression  0.982456  0.982456  0.982456  0.956140  0.991150
1        Random Forest  0.947368  0.964912  0.964912  0.956140  0.973451
2        Decision Tree  0.929825  0.956140  0.912281  0.903509  0.920354


**Evaluation on testing data**

In [None]:
# Hold out a stratified 20% of the rows and measure accuracy on rows the
# evaluated estimator has not been trained on.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=2, stratify=y)
print(x.shape, X_train.shape, X_test.shape)

# Iterate through the models
for model_name, model in models.items():
    print(f"\nUsing {model_name} model...")

    # The estimators in `models` were fitted during cross-validation on rows drawn
    # from the same data as X_test, so scoring them directly would test on rows
    # they have already seen. Fit a fresh, unfitted copy on the training split
    # only; the estimator stored in `models` is left unchanged.
    holdout_model = clone(model).fit(X_train, Y_train)

    # Make predictions on the held-out test data
    X_test_pred = holdout_model.predict(X_test)

    # Accuracy of the test-set predictions (y_true first, then y_pred)
    testing_data_accuracy = accuracy_score(Y_test, X_test_pred)
    print('Accuracy on testing data : ', testing_data_accuracy)

(569, 30) (455, 30) (114, 30)

Using Logistic Regression model...
Accuracy on testing data :  0.9912280701754386

Using Random Forest model...
Accuracy on testing data :  1.0

Using Decision Tree model...
Accuracy on testing data :  0.9912280701754386


**Making a predictive system**

In [None]:
# Predict the class of a single sample with every trained model.

# Raw (unscaled) measurements for one sample, in the feature-column order of `dataset`
input_data = (17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# The models were trained on standardized features, so the sample must go through
# the same fitted scaler before prediction. The scaler was fitted on a DataFrame,
# so pass the sample with the same column names.
feature_columns = dataset.drop(columns=['target']).columns
input_data_scaled = scaler.transform(pd.DataFrame(input_data_reshaped, columns=feature_columns))

# Iterate through the models
for model_name, model in models.items():
    print(f"\nUsing {model_name} model...")

    prediction = model.predict(input_data_scaled)
    print('Output for sampled data:',prediction)



Using Logistic Regression model...
Output for sampled data: [0]

Using Random Forest model...
Output for sampled data: [0]

Using Decision Tree model...
Output for sampled data: [0]


**Loading from pickle files**

In [None]:
# Reload a pickled estimator from disk and score it on a stratified 20% split.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=2, stratify=y)

# Path to the pickle file
pickle_file_path = "/content/drive/MyDrive/cancer/Logistic Regression_all_folds.pkl"

# The file holds a single fitted estimator (not a dict of models/accuracies).
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open(pickle_file_path, "rb") as file:
    loaded_model = pickle.load(file)

# Make predictions on the test split
X_test_pred = loaded_model.predict(X_test)

# Accuracy of the test-set predictions (y_true first, then y_pred).
# NOTE(review): the pickled estimator was fitted during cross-validation on rows
# drawn from the same data as X_test, so this figure is optimistic rather than a
# true held-out score.
testing_data_accuracy = accuracy_score(Y_test, X_test_pred)
print('Accuracy on testing data : ', testing_data_accuracy)


Accuracy on testing data :  0.9912280701754386
