Dataset source: https://www.kaggle.com/datasets/rabieelkharoua/cancer-prediction-dataset

In [None]:
#import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [9]:
# Read the cancer dataset from the relative path "cancer_data.csv" into a DataFrame.
df = pd.read_csv(filepath_or_buffer="cancer_data.csv")

# Display the first five rows as a quick look at the loaded columns.
df.head(n=5)

Unnamed: 0,age,gender,bmi,smoking,genetic_risk,physical_activity,alcohol_intake,cancer_history,diagnosis
0,58,1,16.085313,0,1,8.146251,4.148219,1,1
1,71,0,30.828784,0,1,9.36163,3.519683,0,0
2,48,1,38.785084,0,2,5.135179,4.728368,0,1
3,34,0,30.040295,0,0,9.502792,2.044636,0,0
4,62,1,35.479721,0,0,5.35689,3.309849,0,1


In [10]:
# Count the NaN entries in every column and print the per-column totals under a heading.
print("Missing values in the dataset:", df.isna().sum(), sep="\n")

Missing values in the dataset:
age                  0
gender               0
bmi                  0
smoking              0
genetic_risk         0
physical_activity    0
alcohol_intake       0
cancer_history       0
diagnosis            0
dtype: int64


In [4]:
# Name the label column and the model-input columns once, then slice the frame with them.
target_column = "diagnosis"
feature_columns = [
    'age', 'gender', 'bmi', 'smoking',
    'genetic_risk', 'physical_activity', 'alcohol_intake', 'cancer_history',
]
X = df[feature_columns]
y = df[target_column]

# Hold out 25% of the rows as the test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)



In [5]:
# Initialize and train a Decision Tree Classifier model using the training data.
# random_state is pinned to the same value used for the train/test split: the tree
# permutes candidate features randomly at each split, so an unseeded model can produce
# a different report on every run.
dt_model = DecisionTreeClassifier(random_state=5)
dt_model.fit(X_train, y_train)

# Make predictions on the held-out test set using the trained model
y_pred_dt = dt_model.predict(X_test)

# Evaluate the predictions against the true test labels and print the report
report_dt = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:")
print(report_dt)

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       231
           1       0.85      0.78      0.81       144

    accuracy                           0.86       375
   macro avg       0.86      0.85      0.85       375
weighted avg       0.86      0.86      0.86       375



Model training using a Random Forest classifier

In [7]:
# Initialize and train a Random Forest Classifier model with 25 estimators using the training data.
# random_state is pinned to the same value used for the train/test split: the forest draws
# bootstrap samples and per-split feature subsets at random, so an unseeded model can
# produce a different report on every run.
rf_model = RandomForestClassifier(n_estimators=25, random_state=5)
rf_model.fit(X_train, y_train)

# Make predictions on the held-out test set using the trained model
y_pred_rf = rf_model.predict(X_test)

# Evaluate the predictions against the true test labels and print the report
report_rf = classification_report(y_test, y_pred_rf)
print("Random Forest Classification Report:")
print(report_rf)

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       231
           1       0.92      0.83      0.88       144

    accuracy                           0.91       375
   macro avg       0.91      0.90      0.90       375
weighted avg       0.91      0.91      0.91       375



### Exploring Various Parameters in Random Forest Classifier

1. Let the parameters be:
   - n_estimators = 50
   - max_features = "log2"
   - criterion = "entropy"
   - bootstrap = False
   - max_depth = 15
   - min_samples_split = 5
   - min_samples_leaf = 3

My learning source:
scikit-learn RandomForestClassifier Parameters

In [8]:
# Train a Random Forest model with explicitly chosen hyperparameters.
# The settings are collected in a dict and unpacked into the constructor so they can be
# read and tweaked in one place.
rf_params = {
    'criterion': 'entropy',
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 3,
    'max_features': 'log2',
    'bootstrap': False,
    'n_estimators': 50,
    # Pinned to the same value used for the train/test split. Even with bootstrap=False,
    # max_features='log2' makes each split consider a random subset of features, so an
    # unseeded forest can produce a different report on every run.
    'random_state': 5,
}
rf_model_custom = RandomForestClassifier(**rf_params)
rf_model_custom.fit(X_train, y_train)

# Make predictions on the held-out test set using the trained model
y_pred_rf_custom = rf_model_custom.predict(X_test)

# Evaluate the predictions against the true test labels and print the report
report_rf_custom = classification_report(y_test, y_pred_rf_custom)
print("Random Forest Classification Report with Custom Parameters:")
print(report_rf_custom)

Random Forest Classification Report with Custom Parameters:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       231
           1       0.95      0.87      0.91       144

    accuracy                           0.93       375
   macro avg       0.94      0.92      0.93       375
weighted avg       0.93      0.93      0.93       375

