In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Train and evaluate a Random Forest classifier on the cancer dataset:
# load CSV -> hold out a test split -> 5-fold CV on the training split ->
# fit on the full training split -> report accuracy and F1 on the test split.

# --- Configuration: everything a reader might want to tune -------------------
# NOTE(review): machine-specific absolute path; the notebook only runs where this
# file exists. TODO: point this at a project-relative data directory.
data_path = r"C:\Users\Daghan\OneDrive\Masaüstü\DMP\cancer_dataset.csv"
TARGET_COLUMN = 'diagnosis'
POSITIVE_LABEL = 'M'  # assumed to be the positive class for F1 -- verified below
SEED = 42             # shared by the train/test split and the forest
TEST_SIZE = 0.2
CV_FOLDS = 5

# --- Load the dataset --------------------------------------------------------
df = pd.read_csv(data_path)

# The target is TARGET_COLUMN; every other column is used as a feature.
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# Fail fast (before any training) if the assumed positive label is absent;
# otherwise the same ValueError would only surface later inside f1_score.
if POSITIVE_LABEL not in set(y.unique()):
    raise ValueError(
        f"Positive label {POSITIVE_LABEL!r} not found in column "
        f"{TARGET_COLUMN!r}; labels present: {sorted(map(str, y.unique()))}"
    )

# --- Split the data into training and testing sets ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# --- Standardize the features ------------------------------------------------
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics reach the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Cross-validation on the training set ------------------------------------
# Scaling lives inside the pipeline so it is re-fitted on the training part of
# each fold. Passing the pre-scaled matrix instead would let every validation
# fold influence the scaler's mean/variance (preprocessing leakage).
cv_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=SEED),
)
cv_scores = cross_val_score(
    cv_pipeline, X_train, y_train, cv=CV_FOLDS, scoring='accuracy'
)

# --- Train the final model on the entire training set ------------------------
rf_model = RandomForestClassifier(random_state=SEED)
rf_model.fit(X_train_scaled, y_train)

# --- Evaluate on the held-out test set ---------------------------------------
test_predictions = rf_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, test_predictions)
test_f1_score = f1_score(y_test, test_predictions, pos_label=POSITIVE_LABEL)

# --- Report cross-validation scores, testing accuracy, and testing F1 score --
print("Cross-Validation Scores:", cv_scores)
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Testing F1 Score: {test_f1_score:.4f}")


Cross-Validation Scores: [0.96703297 0.94505495 0.97802198 0.94505495 0.93406593]
Mean Cross-Validation Accuracy: 0.9538
Testing Accuracy: 0.9649
Testing F1 Score: 0.9524
