In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.utils import resample
from joblib import dump

# Step 1: Load and Clean Dataset
df = pd.read_csv("dataset.csv")
df.columns = df.columns.str.lower()
df["gender"] = df["gender"].str.lower().map({"m": 1, "f": 0})
df = df.dropna()

# Step 2: Balance the dataset
df_majority = df[df.lung_cancer == 0]
df_minority = df[df.lung_cancer == 1]
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Step 3: Separate features and labels
X = df_balanced.drop(columns=["lung_cancer"])
y = df_balanced["lung_cancer"]

# Step 4: Scale features
scaler = StandardScaler()
X_scaled_array = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns)

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42, stratify=y)

# Step 6: Train LinearSVC with probability support
base_svc = LinearSVC(random_state=42, max_iter=10000)
svm_model = CalibratedClassifierCV(estimator=base_svc, method='sigmoid', cv=5)  # düzeltildi
svm_model.fit(X_train, y_train)

# Step 7: Evaluation
y_pred_svm = svm_model.predict(X_test)
y_proba_svm = svm_model.predict_proba(X_test)[:, 1]

print("\n=== SVM Classification Report ===")
print(classification_report(y_test, y_pred_svm))
print("SVM ROC AUC:", roc_auc_score(y_test, y_proba_svm))
print("SVM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

# Step 8: Save model
dump(svm_model, "svm_lung_model.joblib")



=== SVM Classification Report ===
              precision    recall  f1-score   support

           0       0.51      0.49      0.50       371
           1       0.51      0.54      0.52       370

    accuracy                           0.51       741
   macro avg       0.51      0.51      0.51       741
weighted avg       0.51      0.51      0.51       741

SVM ROC AUC: 0.5130909885626866
SVM Confusion Matrix:
 [[181 190]
 [172 198]]


['svm_lung_model.joblib']