03 - Feature Engineering - Cancer Risk Prediction
=============================================
Yeni özellikler türeterek model performansını iyileştirme işlemleri yapılacak ve yeni özellik (feature) değişkenleri oluşturularak modele dahil edilecektir.

In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [2]:
# Load the raw patient dataset. The file is decoded as latin1 and parsed with
# pandas' Python engine, as configured in the call below.
raw_csv_path = "../data/raw/cancer-patient-data-sets.csv"
df = pd.read_csv(raw_csv_path, encoding="latin1", engine="python")

# Keep the shape as the cell's last expression so the notebook displays it.
df.shape

(1000, 26)

In [None]:
# Baseline model inputs: every column of the raw frame except the row index,
# the patient identifier and the target column ('Level').
non_feature_cols = ['index', 'Patient Id', 'Level']
base_features = df.columns.drop(non_feature_cols, errors='ignore').tolist()
print(f"Baseline Feature Count: {len(base_features)}")

üìù Baseline Feature Count: 23


In [4]:
# Engineer features on an independent copy so the raw frame `df` stays untouched.
df_fe = df.copy(deep=True)

In [9]:
# Discretise the numeric Age column into four ordinal groups coded 0-3
# (Young, Adult, Middle, Senior). pd.cut uses right-closed intervals by
# default, so the bins are (0, 25], (25, 40], (40, 55] and (55, 100].
age_edges = [0, 25, 40, 55, 100]
age_codes = [0, 1, 2, 3]
age_bins = pd.cut(df_fe['Age'], bins=age_edges, labels=age_codes)

# Cast the categorical codes to plain integers. The cast raises if any age
# fell outside the bin edges (or was missing) and therefore became NaN.
df_fe['Age_Group'] = age_bins.astype(int)

In [24]:

# Environmental risk score: the mean of the three environmental exposure
# ratings (air pollution, dust allergy, occupational hazards).
#
# The total is divided by the number of components so this score is an average
# like the Lifestyle_Risk, Genetic_Health_Risk and Respiratory_Score composites.
# As a raw sum it would enter the weighted Overall_Risk_Score at three times
# its nominal weight.
df_fe['Environmental_Risk'] = (
    df_fe['Air Pollution'] +
    df_fe['Dust Allergy'] +
    df_fe['OccuPational Hazards']
) / 3

In [None]:
# Lifestyle risk score: the mean of four lifestyle components.
# The diet rating is inverted first because a better-balanced diet means lower
# risk. NOTE(review): the inversion assumes the diet rating tops out at 10 -
# confirm against the data dictionary.
inverted_diet = 10 - df_fe['Balanced Diet']
lifestyle_total = (
    df_fe['Smoking'] + df_fe['Alcohol use'] + df_fe['Obesity'] + inverted_diet
)
df_fe['Lifestyle_Risk'] = lifestyle_total / 4

In [11]:
# Genetic / health risk score: the mean of the genetic-risk rating and the
# chronic-lung-disease rating.
df_fe['Genetic_Health_Risk'] = (
    df_fe['Genetic Risk'].add(df_fe['chronic Lung Disease']).div(2)
)

In [22]:
# Symptom severity: the row-wise mean of the seven symptom ratings listed here.
symptom_cols = [
    'Chest Pain',
    'Coughing of Blood',
    'Fatigue',
    'Weight Loss',
    'Shortness of Breath',
    'Wheezing',
    'Swallowing Difficulty',
]
df_fe['Symptom_Severity'] = df_fe.loc[:, symptom_cols].mean(axis='columns')

In [14]:
# Respiratory score: the mean of the four breathing-related ratings.
respiratory_cols = [
    'Shortness of Breath',
    'Wheezing',
    'Dry Cough',
    'chronic Lung Disease',
]

# Accumulate with plain `+` (never `+=`) so no source column is modified in place.
respiratory_total = df_fe[respiratory_cols[0]]
for respiratory_col in respiratory_cols[1:]:
    respiratory_total = respiratory_total + df_fe[respiratory_col]

df_fe['Respiratory_Score'] = respiratory_total / 4

In [None]:
# Critical symptom count: how many of the four key symptoms are rated at or
# above the critical threshold for each patient.
critical_threshold = 6
critical_symptoms = [
    'Chest Pain',
    'Coughing of Blood',
    'Weight Loss',
    'Shortness of Breath',
]

# Each comparison becomes a 0/1 integer column; summing them gives the count.
df_fe['Critical_Symptom_Count'] = sum(
    (df_fe[symptom] >= critical_threshold).astype(int)
    for symptom in critical_symptoms
)

In [25]:
# Overall risk score: a weighted combination of the four composite scores.
# The weights add up to 1.0. Dict order fixes the order of the additions.
risk_weights = {
    'Environmental_Risk': 0.25,
    'Lifestyle_Risk': 0.30,
    'Genetic_Health_Risk': 0.20,
    'Symptom_Severity': 0.25,
}
df_fe['Overall_Risk_Score'] = sum(
    df_fe[component] * weight for component, weight in risk_weights.items()
)

In [26]:
# Pairwise interaction features: each new column is the element-wise product
# of two existing columns.
interaction_specs = [
    # (new column, left factor, right factor)
    ('Smoking_Age_Interaction', 'Smoking', 'Age'),                   # smoking x age
    ('Genetic_Age_Interaction', 'Genetic Risk', 'Age'),              # genetic risk x age
    ('Smoking_Pollution', 'Smoking', 'Air Pollution'),               # smoking x air pollution
    ('Obesity_ChronicLung', 'Obesity', 'chronic Lung Disease'),      # obesity x chronic lung disease
    ('PassiveSmoker_Pollution', 'Passive Smoker', 'Air Pollution'),  # passive smoking x air pollution
]

for new_col, left_col, right_col in interaction_specs:
    df_fe[new_col] = df_fe[left_col] * df_fe[right_col]
    print(f"‚úÖ {new_col} created")

‚úÖ Smoking_Age_Interaction created
‚úÖ Genetic_Age_Interaction created
‚úÖ Smoking_Pollution created
‚úÖ Obesity_ChronicLung created
‚úÖ PassiveSmoker_Pollution created


In [None]:
# POLYNOMIAL FEATURES
# Polynomial features add powers of existing predictors (x^2, x^3, ...) and, in
# the general case, products between predictors (interaction terms). The goal
# is to let the model capture non-linear relationships and feature
# interactions. This cell adds only the squared term for a few selected
# predictors.

important_features = ['Smoking', 'Air Pollution', 'Genetic Risk']

for base_col in important_features:
    squared_col = f'{base_col}_squared'
    df_fe[squared_col] = df_fe[base_col].pow(2)
    print(f"‚úÖ {squared_col} created")

‚úÖ Smoking_squared created
‚úÖ Air Pollution_squared created
‚úÖ Genetic Risk_squared created


In [29]:
# BINNING FEATURES: split the value range of a numeric variable into a small
# number of ordered categories ("bins").


def _bin_to_level(values, edges):
    """Map a numeric Series onto ordinal integer levels.

    Parameters
    ----------
    values : pandas.Series
        Numeric ratings to discretise.
    edges : list of numbers
        Ascending bin edges. Bins are right-closed, and the first bin also
        includes its lower edge (``include_lowest=True``), so a rating equal
        to ``edges[0]`` lands in level 0 instead of becoming NaN.

    Returns
    -------
    pandas.Series
        Integer levels ``0 .. len(edges) - 2``, aligned with ``values``.

    Raises
    ------
    ValueError
        If any value is missing or lies outside ``[edges[0], edges[-1]]``.
    """
    levels = pd.cut(
        values,
        bins=edges,
        labels=list(range(len(edges) - 1)),
        include_lowest=True,
    )
    # Fail with an explicit message rather than the opaque NaN-to-int cast error.
    if levels.isna().any():
        raise ValueError(
            f"{values.name!r} has missing values or values outside "
            f"[{edges[0]}, {edges[-1]}]; cannot assign a bin level"
        )
    return levels.astype(int)


# Smoking categories: 0 = Low, 1 = Medium, 2 = High
df_fe['Smoking_Level'] = _bin_to_level(df_fe['Smoking'], [0, 2, 5, 10])
print("‚úÖ Smoking_Level created")

# Air pollution categories
df_fe['Pollution_Level'] = _bin_to_level(df_fe['Air Pollution'], [0, 3, 6, 10])
print("‚úÖ Pollution_Level created")

‚úÖ Smoking_Level created
‚úÖ Pollution_Level created


In [None]:
# Summary of the engineered dataset: which columns are new, and the full list
# of model inputs.
original_cols = set(df.columns)
new_features = [
    name for name in df_fe.columns
    if name != 'Level' and name not in original_cols
]

# All model inputs: everything except the row index, patient id and target.
all_features = df_fe.columns.drop(
    ['index', 'Patient Id', 'Level'], errors='ignore'
).tolist()

print(f"\n‚úÖ Original Features: {len(base_features)}")
print(f"‚úÖ New Features: {len(new_features)}")
print(f"‚úÖ Total Features: {len(all_features)}")

print("\nüìù New Feature List:")
for position, name in enumerate(new_features, start=1):
    print(f"   {position:2d}. {name}")


‚úÖ Original Features: 23
‚úÖ New Features: 18
‚úÖ Total Features: 41

üìù New Feature List:
    1. Age_Group
    2. Lifestyle_Risk
    3. Genetic_Health_Risk
    4. Symptom_Severity
    5. Respiratory_Score
    6. Critical_Symptom_Count
    7. Environmental_Risk
    8. Overall_Risk_Score
    9. Smoking_Age_Interaction
   10. Genetic_Age_Interaction
   11. Smoking_Pollution
   12. Obesity_ChronicLung
   13. PassiveSmoker_Pollution
   14. Smoking_squared
   15. Air Pollution_squared
   16. Genetic Risk_squared
   17. Smoking_Level
   18. Pollution_Level


In [32]:
# MODEL COMPARISON - baseline features vs. feature-engineered features

# Prepare data: both design matrices share the same rows and the same target.
X_baseline = df[base_features]
X_fe = df_fe[all_features]
y = df['Level']

# Train-test split.
# Splitting all three objects in ONE call applies the same stratified row
# assignment to each of them, so both feature sets are evaluated against the
# same y_train / y_test without relying on a repeated random_state.
(
    X_base_train, X_base_test,
    X_fe_train, X_fe_test,
    y_train, y_test,
) = train_test_split(
    X_baseline, X_fe, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling: fit each scaler on the training rows only, then apply it to the
# held-out rows.
scaler_base = StandardScaler()
scaler_fe = StandardScaler()

X_base_train_scaled = scaler_base.fit_transform(X_base_train)
X_base_test_scaled = scaler_base.transform(X_base_test)

X_fe_train_scaled = scaler_fe.fit_transform(X_fe_train)
X_fe_test_scaled = scaler_fe.transform(X_fe_test)

# Train models
print("\nüîß Training Baseline Model...")
model_baseline = LogisticRegression(random_state=42, max_iter=1000)
model_baseline.fit(X_base_train_scaled, y_train)

print("üîß Training Feature Engineered Model...")
model_fe = LogisticRegression(random_state=42, max_iter=1000)
model_fe.fit(X_fe_train_scaled, y_train)

# Evaluate on the training rows and on the held-out rows.
baseline_train_acc = model_baseline.score(X_base_train_scaled, y_train)
baseline_test_acc = model_baseline.score(X_base_test_scaled, y_test)

fe_train_acc = model_fe.score(X_fe_train_scaled, y_train)
fe_test_acc = model_fe.score(X_fe_test_scaled, y_test)

# Cross-validation scores.
# The scaler is part of the cross-validated pipeline and the UNSCALED training
# data is passed in, so each fold fits its own scaler on its own training part.
# Cross-validating the pre-scaled arrays would leak validation-fold statistics
# into preprocessing, because that scaler was fit on the whole training split.
print("\nPerforming Cross-Validation...")
cv_baseline = cross_val_score(
    make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=42, max_iter=1000),
    ),
    X_base_train, y_train, cv=5,
)
cv_fe = cross_val_score(
    make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=42, max_iter=1000),
    ),
    X_fe_train, y_train, cv=5,
)


üîß Training Baseline Model...
üîß Training Feature Engineered Model...

Performing Cross-Validation...


In [33]:
# Headline numbers, computed once and reused in the report.
baseline_cv_mean = cv_baseline.mean()
fe_cv_mean = cv_fe.mean()
test_acc_gain = (fe_test_acc - baseline_test_acc) * 100
cv_score_gain = (fe_cv_mean - baseline_cv_mean) * 100

# The report is assembled line by line. The leading and trailing empty strings
# produce the blank line before and after the report.
report_lines = [
    "",
    "BASELINE MODEL:",
    f"Train Accuracy:     {baseline_train_acc:.4f} ({baseline_train_acc*100:.2f}%)",
    f"Test Accuracy:      {baseline_test_acc:.4f} ({baseline_test_acc*100:.2f}%)",
    f"CV Score (mean):    {baseline_cv_mean:.4f} ¬± {cv_baseline.std():.4f}",
    f"Feature Count:      {len(base_features)}",
    "",
    "FEATURE ENGINEERED MODEL:",
    f"Train Accuracy:     {fe_train_acc:.4f} ({fe_train_acc*100:.2f}%)",
    f"Test Accuracy:      {fe_test_acc:.4f} ({fe_test_acc*100:.2f}%)",
    f"CV Score (mean):    {fe_cv_mean:.4f} ¬± {cv_fe.std():.4f}",
    f"Feature Count:      {len(all_features)}",
    "",
    "IMPROVEMENT:",
    f"Test Accuracy Gain: {test_acc_gain:.2f}%",
    f"CV Score Gain:      {cv_score_gain:.2f}%",
    "",
]
print("\n".join(report_lines))

# Verdict: success only when held-out accuracy strictly improved.
verdict = (
    "‚úÖ Feature Engineering ba≈üarƒ±lƒ±! Model performansƒ± arttƒ±."
    if fe_test_acc > baseline_test_acc
    else "‚ö†Ô∏è Feature Engineering beklenen etkiyi g√∂stermedi. Revizyon gerekebilir."
)
print(verdict)

# Per-class precision / recall / F1 of the feature-engineered model on the
# held-out rows.
rule = "=" * 80
print(f"\n{rule}")
print("DETAILED CLASSIFICATION REPORT (Feature Engineered Model)")
print(rule)
y_pred = model_fe.predict(X_fe_test_scaled)
print(classification_report(y_test, y_pred))


BASELINE MODEL:
Train Accuracy:     1.0000 (100.00%)
Test Accuracy:      1.0000 (100.00%)
CV Score (mean):    1.0000 ¬± 0.0000
Feature Count:      23

FEATURE ENGINEERED MODEL:
Train Accuracy:     1.0000 (100.00%)
Test Accuracy:      1.0000 (100.00%)
CV Score (mean):    1.0000 ¬± 0.0000
Feature Count:      41

IMPROVEMENT:
Test Accuracy Gain: 0.00%
CV Score Gain:      0.00%

‚ö†Ô∏è Feature Engineering beklenen etkiyi g√∂stermedi. Revizyon gerekebilir.

DETAILED CLASSIFICATION REPORT (Feature Engineered Model)
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        73
         Low       1.00      1.00      1.00        61
      Medium       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [41]:

# Persist the engineered dataset (without the pandas index) to the current
# working directory.
df_fe.to_csv('cancer_data_feature_engineered.csv', index=False)
print("‚úÖ Feature engineered data saved: cancer_data_feature_engineered.csv")

# Save feature list: baseline names first, then the engineered ones.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# locale default.
with open('feature_list.txt', 'w', encoding='utf-8') as f:
    f.write("BASELINE FEATURES:\n")
    f.write("="*50 + "\n")
    for feat in base_features:
        f.write(f"- {feat}\n")

    f.write("\n\nNEW FEATURES:\n")
    f.write("="*50 + "\n")
    for feat in new_features:
        f.write(f"- {feat}\n")


‚úÖ Feature engineered data saved: cancer_data_feature_engineered.csv
