# 06 - Final Pipeline 

Bu doküman, akciğer kanseri risk tahmini için oluşturulan **production-ready Machine Learning Pipeline** mimarisini açıklamaktadır. Pipeline; veri temizleme, feature engineering, encoding, modelleme ve çıktı üretme adımlarını modüler ve yeniden kullanılabilir bir biçimde birleştirmektedir.


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

In [21]:

# CONFIGURATION
# Shared settings used by the split, the model and cross-validation below.

CV_FOLDS = 5        # number of cross-validation folds
TEST_SIZE = 0.2     # fraction of the rows held out as the test set
RANDOM_STATE = 42   # seed handed to the split and to the model


In [22]:
# 1. DATA LOADING
# Read the raw patient CSV; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw/cancer-patient-data-sets.csv')


In [23]:
# 2. FEATURE ENGINEERING FUNCTION

def engineer_features(df):
    """
    Apply all feature-engineering transformations and return a new frame.

    The input is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read below: 'Age', 'Smoking',
        'Passive Smoker', 'Alcohol use', 'Obesity', 'Balanced Diet',
        'Air Pollution', 'Dust Allergy', 'OccuPational Hazards',
        'Genetic Risk', 'chronic Lung Disease', 'Dry Cough', 'Chest Pain',
        'Coughing of Blood', 'Fatigue', 'Weight Loss', 'Shortness of Breath',
        'Wheezing' and 'Swallowing Difficulty'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 18 added columns: 'Age_Group',
        'Environmental_Risk', 'Lifestyle_Risk', 'Genetic_Health_Risk',
        'Symptom_Severity', 'Respiratory_Score', 'Critical_Symptom_Count',
        'Overall_Risk_Score', five interaction columns, three ``*_squared``
        columns, 'Smoking_Level' and 'Pollution_Level'.

    Notes
    -----
    The three binned columns pass ``include_lowest=True`` so that a value
    equal to the lowest bin edge (0) falls into the first bin. Without it
    the bins are open on the left, such a value gets no bin, and the
    integer cast fails. Values outside the bin range (or missing values)
    still have no bin and still make the cast fail.
    """
    out = df.copy()

    # Age groups: (0-25], (25-40], (40-55], (55-100] -> codes 0..3
    out['Age_Group'] = pd.cut(
        out['Age'],
        bins=[0, 25, 40, 55, 100],
        labels=[0, 1, 2, 3],
        include_lowest=True).astype(int)

    # Risk scores: plain averages of the related input columns
    out['Environmental_Risk'] = (
        out['Air Pollution'] +
        out['Dust Allergy'] +
        out['OccuPational Hazards']
    ) / 3

    # 'Balanced Diet' enters as (10 - value), so a higher diet score
    # lowers the lifestyle risk
    out['Lifestyle_Risk'] = (
        out['Smoking'] +
        out['Alcohol use'] +
        out['Obesity'] +
        (10 - out['Balanced Diet'])
    ) / 4

    out['Genetic_Health_Risk'] = (
        out['Genetic Risk'] +
        out['chronic Lung Disease']
    ) / 2

    # Symptom scores
    symptom_cols = ['Chest Pain', 'Coughing of Blood', 'Fatigue', 'Weight Loss',
                    'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty']
    out['Symptom_Severity'] = out[symptom_cols].mean(axis=1)

    out['Respiratory_Score'] = (
        out['Shortness of Breath'] +
        out['Wheezing'] +
        out['Dry Cough'] +
        out['chronic Lung Disease']
    ) / 4

    # Critical symptoms: how many of four symptoms reach the threshold
    critical_threshold = 6
    out['Critical_Symptom_Count'] = (
        (out['Chest Pain'] >= critical_threshold).astype(int) +
        (out['Coughing of Blood'] >= critical_threshold).astype(int) +
        (out['Weight Loss'] >= critical_threshold).astype(int) +
        (out['Shortness of Breath'] >= critical_threshold).astype(int)
    )

    # Overall risk: weighted sum of the four composite scores (weights sum to 1)
    out['Overall_Risk_Score'] = (
        out['Environmental_Risk'] * 0.25 +
        out['Lifestyle_Risk'] * 0.30 +
        out['Genetic_Health_Risk'] * 0.20 +
        out['Symptom_Severity'] * 0.25
    )

    # Interactions: element-wise products of two columns
    out['Smoking_Age_Interaction'] = out['Smoking'] * out['Age']
    out['Genetic_Age_Interaction'] = out['Genetic Risk'] * out['Age']
    out['Smoking_Pollution'] = out['Smoking'] * out['Air Pollution']
    out['Obesity_ChronicLung'] = out['Obesity'] * out['chronic Lung Disease']
    out['PassiveSmoker_Pollution'] = out['Passive Smoker'] * out['Air Pollution']

    # Polynomial features
    for feat in ['Smoking', 'Air Pollution', 'Genetic Risk']:
        out[f'{feat}_squared'] = out[feat] ** 2

    # Binning into three ordinal levels (codes 0..2)
    out['Smoking_Level'] = pd.cut(
        out['Smoking'],
        bins=[0, 2, 5, 10],
        labels=[0, 1, 2],
        include_lowest=True).astype(int)

    out['Pollution_Level'] = pd.cut(
        out['Air Pollution'],
        bins=[0, 3, 6, 10],
        labels=[0, 1, 2],
        include_lowest=True).astype(int)

    return out

print("\nüîß Applying Feature Engineering...")
df_engineered = engineer_features(df)
print(f"‚úÖ Feature Engineering Complete: {df_engineered.shape}")


üîß Applying Feature Engineering...
‚úÖ Feature Engineering Complete: (1000, 44)


In [24]:
# 3. FINAL FEATURE SET SELECTION

# The feature set is chosen based on earlier evaluation results; per the
# original note these are usually the top features from the
# feature-importance analysis.

# Original high-importance features
RAW_FEATURES = [
    'Smoking', 'Genetic Risk', 'Air Pollution', 'Alcohol use',
    'chronic Lung Disease', 'Age', 'Obesity', 'Chest Pain',
    'Coughing of Blood', 'Fatigue', 'Weight Loss', 'Shortness of Breath',
    'Wheezing', 'Passive Smoker', 'OccuPational Hazards',
]

# Engineered features
ENGINEERED_FEATURES = [
    'Overall_Risk_Score', 'Lifestyle_Risk', 'Environmental_Risk',
    'Symptom_Severity', 'Respiratory_Score', 'Genetic_Health_Risk',
    'Smoking_Age_Interaction', 'Genetic_Age_Interaction',
    'Smoking_squared', 'Air Pollution_squared', 'Critical_Symptom_Count',
    'Age_Group', 'Smoking_Level',
]

# Order matters: it fixes the column order of X (raw first, then engineered).
FINAL_FEATURES = RAW_FEATURES + ENGINEERED_FEATURES

print(f"\nüìù Final Feature Set: {len(FINAL_FEATURES)} features")
for rank, name in enumerate(FINAL_FEATURES[:10], start=1):
    print(f"   {rank}. {name}")
print("   ... (showing first 10)")

# Feature matrix and target vector
X = df_engineered.loc[:, FINAL_FEATURES]
y = df_engineered.loc[:, 'Level']

print(f"\n‚úÖ X shape: {X.shape}")
print(f"‚úÖ y distribution:\n{y.value_counts()}")



üìù Final Feature Set: 28 features
   1. Smoking
   2. Genetic Risk
   3. Air Pollution
   4. Alcohol use
   5. chronic Lung Disease
   6. Age
   7. Obesity
   8. Chest Pain
   9. Coughing of Blood
   10. Fatigue
   ... (showing first 10)

‚úÖ X shape: (1000, 28)
‚úÖ y distribution:
Level
High      365
Medium    332
Low       303
Name: count, dtype: int64


In [25]:
# 4. TRAIN-TEST SPLIT

# TEST_SIZE (test fraction) and RANDOM_STATE (seed) come from the
# configuration cell. They are intentionally not re-declared here, so the
# notebook has a single place where they can be changed.
# stratify=y keeps the class proportions of the target in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

print(f"\n‚úÖ Train set: {X_train.shape[0]} samples")
print(f"‚úÖ Test set:  {X_test.shape[0]} samples")


‚úÖ Train set: 800 samples
‚úÖ Test set:  200 samples


In [26]:
# 5. PREPROCESSING

# Learn the scaling statistics from the training partition only, then apply
# the same fitted scaler to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ StandardScaler fitted and applied")

‚úÖ StandardScaler fitted and applied


In [27]:
# 6. MODEL TRAINING (FINAL MODEL)
# Per the original note, the configuration below follows the earlier
# optimization results.

rf_params = dict(
    n_estimators=300,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
final_model = RandomForestClassifier(**rf_params)

# Echo the configuration that is about to be trained
print(f"\nüîß Model: {final_model.__class__.__name__}")
print("‚öôÔ∏è Parameters:")
print(f"   - n_estimators: {final_model.n_estimators}")
print(f"   - max_depth: {final_model.max_depth}")
print(f"   - min_samples_split: {final_model.min_samples_split}")
print(f"   - min_samples_leaf: {final_model.min_samples_leaf}")

# Fit on the scaled training partition
print("\nüöÄ Training final model...")
final_model.fit(X_train_scaled, y_train)
print("‚úÖ Training completed!")



üîß Model: RandomForestClassifier
‚öôÔ∏è Parameters:
   - n_estimators: 300
   - max_depth: 15
   - min_samples_split: 2
   - min_samples_leaf: 1

üöÄ Training final model...
‚úÖ Training completed!


In [28]:
# 7. MODEL EVALUATION

# Predictions on both partitions
y_train_pred = final_model.predict(X_train_scaled)
y_test_pred = final_model.predict(X_test_scaled)

# Accuracy scores
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

# Cross-validation on the training partition.
# CV_FOLDS comes from the configuration cell and is not re-declared here.
# NOTE(review): X_train_scaled was produced by a scaler fitted on the whole
# training partition, so each validation fold was already seen by the scaler.
# To keep scaling inside each fold, cross-validate a scaler+model pipeline on
# the unscaled X_train instead.
cv_scores = cross_val_score(
    final_model, X_train_scaled, y_train,
    cv=CV_FOLDS, scoring='accuracy')

# NOTE(review): the saved output of this cell shows 1.0000 for train, test and
# CV accuracy. Scores this perfect usually deserve a check for duplicated rows
# shared between partitions or for features that encode the label — TODO confirm.
print(f"\n PERFORMANCE METRICS:")
print(f"   Train Accuracy:      {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"   Test Accuracy:       {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"   CV Score (mean):     {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
# "Overfitting" here is simply the train-minus-test accuracy gap
print(f"   Overfitting:         {(train_acc - test_acc):.4f}")


 PERFORMANCE METRICS:
   Train Accuracy:      1.0000 (100.00%)
   Test Accuracy:       1.0000 (100.00%)
   CV Score (mean):     1.0000 ¬± 0.0000
   Overfitting:         0.0000


In [29]:
# Per-class precision / recall / F1 on the test partition
print(f"\n CLASSIFICATION REPORT:")
print("="*80)
print(classification_report(y_test, y_test_pred))

# Confusion matrix, rows and columns both ordered Low -> Medium -> High
class_order = ['Low', 'Medium', 'High']
print(f"\n CONFUSION MATRIX:")
print("="*80)
cm = confusion_matrix(y_test, y_test_pred, labels=class_order)
cm_df = pd.DataFrame(
    cm,
    index=['True: Low', 'True: Medium', 'True: High'],
    columns=['Pred: Low', 'Pred: Medium', 'Pred: High']
)
print(cm_df)

# Per-class accuracy: diagonal entry divided by its row total
print(f"\n PER-CLASS ACCURACY:")
for label, hits, row_total in zip(class_order, cm.diagonal(), cm.sum(axis=1)):
    class_acc = hits / row_total
    print(f"   {label:10s}: {class_acc:.4f} ({class_acc*100:.2f}%)")


 CLASSIFICATION REPORT:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        73
         Low       1.00      1.00      1.00        61
      Medium       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


 CONFUSION MATRIX:
              Pred: Low  Pred: Medium  Pred: High
True: Low            61             0           0
True: Medium          0            66           0
True: High            0             0          73

 PER-CLASS ACCURACY:
   Low       : 1.0000 (100.00%)
   Medium    : 1.0000 (100.00%)
   High      : 1.0000 (100.00%)


In [30]:
# 8. FEATURE IMPORTANCE (FINAL MODEL)

# Pair each feature name with the fitted model's importance value, then rank
# from most to least important.
importance_df = pd.DataFrame({
    'feature': FINAL_FEATURES,
    'importance': final_model.feature_importances_
})
importance_df = importance_df.sort_values(by='importance', ascending=False)

top_features = importance_df.head(10)
print("\n Top 10 Most Important Features:")
print(top_features.to_string(index=False))


 Top 10 Most Important Features:
               feature  importance
      Symptom_Severity    0.212949
        Passive Smoker    0.095341
    Overall_Risk_Score    0.092324
               Obesity    0.087199
     Coughing of Blood    0.085572
              Wheezing    0.047258
        Lifestyle_Risk    0.040275
               Fatigue    0.038828
Critical_Symptom_Count    0.037338
     Respiratory_Score    0.029238


In [31]:
# 9. MODEL PERSISTENCE

# Serialize the fitted model.
with open('final_model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)
print("‚úÖ Model saved: final_model.pkl")

# Serialize the fitted scaler.
with open('final_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print("‚úÖ Scaler saved: final_scaler.pkl")

# Write the feature list as a numbered, human-readable text file.
with open('final_features.txt', 'w') as features_file:
    features_file.write("FINAL FEATURE SET\n")
    features_file.write("="*80 + "\n\n")
    features_file.writelines(
        f"{i}. {feat}\n" for i, feat in enumerate(FINAL_FEATURES, 1)
    )
print("‚úÖ Feature list saved: final_features.txt")

# Collect the pipeline metadata (sizes, scores, seed, training timestamp).
metadata = {
    'model_type': final_model.__class__.__name__,
    'n_features': len(FINAL_FEATURES),
    'train_size': len(X_train),
    'test_size': len(X_test),
    'test_accuracy': test_acc,
    'cv_score_mean': cv_scores.mean(),
    'cv_score_std': cv_scores.std(),
    'random_state': RANDOM_STATE,
    'date_trained': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

# Write the metadata as one "key: value" line per entry.
with open('pipeline_metadata.txt', 'w') as metadata_file:
    metadata_file.write("FINAL PIPELINE METADATA\n")
    metadata_file.write("="*80 + "\n\n")
    metadata_file.writelines(
        f"{key}: {value}\n" for key, value in metadata.items()
    )
print("‚úÖ Metadata saved: pipeline_metadata.txt")


‚úÖ Model saved: final_model.pkl
‚úÖ Scaler saved: final_scaler.pkl
‚úÖ Feature list saved: final_features.txt
‚úÖ Metadata saved: pipeline_metadata.txt


In [32]:
# 10. FINAL SUMMARY

# Assemble the whole report first, then emit it with a single print call.
summary_text = f"""
‚úÖ PIPELINE SUCCESSFULLY BUILT!

MODEL INFORMATION:
Model Type:          {final_model.__class__.__name__}
Number of Features:  {len(FINAL_FEATURES)}
Training Samples:    {len(X_train):,}
Test Samples:        {len(X_test):,}

üìà PERFORMANCE METRICS:
Test Accuracy:       {test_acc:.4f} ({test_acc*100:.2f}%)
CV Score:            {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}
Overfitting:         {(train_acc - test_acc):.4f}

üì¶ SAVED ARTIFACTS:
‚úÖ final_model.pkl
‚úÖ final_scaler.pkl
‚úÖ final_features.txt
‚úÖ pipeline_metadata.txt

üöÄ READY FOR DEPLOYMENT:
The model is ready to be integrated into inference pipeline.
Use inference.py to make predictions on new data.


"""
print(summary_text)

# Closing banner
banner = "="*80
print(banner)
print("PIPELINE BUILD COMPLETED! üéâ")
print(banner)


‚úÖ PIPELINE SUCCESSFULLY BUILT!

MODEL INFORMATION:
Model Type:          RandomForestClassifier
Number of Features:  28
Training Samples:    800
Test Samples:        200

üìà PERFORMANCE METRICS:
Test Accuracy:       1.0000 (100.00%)
CV Score:            1.0000 ¬± 0.0000
Overfitting:         0.0000

üì¶ SAVED ARTIFACTS:
‚úÖ final_model.pkl
‚úÖ final_scaler.pkl
‚úÖ final_features.txt
‚úÖ pipeline_metadata.txt

üöÄ READY FOR DEPLOYMENT:
The model is ready to be integrated into inference pipeline.
Use inference.py to make predictions on new data.



PIPELINE BUILD COMPLETED! üéâ
