In [1]:
# =========================
# Core
# =========================
import numpy as np
import pandas as pd

# =========================
# Visualization
# =========================
import matplotlib.pyplot as plt
import seaborn as sns

# =========================
# Scikit-learn – preprocessing
# =========================
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# =========================
# Scikit-learn – model selection
# =========================
from sklearn.model_selection import train_test_split

# =========================
# Scikit-learn – models
# =========================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

# =========================
# Scikit-learn – evaluation metrics
# =========================
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve)

# =========================
# GPU (RAPIDS – optional)
# =========================
USE_GPU = True
try:
    import cudf
    from cuml.svm import SVC as cuSVC
    from cuml.metrics import roc_auc_score as cuml_roc_auc_score
except Exception as e:
    USE_GPU = False
    gpu_import_error = repr(e)

# טעינת הדאטה
file_path = r"C:\Users\hadar\Downloads\Traffic_Crashes_-_Crashes.csv"
df = pd.read_csv(file_path)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# רשימת עמודות למחיקה
columns_to_drop = [
    'CRASH_RECORD_ID',
    'CRASH_DATE_EST_I',
    'LANE_CNT',
    'REPORT_TYPE',
    'INTERSECTION_RELATED_I',
    'NOT_RIGHT_OF_WAY_I',
    'HIT_AND_RUN_I',
    'DOORING_I',
    'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I',
    'WORK_ZONE_I',
    'WORK_ZONE_TYPE',
    'WORKERS_PRESENT_I',
    'DATE_POLICE_NOTIFIED',
    'SEC_CONTRIBUTORY_CAUSE',
    'STREET_DIRECTION',
    'STREET_NO',
    'STREET_NAME',
    'BEAT_OF_OCCURRENCE',
    'INJURIES_TOTAL',
    'INJURIES_FATAL',
    'INJURIES_INCAPACITATING',
    'INJURIES_NON_INCAPACITATING',
    'INJURIES_REPORTED_NOT_EVIDENT',
    'INJURIES_NO_INDICATION',
    'INJURIES_UNKNOWN',
    'LOCATION',
    'LATITUDE',
    'LONGITUDE',
    'CRASH_TYPE'
]

# מחיקת העמודות
df = df.drop(columns=columns_to_drop)

In [3]:
# המרת עמודת התאריך לפורמט datetime
df['CRASH_DATE'] = pd.to_datetime(df['CRASH_DATE'])

# יצירת עמודת זמן (שעה בלבד)
df['CRASH_TIME'] = df['CRASH_DATE'].dt.time

# יצירת עמודת חודש
df['CRASH_MONTH'] = df['CRASH_DATE'].dt.month

# יצירת עמודת שנה
df['CRASH_YEAR'] = df['CRASH_DATE'].dt.year

# יצירת עמודת יום בשבוע
# pandas: Monday=0 ... Sunday=6
# אנחנו רוצים: Sunday=1 ... Saturday=7
df['CRASH_DAY_OF_WEEK'] = df['CRASH_DATE'].dt.weekday
df['CRASH_DAY_OF_WEEK'] = ((df['CRASH_DAY_OF_WEEK'] + 1) % 7) + 1
df = df[df['CRASH_YEAR'] == 2025].copy()
df = df.drop(columns=['CRASH_DATE','CRASH_YEAR'])

In [4]:
# אם CRASH_TIME הוא datetime.time / או מחרוזת - נהפוך אותו לטיפוס זמן אחיד
# (אם כבר אצלך הוא time, זה פשוט יעבוד)
crash_time = pd.to_datetime(df['CRASH_TIME'].astype(str), format='%H:%M:%S', errors='coerce')

# דקות מאז חצות (שומר דיוק: שעה+דקות+שניות)
df['CRASH_MINUTES_FROM_MIDNIGHT'] = crash_time.dt.hour * 60 + crash_time.dt.minute + (crash_time.dt.second / 60)

# IS_NIGHT: בין 20:00–06:00
# 20:00 = 1200 דקות, 06:00 = 360 דקות
df['IS_NIGHT'] = df['CRASH_MINUTES_FROM_MIDNIGHT'].apply(
    lambda m: 1 if (m >= 1200 or m < 360) else 0)

# IS_WEEKEND: יום ראשון (1) או שבת (7)
df['IS_WEEKEND'] = df['CRASH_DAY_OF_WEEK'].apply(lambda d: 1 if d in [1, 7] else 0)

# IS_RUSH_HOUR: 7–9 כולל ו־16–18 כולל
df['IS_RUSH_HOUR'] = df['CRASH_MINUTES_FROM_MIDNIGHT'].apply(
    lambda m: 1 if (420 <= m <= 540 or 960 <= m <= 1080) else 0)

df = df.drop(columns=['CRASH_MINUTES_FROM_MIDNIGHT'])

In [5]:
# המהירות החוקית הנמוכה ביותר בשיקגו היא 15 mph (סמטאות),
# לכן ערכים נמוכים מ-15 אינם מייצגים תמרור חוקי ונחשבים שגויים
df['POSTED_SPEED_LIMIT'] = pd.to_numeric(df['POSTED_SPEED_LIMIT'], errors='coerce')

# להשאיר רק שורות עם מהירות >= 15 או ערך חסר (NaN)
df = df[df['POSTED_SPEED_LIMIT'].isna() | (df['POSTED_SPEED_LIMIT'] >= 15)].copy()

In [6]:
col = 'TRAFFIC_CONTROL_DEVICE'
threshold = 5000

# 1) איחוד UNKNOWN ו-NaN ל-OTHER
df[col] = df[col].fillna('OTHER').replace('UNKNOWN', 'OTHER')

KEEP = {
    "NO CONTROLS",
    "TRAFFIC SIGNAL",
    "STOP SIGN/FLASHER"}
df = df[df["TRAFFIC_CONTROL_DEVICE"].isin(KEEP)].copy()

df = df[df[col] != 'OTHER'].copy()

In [7]:
col = 'DEVICE_CONDITION'

# ניקוי רווחים ויישור טקסט
df[col] = df[col].astype(str).str.strip()

# 1) איחוד UNKNOWN עם OTHER (וגם NaN אם יש)
df[col] = df[col].replace({'UNKNOWN': 'OTHER'}).fillna('OTHER')

# 2) איחוד קטגוריות "לא מתפקד" לקטגוריה אחת
df[col] = df[col].replace({
    'FUNCTIONING IMPROPERLY': 'NOT_FUNCTIONING',
    'NOT FUNCTIONING': 'NOT_FUNCTIONING',
    'WORN REFLECTIVE MATERIAL': 'NOT_FUNCTIONING',
    'MISSING': 'NOT_FUNCTIONING'
})

# 3) הסרת שורות עם OTHER (כולל UNKNOWN שאוחד)
df = df[df[col] != 'OTHER'].copy()

In [8]:
col = 'WEATHER_CONDITION'

# 1) ניקוי רווחים
df[col] = df[col].astype(str).str.strip()

# 2) הסרת UNKNOWN / OTHER / NaN
df[col] = df[col].replace(['UNKNOWN', 'OTHER'], 'UNKNOWN').fillna('UNKNOWN')
df = df[df[col] != 'UNKNOWN'].copy()

# 3) איחוד קטגוריות בתוך אותה עמודה
df.loc[df[col] == 'CLEAR', col] = 'Clear'
df.loc[df[col] == 'CLOUDY/OVERCAST', col] = 'Cloudy'
df.loc[df[col] == 'RAIN', col] = 'Rain'

df.loc[df[col].isin([
    'SNOW',
    'SLEET/HAIL',
    'FREEZING RAIN/DRIZZLE',
    'BLOWING SNOW'
]), col] = 'Snow/Ice'

df.loc[df[col].isin([
    'FOG/SMOKE/HAZE',
    'SEVERE CROSS WIND GATE',
    'BLOWING SAND, SOIL, DIRT'
]), col] = 'Low Visibility'

In [9]:
# ניקוי רווחים
df['LIGHTING_CONDITION'] = df['LIGHTING_CONDITION'].astype(str).str.strip()

# הסרת UNKNOWN
df = df[df['LIGHTING_CONDITION'] != 'UNKNOWN'].copy()

# איחוד קטגוריות: DAWN + DUSK -> TWILIGHT
df['LIGHTING_CONDITION'] = df['LIGHTING_CONDITION'].replace({
    'DAWN': 'TWILIGHT',
    'DUSK': 'TWILIGHT'})

In [10]:
df = df[~df['TRAFFICWAY_TYPE'].isin([
    'OTHER',
    'UNKNOWN',
    'NOT REPORTED',
    'UNKNOWN INTERSECTION TYPE'
])].copy()
df = df[~df['ROADWAY_SURFACE_COND'].isin(['OTHER', 'UNKNOWN'])].copy()

In [11]:
col = 'ROAD_DEFECT'

# 1) ניקוי רווחים
df[col] = df[col].astype(str).str.strip()

# 2) הסרת UNKNOWN ו-OTHER
df = df[~df[col].isin(['UNKNOWN', 'OTHER'])].copy()

# 3) המרה לבינארי:
# NO DEFECTS -> 0
# כל סוג פגם אחר -> 1
df[col] = df[col].apply(
    lambda x: 0 if x == 'NO DEFECTS' else 1).astype(int)

In [12]:
df = df[~df['PRIM_CONTRIBUTORY_CAUSE'].isin([
    'UNABLE TO DETERMINE',
    'NOT APPLICABLE'])].copy()

In [13]:
# יצירת משתנה יעד בינארי: פגיעה / ללא פגיעה
df['INJURY'] = df['MOST_SEVERE_INJURY'].apply(
    lambda x: 0 if x == 'NO INDICATION OF INJURY' else 1
).astype(int)

# (אופציונלי ומומלץ) הסרת עמודת המקור
df = df.drop(columns=['MOST_SEVERE_INJURY'])

In [14]:
# המרה ל-datetime אם צריך
df['CRASH_TIME'] = pd.to_datetime(df['CRASH_TIME'], format='%H:%M:%S')

# דקות מאז חצות
df['CRASH_MINUTES_FROM_MIDNIGHT'] = (
    df['CRASH_TIME'].dt.hour * 60 +
    df['CRASH_TIME'].dt.minute)
df = df.drop(columns=['CRASH_TIME'])

In [15]:
df['CRASH_TIME_SIN'] = np.sin(2 * np.pi * df['CRASH_MINUTES_FROM_MIDNIGHT'] / 1440)
df['CRASH_TIME_COS'] = np.cos(2 * np.pi * df['CRASH_MINUTES_FROM_MIDNIGHT'] / 1440)

In [16]:
# ------------------------------------------------------------
# 0) עזר: לנקות טקסטים בעמודות קטגוריות (מונע רווחים/None)
# ------------------------------------------------------------
cat_cols_for_rules = [
    'TRAFFIC_CONTROL_DEVICE',
    'LIGHTING_CONDITION',
    'WEATHER_CONDITION'
]

for c in cat_cols_for_rules:
    if c in df.columns:
        df[c] = df[c].astype(str).str.strip()

# ------------------------------------------------------------
# 1) NIGHT + NO TRAFFIC CONTROL
#    (IS_NIGHT = 1 וגם TRAFFIC_CONTROL_DEVICE == 'NO CONTROLS')
# ------------------------------------------------------------
df['NIGHT_AND_NO_CONTROL'] = (
    (df['IS_NIGHT'] == 1) &
    (df['TRAFFIC_CONTROL_DEVICE'].eq('NO CONTROLS'))
).astype(int)

# למה כדאי? בלילה בלי בקרה תנועתית יש פחות "הכוונה" → עלול להגדיל פגיעות.

# ------------------------------------------------------------
# 2) NIGHT + POOR LIGHTING
#    (לילה + DARKNESS)
# ------------------------------------------------------------
df['NIGHT_AND_POOR_LIGHTING'] = (
    (df['IS_NIGHT'] == 1) &
    (df['LIGHTING_CONDITION'].eq('DARKNESS'))
).astype(int)

# למה כדאי? שילוב חושך ותאורה לקויה מצמצם ראות וזמן תגובה.

# ------------------------------------------------------------
# 3) RUSH HOUR + NO TRAFFIC CONTROL
# ------------------------------------------------------------
df['RUSH_AND_NO_CONTROL'] = (
    (df['IS_RUSH_HOUR'] == 1) &
    (df['TRAFFIC_CONTROL_DEVICE'].eq('NO CONTROLS'))
).astype(int)

# למה כדאי? עומס תנועה ללא ויסות מעלה קונפליקטים וסיכון לתאונות.

# ------------------------------------------------------------
# 4) WEEKEND + NIGHT
# ------------------------------------------------------------
df['WEEKEND_NIGHT'] = (
    (df['IS_WEEKEND'] == 1) &
    (df['IS_NIGHT'] == 1)
).astype(int)

# למה כדאי? שעות בילוי/עייפות בסופ"ש בלילה קשורות לעיתים לסיכון מוגבר.

# ------------------------------------------------------------
# 5) MULTI_VEHICLE (יותר מרכב אחד)
# ------------------------------------------------------------
if 'NUM_UNITS' in df.columns:
    df['MULTI_VEHICLE_3PLUS'] = (df['NUM_UNITS'] >= 3).astype(int)

In [17]:
y = df['INJURY']
X = df.drop(columns=['INJURY'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y)

In [18]:
ohe_cols = [
    'TRAFFIC_CONTROL_DEVICE',
    'DEVICE_CONDITION',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'FIRST_CRASH_TYPE',
    'TRAFFICWAY_TYPE',
    'ALIGNMENT',
    'ROADWAY_SURFACE_COND',
    'DAMAGE',
    'PRIM_CONTRIBUTORY_CAUSE'
]

ohe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False)

# fit רק על TRAIN
train_ohe = ohe.fit_transform(X_train[ohe_cols])

# transform על TEST
test_ohe = ohe.transform(X_test[ohe_cols])

new_cols = ohe.get_feature_names_out(ohe_cols)

train_ohe_df = pd.DataFrame(train_ohe, columns=new_cols, index=X_train.index).astype(int)
test_ohe_df  = pd.DataFrame(test_ohe,  columns=new_cols, index=X_test.index).astype(int)

# הסרת עמודות מקוריות וחיבור החדשות
X_train = pd.concat(
    [X_train.drop(columns=ohe_cols), train_ohe_df],
    axis=1
)
X_test = pd.concat(
    [X_test.drop(columns=ohe_cols), test_ohe_df],
    axis=1)

num_cols = [
    'POSTED_SPEED_LIMIT',
    'NUM_UNITS',
    'CRASH_TIME_SIN',
    'CRASH_TIME_COS',
    'CRASH_MINUTES_FROM_MIDNIGHT'
]

scaler = StandardScaler()

# fit רק על TRAIN
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

# transform על TEST
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [19]:
print(y.value_counts(normalize=True))

INJURY
0    0.790229
1    0.209771
Name: proportion, dtype: float64


In [20]:
# ============================================================
# Logistic Regression (Threshold tuned for Recall >= 0.75)
# ============================================================

lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)
# =========================
# 1) Predict probabilities
# =========================
y_proba = lr.predict_proba(X_test)[:, 1]

# =========================
# 2) ROC-AUC (threshold independent)
# =========================
roc_auc = roc_auc_score(y_test, y_proba)

# =========================
# 3) Precision–Recall curve
# =========================
prec, rec, thr = precision_recall_curve(y_test, y_proba)

# Align arrays: thr length = len(prec)-1 = len(rec)-1
prec_thr = prec[1:]
rec_thr  = rec[1:]

# =========================
# 4) Choose Threshold:
#    minimum recall 0.75 + maximize precision
# =========================
target_recall = 0.75
valid = np.where(rec_thr >= target_recall)[0]

print("======================================================")
print("MODEL: Logistic Regression (Recall>=0.75, max Precision)")
print("======================================================")

if len(valid) == 0:
    print(f"No threshold achieves recall >= {target_recall}")
    print(f"ROC-AUC: {roc_auc:.4f}")
else:
    best_idx = valid[np.argmax(prec_thr[valid])]
    best_thr = thr[best_idx]

    y_pred = (y_proba >= best_thr).astype(int)

    print(f"Chosen threshold:        {best_thr:.6f}")
    print(f"Precision at threshold:  {prec_thr[best_idx]:.3f}")
    print(f"Recall at threshold:     {rec_thr[best_idx]:.3f}")
    print(f"ROC-AUC:                 {roc_auc:.4f}\n")

    print("Classification report:")
    print(classification_report(y_test, y_pred, digits=3))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

MODEL: Logistic Regression (Recall>=0.75, max Precision)
Chosen threshold:        0.455637
Precision at threshold:  0.370
Recall at threshold:     0.750
ROC-AUC:                 0.7917

Classification report:
              precision    recall  f1-score   support

           0      0.909     0.662     0.766      8594
           1      0.370     0.750     0.496      2281

    accuracy                          0.680     10875
   macro avg      0.640     0.706     0.631     10875
weighted avg      0.796     0.680     0.709     10875

Confusion matrix:
[[5685 2909]
 [ 570 1711]]


In [21]:
# ============================================================
# Random Forest Classifier (Threshold tuned for Recall >= 0.75)
# ============================================================

# =========================
# 1) Build the model
# =========================
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=10,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

# =========================
# 2) Train
# =========================
rf.fit(X_train, y_train)

# =========================
# 3) Predict probabilities
# =========================
y_proba = rf.predict_proba(X_test)[:, 1]

# =========================
# 4) ROC-AUC (threshold independent)
# =========================
roc_auc = roc_auc_score(y_test, y_proba)

# =========================
# 5) Precision–Recall Curve
# =========================
prec, rec, thr = precision_recall_curve(y_test, y_proba)

# thr is shorter by 1, align accordingly
prec_thr = prec[1:]
rec_thr  = rec[1:]

# =========================
# 6) Choose Threshold:
#    minimum recall 0.75 + maximize precision
# =========================
target_recall = 0.75
valid = np.where(rec_thr >= target_recall)[0]

print("===============================================")
print("MODEL: Random Forest (Recall>=0.75, max Precision)")
print("===============================================")

if len(valid) == 0:
    print(f"No threshold achieves recall >= {target_recall}")
    print(f"ROC-AUC: {roc_auc:.4f}")
else:
    best_idx = valid[np.argmax(prec_thr[valid])]
    best_thr = thr[best_idx]

    y_pred = (y_proba >= best_thr).astype(int)

    # metrics at chosen threshold
    chosen_precision = prec_thr[best_idx]
    chosen_recall    = rec_thr[best_idx]

    # =========================
    # 7) Results
    # =========================
    print(f"Chosen threshold: {best_thr:.6f}")
    print(f"Precision at threshold: {chosen_precision:.3f}")
    print(f"Recall at threshold:    {chosen_recall:.3f}")
    print(f"ROC-AUC:                {roc_auc:.4f}\n")

    print("Classification report:")
    print(classification_report(y_test, y_pred, digits=3))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

MODEL: Random Forest (Recall>=0.75, max Precision)
Chosen threshold: 0.469674
Precision at threshold: 0.369
Recall at threshold:    0.751
ROC-AUC:                0.7894

Classification report:
              precision    recall  f1-score   support

           0      0.909     0.659     0.764      8594
           1      0.369     0.751     0.495      2281

    accuracy                          0.679     10875
   macro avg      0.639     0.705     0.630     10875
weighted avg      0.796     0.679     0.708     10875

Confusion matrix:
[[5667 2927]
 [ 569 1712]]


In [22]:
# ============================================================
# Helper: convert inputs safely to numpy (for CPU fallback)
# ============================================================
def to_numpy(X):
    # pandas
    if hasattr(X, "to_numpy"):
        return X.to_numpy()
    return np.asarray(X)

# ============================================================
# Try GPU (RAPIDS) - fallback to CPU if not available
# ============================================================
USE_GPU = True
try:
    _ = cudf
    _ = cuSVC
    _ = cuml_roc_auc_score
except Exception as e:
    USE_GPU = False
    gpu_import_error = repr(e)

# ============================================================
# 1) Expect you already have:
#    X_train, X_test, y_train, y_test
# ============================================================

if USE_GPU:
    # -------------------------
    # GPU path (cuML)
    # -------------------------
    def to_cudf(X):
        """
        Convert pandas DataFrame / numpy array -> cuDF DataFrame.
        If X is sparse, convert to dense (may be heavy).
        """
        try:
            if sp.issparse(X):
                X = X.toarray()
        except Exception:
            pass

        if hasattr(X, "to_numpy"):  # pandas
            return cudf.DataFrame.from_pandas(X)
        return cudf.DataFrame(X)

    def to_cudf_series(y):
        if hasattr(y, "to_numpy"):
            y = y.to_numpy()
        return cudf.Series(y)

    # Move to GPU
    X_train_gpu = to_cudf(X_train)
    X_test_gpu  = to_cudf(X_test)
    y_train_gpu = to_cudf_series(y_train)
    y_test_gpu  = to_cudf_series(y_test)

    # Ensure float32 to reduce memory
    for col in X_train_gpu.columns:
        if X_train_gpu[col].dtype == np.float64:
            X_train_gpu[col] = X_train_gpu[col].astype(np.float32)
            X_test_gpu[col]  = X_test_gpu[col].astype(np.float32)

    # Build & train GPU SVM
    svm_gpu = cuSVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        class_weight="balanced"
    )

    svm_gpu.fit(X_train_gpu, y_train_gpu)

    # Predict + score
    y_pred_gpu  = svm_gpu.predict(X_test_gpu)
    y_score_gpu = svm_gpu.decision_function(X_test_gpu)

    # Back to CPU for reporting
    y_true  = y_test_gpu.to_numpy()
    y_pred  = y_pred_gpu.to_numpy()
    y_score = y_score_gpu.to_numpy()

    print("\n==============================")
    print("SVM (cuML / GPU)")
    print("==============================")
    print(classification_report(y_true, y_pred))
    print("ROC-AUC (sklearn on CPU, score):", roc_auc_score(y_true, y_score))
    print("ROC-AUC (cuML on GPU, score):", float(cuml_roc_auc_score(y_test_gpu, y_score_gpu)))

else:
    # -------------------------
    # CPU fallback (sklearn)
    # -------------------------
    print("\n==============================")
    print("SVM (sklearn / CPU) – GPU not available")
    print("==============================")
    print("Reason:", gpu_import_error)
    print("Tip: To use GPU you must install RAPIDS (cudf/cuml) with a CUDA-matching environment.\n")

    Xtr = to_numpy(X_train).astype(np.float32, copy=False)
    Xte = to_numpy(X_test).astype(np.float32, copy=False)
    ytr = to_numpy(y_train)
    yte = to_numpy(y_test)

    svm_cpu = SVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        class_weight="balanced"
    )

    svm_cpu.fit(Xtr, ytr)

    y_pred = svm_cpu.predict(Xte)

    # decision_function is best for ROC-AUC in SVM
    y_score = svm_cpu.decision_function(Xte)

    print(classification_report(yte, y_pred))
    print(f"ROC-AUC (score): {roc_auc_score(yte, y_score):.4f}")


SVM (sklearn / CPU) – GPU not available
Reason: NameError("name 'cudf' is not defined")
Tip: To use GPU you must install RAPIDS (cudf/cuml) with a CUDA-matching environment.

              precision    recall  f1-score   support

           0       0.90      0.70      0.79      8594
           1       0.38      0.71      0.50      2281

    accuracy                           0.70     10875
   macro avg       0.64      0.70      0.64     10875
weighted avg       0.79      0.70      0.73     10875

ROC-AUC (score): 0.7789


In [23]:
# =========================
# Gradient Boosting Classifier
# =========================

# 1) Build model
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

# 2) Train
gb.fit(X_train, y_train)

# 3) Predict probabilities
y_proba = gb.predict_proba(X_test)[:, 1]

# 4) ROC-AUC
roc_auc = roc_auc_score(y_test, y_proba)

# 5) Precision–Recall curve
prec, rec, thr = precision_recall_curve(y_test, y_proba)

# thr shorter by 1
prec_thr = prec[1:]
rec_thr  = rec[1:]

# 6) Choose threshold: Recall >= 0.75, maximize Precision
target_recall = 0.75
valid = np.where(rec_thr >= target_recall)[0]

print("======================================================")
print("MODEL: Gradient Boosting (Recall>=0.75, max Precision)")
print("======================================================")

if len(valid) == 0:
    print(f"No threshold achieves recall >= {target_recall}")
    print(f"ROC-AUC: {roc_auc:.4f}")
else:
    best_idx = valid[np.argmax(prec_thr[valid])]
    best_thr = thr[best_idx]

    y_pred = (y_proba >= best_thr).astype(int)

    print(f"Chosen threshold:        {best_thr:.6f}")
    print(f"Precision at threshold:  {prec_thr[best_idx]:.3f}")
    print(f"Recall at threshold:     {rec_thr[best_idx]:.3f}")
    print(f"ROC-AUC:                 {roc_auc:.4f}\n")

    print("Classification report:")
    print(classification_report(y_test, y_pred, digits=3))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

MODEL: Gradient Boosting (Recall>=0.75, max Precision)
Chosen threshold:        0.192498
Precision at threshold:  0.372
Recall at threshold:     0.755
ROC-AUC:                 0.7925

Classification report:
              precision    recall  f1-score   support

           0      0.911     0.661     0.766      8594
           1      0.372     0.755     0.498      2281

    accuracy                          0.681     10875
   macro avg      0.641     0.708     0.632     10875
weighted avg      0.798     0.681     0.710     10875

Confusion matrix:
[[5681 2913]
 [ 558 1723]]


In [24]:
# =========================
# KNN Classifier
# =========================
knn = KNeighborsClassifier(
    n_neighbors=25,
    weights="distance",
    metric="minkowski"
)

knn.fit(X_train, y_train)

# הסתברויות
y_proba = knn.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

# Precision–Recall curve
prec, rec, thr = precision_recall_curve(y_test, y_proba)
prec_thr, rec_thr = prec[1:], rec[1:]

# יעד Recall
target_recall = 0.75
valid = np.where(rec_thr >= target_recall)[0]

if len(valid) == 0:
    print("KNN – No threshold achieves recall ≥ 0.75")
else:
    best_idx = valid[np.argmax(prec_thr[valid])]
    best_thr = thr[best_idx]
    y_pred = (y_proba >= best_thr).astype(int)

    print("\n==============================")
    print("KNN Classifier (Recall ≥ 0.75)")
    print("==============================")
    print("Chosen threshold:", best_thr)
    print(f"Precision: {prec_thr[best_idx]:.3f} | Recall: {rec_thr[best_idx]:.3f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))



KNN Classifier (Recall ≥ 0.75)
Chosen threshold: 0.16024021906998098
Precision: 0.286 | Recall: 0.750
ROC-AUC: 0.6859

Classification report:
              precision    recall  f1-score   support

           0       0.88      0.50      0.64      8594
           1       0.29      0.75      0.41      2281

    accuracy                           0.55     10875
   macro avg       0.58      0.63      0.53     10875
weighted avg       0.76      0.55      0.59     10875

Confusion matrix:
[[4317 4277]
 [ 570 1711]]


In [25]:
df.to_csv("clean_data.csv", index=False)