In [1]:
import pathlib

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import base
from sklearn import compose, dummy, impute, metrics, neighbors
from sklearn import model_selection, pipeline, preprocessing


# Directory holding the competition CSV files that this notebook reads.
DATA_DIR = pathlib.Path("/kaggle/input/rainfall-probability-cs-209-spring-2026")
# Seeded (42) NumPy random-state object passed to estimators that take `random_state`.
RANDOM_STATE = np.random.RandomState(42)


# Load the Data

In [2]:
# Name of the target column inside train.csv.
label_name = "rainfall"

# Read the training table, using the `id` column as the row index.
train_df = pd.read_csv(DATA_DIR / "train.csv", index_col="id")

# Separate the feature columns from the label column.
train_features_df = train_df.drop(columns=label_name)
train_labels = train_df[label_name]

In [3]:
# Read the test-set features, again using `id` as the row index.
test_features_df = pd.read_csv(DATA_DIR / "test.csv", index_col="id")

# Prepare the Data for ML

In [4]:
# Categorical branch: fill gaps with the most frequent value, then encode the
# declared categories 1..365 as ordinal codes. Values outside that range raise
# an error because of handle_unknown="error".
# NOTE(review): the encoded column is not rescaled afterwards, unlike the
# numerical branch — confirm this is intended for distance-based models.
categorical_features_preprocessing = pipeline.Pipeline(
    steps=[
        ("simple_imputer", impute.SimpleImputer(strategy="most_frequent")),
        (
            "ordinal_encoder",
            preprocessing.OrdinalEncoder(
                categories=[range(1, 366)],
                handle_unknown="error",
            ),
        ),
    ],
    memory=None,
    verbose=False,
)


# Numerical branch: replace missing values with the column mean, then
# standardize each column (centering and scaling both enabled).
numerical_features_preprocessing = pipeline.Pipeline(
    steps=[
        ("simple_imputer", impute.SimpleImputer(strategy="mean")),
        (
            "standard_scaler",
            preprocessing.StandardScaler(with_mean=True, with_std=True),
        ),
    ],
    memory=None,
    verbose=False,
)


# Route each raw column to its preprocessing branch; columns not listed here
# are dropped. The transformer is configured to return pandas DataFrames and
# to keep the original column names (no branch-name prefixes).
#
# `force_int_remainder_cols` is deliberately not passed: it only affects how
# remainder columns are recorded in `transformers_`, which is irrelevant with
# named columns and remainder="drop", and the parameter is deprecated in
# recent scikit-learn releases.
features_preprocessing = compose.ColumnTransformer(
    transformers=[
        (
            "categorical_features",
            categorical_features_preprocessing,
            ["day"],
        ),
        (
            "numerical_features",
            numerical_features_preprocessing,
            [
                "pressure",
                "maxtemp",
                # NOTE(review): spelling presumably mirrors the CSV header —
                # verify against the data before "correcting" it.
                "temparature",
                "mintemp",
                "dewpoint",
                "humidity",
                "cloud",
                "sunshine",
                "winddirection",
                "windspeed",
            ],
        ),
    ],
    remainder="drop",
    n_jobs=2,
    verbose=False,
    verbose_feature_names_out=False,
).set_output(transform="pandas")


# Select and Train a Model

## DummyClassifier

In [5]:
# Baseline model: shared preprocessing recipe followed by a classifier that
# ignores the features and predicts from the class prior.
dummy_classifier_pipeline = pipeline.Pipeline(
    steps=[
        # Use an unfitted copy of the preprocessing recipe. A Pipeline with
        # caching disabled fits its steps in place, so embedding the shared
        # `features_preprocessing` object directly would let any other
        # pipeline built from it overwrite this pipeline's fitted state.
        ("feature_preprocessing", base.clone(features_preprocessing)),
        (
            "dummy_classifier",
            dummy.DummyClassifier(
                strategy="prior",
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)


In [6]:
dummy_classifier_pipeline

In [7]:
# Fit the preprocessing steps and the baseline classifier on the training data.
_ = dummy_classifier_pipeline.fit(X=train_features_df, y=train_labels)

In [8]:
# In-sample hard labels and class probabilities from the baseline pipeline.
dummy_classifier_predicted_train_labels = dummy_classifier_pipeline.predict(
    train_features_df
)
dummy_classifier_predicted_train_probas = dummy_classifier_pipeline.predict_proba(
    train_features_df
)

## KNeighborsClassifier

The `neighbors.KNeighborsClassifier` class from **Scikit-Learn** implements the **k-Nearest Neighbors (k-NN) algorithm** for classification tasks. The classifier predicts the class of a new sample based on the classes of its **k nearest neighbors** in the training data.

1. You provide a **training dataset** with features `X_train` and labels `y_train`.
2. For a new input, the algorithm computes the **distance** (e.g., Euclidean) to all training points.
3. It selects the **k closest points** (neighbors).
4. The predicted class is determined by **majority vote** among these neighbors.

**Key parameters:**

* `n_neighbors` → number of neighbors to consider (k)
* `weights` → how votes are counted (default: `uniform` = all equal)
* `metric` → distance metric (default: `minkowski`, which with the default `p=2` is the Euclidean distance; `p=1` gives the Manhattan distance)


In [9]:
neighbors.KNeighborsClassifier?

In [10]:
# k-NN model: shared preprocessing recipe followed by a 5-nearest-neighbours
# classifier. The step name "kneighbors_classifier" is what the tuning grid
# uses as the prefix for its hyper-parameter names.
kneighbors_classifier_pipeline = pipeline.Pipeline(
    steps=[
        # Use an unfitted copy of the preprocessing recipe. A Pipeline with
        # caching disabled fits its steps in place, so embedding the shared
        # `features_preprocessing` object directly would let any other
        # pipeline built from it overwrite this pipeline's fitted state.
        ("feature_preprocessing", base.clone(features_preprocessing)),
        (
            "kneighbors_classifier",
            neighbors.KNeighborsClassifier(
                n_neighbors=5,
            ),
        ),
    ]
)


In [11]:
kneighbors_classifier_pipeline

In [12]:
# Fit the preprocessing steps and the k-NN classifier on the training data.
_ = kneighbors_classifier_pipeline.fit(X=train_features_df, y=train_labels)

In [13]:
# In-sample hard labels and class probabilities from the k-NN pipeline.
kneighbors_classifier_predicted_train_labels = kneighbors_classifier_pipeline.predict(
    train_features_df
)
kneighbors_classifier_predicted_train_probas = kneighbors_classifier_pipeline.predict_proba(
    train_features_df
)

# Performance Metrics

## Cross-validation with time series data

`sklearn.model_selection.TimeSeriesSplit` is a cross-validation splitter designed for **time-ordered data**.

Instead of randomly shuffling, it:

* **keeps samples in chronological order**
* repeatedly creates splits where the **training set is earlier in time** and the **test set is later in time**

So each fold looks like:

* Train: `[past data]`
* Test:  `[future data]`

This avoids “peeking into the future,” which would cause **data leakage** in forecasting or any time-dependent prediction task.


In [14]:
model_selection.TimeSeriesSplit?

In [15]:
# Two time-ordered folds, each validated on 730 rows (2 * 365).
# NOTE(review): presumably two years of daily records per fold, with rows
# stored in chronological order — confirm against the data.
time_series_split = model_selection.TimeSeriesSplit(n_splits=2, test_size=2 * 365)


## Accuracy

`sklearn.metrics.accuracy_score` computes **classification accuracy**, i.e., the **fraction of predictions that match the true labels**.

$$\text{accuracy}=\frac{\#\text{correct predictions}}{\#\text{total predictions}}$$

* Works for **binary**, **multiclass**, and **multilabel** classification.
* You pass it `y_true` (ground truth labels) and `y_pred` (predicted labels).
* Optionally, you can use `normalize=False` to get the **count** of correct predictions instead of the fraction.


In [16]:
metrics.accuracy_score?

In [17]:
# Accuracy on the rows the baseline was fitted on.
_train_score = metrics.accuracy_score(train_labels, dummy_classifier_predicted_train_labels)
print(f"DummyClassifier Train Accuracy Score: {_train_score}")

# Accuracy averaged over the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="accuracy",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV Accuracy Score: {_cv_scores.mean()}")

DummyClassifier Train Accuracy Score: 0.7534246575342466
DummyClassifier CV Accuracy Score: 0.7595890410958904


In [18]:
# Accuracy on the rows the k-NN model was fitted on.
_train_score = metrics.accuracy_score(train_labels, kneighbors_classifier_predicted_train_labels)
print(f"KNeighborsClassifier Train Accuracy Score: {_train_score}")

# Accuracy averaged over the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="accuracy",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV Accuracy Score: {_cv_scores.mean()}")


KNeighborsClassifier Train Accuracy Score: 0.8831050228310502
KNeighborsClassifier CV Accuracy Score: 0.8212328767123287


## Precision

`sklearn.metrics.precision_score` computes **precision** for a classification model:

$$ \text{Precision} = \frac{\# \text{True Positives}}{\# \text{True Positives} + \# \text{False Positives}} $$

In plain terms: **of all the examples your model predicted as positive, what fraction were actually positive?**

### Notes

* Works for **binary**, **multiclass**, and **multilabel** classification.
* You can control how it averages across classes using `average` kwarg.
* For binary classification, you can choose which class counts as the “positive” one using `pos_label`.


In [19]:
metrics.precision_score?

In [20]:
# In-sample precision of the baseline.
_train_score = metrics.precision_score(train_labels, dummy_classifier_predicted_train_labels)
print(f"DummyClassifier Train Precision Score: {_train_score}")

# Mean precision across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="precision",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV Precision Score: {_cv_scores.mean()}")

DummyClassifier Train Precision Score: 0.7534246575342466
DummyClassifier CV Precision Score: 0.7595890410958904


In [21]:
# In-sample precision of the k-NN model.
_train_score = metrics.precision_score(train_labels, kneighbors_classifier_predicted_train_labels)
print(f"KNeighborsClassifier Train Precision Score: {_train_score}")

# Mean precision across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="precision",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV Precision Score: {_cv_scores.mean()}")


KNeighborsClassifier Train Precision Score: 0.8928974069898534
KNeighborsClassifier CV Precision Score: 0.8460090010725702


## Recall

`metrics.recall_score` from **scikit-learn** computes **recall**, a classification metric that measures how well a model identifies *actual positive* cases.

**What it measures:**
$$\text{Recall} = \frac{\# \text{True Positives}}{\# \text{True Positives} + \# \text{False Negatives}}$$

**In plain terms:**
Out of all the real positives, recall tells you **what fraction the model correctly found**.

**Why it matters:**
Recall is especially important when **missing a positive case is costly** (e.g., disease detection, fraud detection).

For multiclass problems, the `average` parameter controls how recall is aggregated across classes.


In [22]:
metrics.recall_score?

In [23]:
# In-sample recall of the baseline.
_train_score = metrics.recall_score(train_labels, dummy_classifier_predicted_train_labels)
print(f"DummyClassifier Train Recall Score: {_train_score}")

# Mean recall across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="recall",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV Recall Score: {_cv_scores.mean()}")

DummyClassifier Train Recall Score: 1.0
DummyClassifier CV Recall Score: 1.0


In [24]:
# In-sample recall of the k-NN model.
_train_score = metrics.recall_score(train_labels, kneighbors_classifier_predicted_train_labels)
print(f"KNeighborsClassifier Train Recall Score: {_train_score}")

# Mean recall across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="recall",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV Recall Score: {_cv_scores.mean()}")


KNeighborsClassifier Train Recall Score: 0.96
KNeighborsClassifier CV Recall Score: 0.9351443405617473


## F1-Score

`sklearn.metrics.f1_score` computes the **F1 score**, which is the **harmonic mean of precision and recall**:

$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

It’s commonly used for **classification**, especially when you care about balancing **false positives** and **false negatives** (e.g., imbalanced datasets).

* For **binary classification**, it returns a single F1 score for the positive class by default.
* For **multiclass/multilabel**, you can control how scores are combined using the `average` parameter.


In [25]:
metrics.f1_score?

In [26]:
# In-sample F1 of the baseline.
_train_score = metrics.f1_score(train_labels, dummy_classifier_predicted_train_labels)
print(f"DummyClassifier Train F1 Score: {_train_score}")

# Mean F1 across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="f1",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV F1 Score: {_cv_scores.mean()}")

DummyClassifier Train F1 Score: 0.859375
DummyClassifier CV F1 Score: 0.8633211869070738


In [27]:
# In-sample F1 of the k-NN model.
_train_score = metrics.f1_score(train_labels, kneighbors_classifier_predicted_train_labels)
print(f"KNeighborsClassifier Train F1 Score: {_train_score}")

# Mean F1 across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="f1",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV F1 Score: {_cv_scores.mean()}")


KNeighborsClassifier Train F1 Score: 0.9252336448598131
KNeighborsClassifier CV F1 Score: 0.8882065955027328


## Balanced Accuracy

`sklearn.metrics.balanced_accuracy_score` computes **balanced accuracy**, which is the **average recall across classes**.

* For **binary classification**, it’s:
  $$\frac{\text{Recall}_{positive} + \text{Recall}_{negative}}{2}$$
  (i.e., average of sensitivity and specificity)

* For **multi-class**, it averages the **per-class recall** over all classes.

It’s especially useful when your dataset is **imbalanced**, because it gives each class equal weight (unlike plain accuracy, which can look high just by predicting the majority class).


In [28]:
metrics.balanced_accuracy_score?

In [29]:
# In-sample balanced accuracy of the baseline.
_train_score = metrics.balanced_accuracy_score(train_labels, dummy_classifier_predicted_train_labels)
print(f"DummyClassifier Train Balanced Accuracy Score: {_train_score}")

# Mean balanced accuracy across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="balanced_accuracy",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV Balanced Accuracy Score: {_cv_scores.mean()}")

DummyClassifier Train Balanced Accuracy Score: 0.5
DummyClassifier CV Balanced Accuracy Score: 0.5


In [30]:
# In-sample balanced accuracy of the k-NN model.
_train_score = metrics.balanced_accuracy_score(train_labels, kneighbors_classifier_predicted_train_labels)
print(f"KNeighborsClassifier Train Balanced Accuracy Score: {_train_score}")

# Mean balanced accuracy across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="balanced_accuracy",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV Balanced Accuracy Score: {_cv_scores.mean()}")


KNeighborsClassifier Train Balanced Accuracy Score: 0.804074074074074
KNeighborsClassifier CV Balanced Accuracy Score: 0.6992989341444509


## Classification Report

`sklearn.metrics.classification_report` generates a **text (or dict) summary of a classifier’s performance** by computing common classification metrics.

For each class, it reports:

* **Precision:** the fraction of samples predicted as this class that truly belong to it
* **Recall:** the fraction of samples truly in this class that were predicted as this class
* **F1-score:** harmonic mean of precision and recall
* **Support:** number of true samples in that class

It also includes overall averages such as:

* **Accuracy**
* **Macro average:** unweighted mean across classes
* **Weighted average:** mean weighted by class frequency

It’s a quick way to evaluate how well your model performs **per class**, especially for **imbalanced datasets**.


In [31]:
metrics.classification_report?

In [32]:
# Per-class summary for the baseline's in-sample predictions.
# zero_division=0.0 makes any metric with a zero denominator report 0.0.
_classification_report = metrics.classification_report(
    train_labels, dummy_classifier_predicted_train_labels, zero_division=0.0
)

print(_classification_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       540
           1       0.75      1.00      0.86      1650

    accuracy                           0.75      2190
   macro avg       0.38      0.50      0.43      2190
weighted avg       0.57      0.75      0.65      2190



In [33]:
# Per-class summary for the k-NN model's in-sample predictions.
_classification_report = metrics.classification_report(
    train_labels, kneighbors_classifier_predicted_train_labels
)

print(_classification_report)

              precision    recall  f1-score   support

           0       0.84      0.65      0.73       540
           1       0.89      0.96      0.93      1650

    accuracy                           0.88      2190
   macro avg       0.87      0.80      0.83      2190
weighted avg       0.88      0.88      0.88      2190



## Matthews Correlation Coefficient

`sklearn.metrics.matthews_corrcoef` computes the **Matthews Correlation Coefficient (MCC)**, a **single-number score for classification** that measures how well your predictions match the true labels.

* It uses **all four confusion-matrix counts**: **TP, TN, FP, FN**
* It’s especially useful for **imbalanced datasets** (unlike accuracy)

**Range:**

* **+1** → perfect prediction
* **0** → no better than random guessing
* **−1** → total disagreement (always wrong)

It works for **binary** and **multiclass** classification.


In [34]:
metrics.matthews_corrcoef?

In [35]:
# In-sample Matthews correlation coefficient of the baseline.
_train_score = metrics.matthews_corrcoef(train_labels, dummy_classifier_predicted_train_labels)
print(f"DummyClassifier Train Matthews Correlation Coefficient: {_train_score}")

# Mean coefficient across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="matthews_corrcoef",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV Matthews Correlation Coefficient: {_cv_scores.mean()}")

DummyClassifier Train Matthews Correlation Coefficient: 0.0
DummyClassifier CV Matthews Correlation Coefficient: 0.0


In [36]:
# In-sample Matthews correlation coefficient of the k-NN model.
_train_score = metrics.matthews_corrcoef(train_labels, kneighbors_classifier_predicted_train_labels)
print(f"KNeighborsClassifier Train Matthews Correlation Coefficient: {_train_score}")

# Mean coefficient across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="matthews_corrcoef",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV Matthews Correlation Coefficient: {_cv_scores.mean()}")


KNeighborsClassifier Train Matthews Correlation Coefficient: 0.6682281510173902
KNeighborsClassifier CV Matthews Correlation Coefficient: 0.4631151142519436


## Brier Score Loss

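`sklearn.metrics.brier_score_loss` measures the **accuracy of probabilistic predictions** — how close the predicted probabilities are to the actual outcomes — for a **binary classification** task. A low score requires probabilities that are both **well-calibrated** and confident when they are right.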

* It takes **true labels** (0/1) and **predicted probabilities** (for the positive class).
* It computes the **mean squared error** between them:

$$\text{Brier} = \frac{1}{n}\sum_{i=1}^n (p_i - y_i)^2$$

### Interpretation

* **0.0 is perfect** (probabilities match reality exactly)
* **Higher is worse**

It penalizes both:

  * wrong predictions
  * *overconfident* wrong probabilities

So it’s a great metric when you care not just about accuracy, but about **probability quality / calibration**.


In [37]:
metrics.brier_score_loss?

In [38]:
# In-sample Brier loss, computed from column 1 of the predicted probabilities.
_train_score = metrics.brier_score_loss(train_labels, dummy_classifier_predicted_train_probas[:, 1])
print(f"DummyClassifier Train Brier Score Loss: {_train_score}")

# The scorer returns the negated loss, so the sign is flipped back when printing.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="neg_brier_score",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV Brier Score Loss: {-1.0 * _cv_scores.mean()}")

DummyClassifier Train Brier Score Loss: 0.18577594295364983
DummyClassifier CV Brier Score Loss: 0.18285818164758866


In [39]:
# In-sample Brier loss, computed from column 1 of the predicted probabilities.
_train_score = metrics.brier_score_loss(train_labels, kneighbors_classifier_predicted_train_probas[:, 1])
print(f"KNeighborsClassifier Train Brier Score: {_train_score}")

# The scorer returns the negated loss, so the sign is flipped back when printing.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="neg_brier_score",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV Brier Score: {-1.0 * _cv_scores.mean()}")


KNeighborsClassifier Train Brier Score: 0.0844200913242009
KNeighborsClassifier CV Brier Score: 0.1359178082191781


## ROC AUC Score

`sklearn.metrics.roc_auc_score` computes the **AUC (Area Under the ROC Curve)** for a model.

In plain terms: it measures **how well your model ranks positive examples higher than negative ones**, across *all possible classification thresholds*.

* **AUC = 1.0** → perfect ranking
* **AUC = 0.5** → random guessing
* **AUC < 0.5** → systematically wrong ranking (often flipped labels)

It works best when you pass **probabilities or decision scores**, not hard class predictions.


In [40]:
metrics.roc_auc_score?

In [41]:
# In-sample ROC AUC, computed from column 1 of the predicted probabilities.
_train_score = metrics.roc_auc_score(train_labels, dummy_classifier_predicted_train_probas[:, 1])
print(f"DummyClassifier Train ROC AUC Score: {_train_score}")

# Mean ROC AUC across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=dummy_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="roc_auc",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"DummyClassifier CV ROC AUC Score: {_cv_scores.mean()}")

DummyClassifier Train ROC AUC Score: 0.5
DummyClassifier CV ROC AUC Score: 0.5


In [42]:
# In-sample ROC AUC, computed from column 1 of the predicted probabilities.
_train_score = metrics.roc_auc_score(train_labels, kneighbors_classifier_predicted_train_probas[:, 1])
print(f"KNeighborsClassifier Train ROC AUC Score: {_train_score}")

# Mean ROC AUC across the time-ordered validation folds.
_cv_scores = model_selection.cross_val_score(
    estimator=kneighbors_classifier_pipeline,
    X=train_features_df,
    y=train_labels,
    scoring="roc_auc",
    cv=time_series_split,
    n_jobs=-1,
)
print(f"KNeighborsClassifier CV ROC AUC Score: {_cv_scores.mean()}")


KNeighborsClassifier Train ROC AUC Score: 0.936145342312009
KNeighborsClassifier CV ROC AUC Score: 0.8044950045608692


# Fine-tune your Model

In [43]:
model_selection.GridSearchCV?

In [44]:
# Candidate hyper-parameters; each key is "<pipeline step name>__<parameter>".
# 5 neighbour counts x 2 weighting schemes x 2 Minkowski powers = 20 candidates.
param_grid = [
    dict(
        kneighbors_classifier__n_neighbors=[3, 5, 7, 11, 13],
        kneighbors_classifier__weights=["uniform", "distance"],
        kneighbors_classifier__p=[1, 2],
    )
]

# Exhaustive search scored by ROC AUC on the time-ordered folds; the best
# candidate is refitted on all the data passed to `fit`.
tuned_kneighbors_classifier_pipeline = model_selection.GridSearchCV(
    estimator=kneighbors_classifier_pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=time_series_split,
    refit=True,
    return_train_score=True,
    verbose=2,
)

In [45]:
tuned_kneighbors_classifier_pipeline

In [46]:
# Run the grid search on the training data.
_ = tuned_kneighbors_classifier_pipeline.fit(X=train_features_df, y=train_labels)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV] END kneighbors_classifier__n_neighbors=3, kneighbors_classifier__p=1, kneighbors_classifier__weights=uniform; total time=   2.2s
[CV] END kneighbors_classifier__n_neighbors=3, kneighbors_classifier__p=1, kneighbors_classifier__weights=uniform; total time=   0.1s
[CV] END kneighbors_classifier__n_neighbors=3, kneighbors_classifier__p=1, kneighbors_classifier__weights=distance; total time=   0.0s
[CV] END kneighbors_classifier__n_neighbors=3, kneighbors_classifier__p=1, kneighbors_classifier__weights=distance; total time=   0.1s
[CV] END kneighbors_classifier__n_neighbors=3, kneighbors_classifier__p=2, kneighbors_classifier__weights=uniform; total time=   0.1s
[CV] END kneighbors_classifier__n_neighbors=3, kneighbors_classifier__p=2, kneighbors_classifier__weights=uniform; total time=   0.1s
[CV] END kneighbors_classifier__n_neighbors=3, kneighbors_classifier__p=2, kneighbors_classifier__weights=distance; total time=   0.1

## Exploring Tuning Results

In [47]:
tuned_kneighbors_classifier_pipeline.best_score_

np.float64(0.8427338208031715)

In [48]:
tuned_kneighbors_classifier_pipeline.best_params_

{'kneighbors_classifier__n_neighbors': 11,
 'kneighbors_classifier__p': 1,
 'kneighbors_classifier__weights': 'distance'}

In [49]:
def cv_results_dict_to_df(cv_results_dict, rank_column="rank_test_score"):
    """Convert a ``cv_results_``-style mapping into a DataFrame ordered best-first.

    Parameters
    ----------
    cv_results_dict : dict
        Mapping from column name to equal-length sequences, such as the
        ``cv_results_`` attribute of a fitted search object.
    rank_column : str, default="rank_test_score"
        Column holding the candidate ranks. Multi-metric searches name this
        column ``rank_test_<scorer name>``, so pass that name in that case.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, sorted by ascending rank. The original row
        labels (candidate positions) are preserved.

    Raises
    ------
    KeyError
        If ``rank_column`` is not a key of ``cv_results_dict``.
    """
    cv_results_df = pd.DataFrame.from_dict(cv_results_dict)
    return cv_results_df.sort_values(rank_column, ascending=True)

In [50]:
cv_results_dict_to_df(tuned_kneighbors_classifier_pipeline.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighbors_classifier__n_neighbors,param_kneighbors_classifier__p,param_kneighbors_classifier__weights,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
13,0.031162,0.000614,0.024824,0.001231,11,1,distance,"{'kneighbors_classifier__n_neighbors': 11, 'kn...",0.826734,0.858734,0.842734,0.016,1,1.0,1.0,1.0,0.0
17,0.030851,0.000265,0.025291,0.000661,13,1,distance,"{'kneighbors_classifier__n_neighbors': 13, 'kn...",0.82754,0.856436,0.841988,0.014448,2,1.0,1.0,1.0,0.0
12,0.020215,0.000523,0.024336,0.000801,11,1,uniform,"{'kneighbors_classifier__n_neighbors': 11, 'kn...",0.821359,0.856351,0.838855,0.017496,3,0.886131,0.907968,0.897049,0.010919
16,0.026282,0.006471,0.024593,0.000177,13,1,uniform,"{'kneighbors_classifier__n_neighbors': 13, 'kn...",0.820294,0.85224,0.836267,0.015973,4,0.878982,0.900666,0.889824,0.010842
9,0.080286,0.050755,0.023524,0.000505,7,1,distance,"{'kneighbors_classifier__n_neighbors': 7, 'kne...",0.816377,0.848316,0.832346,0.015969,5,1.0,1.0,1.0,0.0
8,0.03128,0.000508,0.023666,8e-05,7,1,uniform,"{'kneighbors_classifier__n_neighbors': 7, 'kne...",0.814959,0.84211,0.828534,0.013576,6,0.909471,0.921317,0.915394,0.005923
11,0.02592,0.004517,0.023614,0.00025,7,2,distance,"{'kneighbors_classifier__n_neighbors': 7, 'kne...",0.788402,0.848316,0.818359,0.029957,7,1.0,1.0,1.0,0.0
15,0.025175,0.005455,0.023421,0.000556,11,2,distance,"{'kneighbors_classifier__n_neighbors': 11, 'kn...",0.78655,0.84964,0.818095,0.031545,8,1.0,1.0,1.0,0.0
19,0.031305,0.001237,0.024795,0.000293,13,2,distance,"{'kneighbors_classifier__n_neighbors': 13, 'kn...",0.781225,0.851363,0.816294,0.035069,9,1.0,1.0,1.0,0.0
5,0.030896,0.000192,0.02372,0.000389,5,1,distance,"{'kneighbors_classifier__n_neighbors': 5, 'kne...",0.792373,0.839679,0.816026,0.023653,10,1.0,1.0,1.0,0.0


In [51]:
tuned_kneighbors_classifier_pipeline.best_estimator_

# Submit Predictions

In [52]:
%%bash

# Print the first five lines of the sample submission to see the expected layout.
cat /kaggle/input/rainfall-probability-cs-209-spring-2026/sample_submission.csv | head -n 5

id,rainfall
2190,0
2191,0
2192,0
2193,0


In [53]:
# Class probabilities for the test rows. With refit=True the search object
# forwards predict_proba to its refitted best estimator.
predicted_rainfall_probas = tuned_kneighbors_classifier_pipeline.predict_proba(
    test_features_df
)


In [54]:
# Write the submission file with columns `id,rainfall`.
#
# Each probability is labelled with the id of the test row it was predicted
# for, instead of being pasted positionally onto the rows of
# sample_submission.csv. The file is therefore correct regardless of the row
# order of the sample file.
# Column 1 of predict_proba is the second class; the training labels shown in
# the classification report are 0 and 1, so this is the probability of class 1.
_ = (
    pd.Series(
        predicted_rainfall_probas[:, 1],
        index=test_features_df.index,
        name="rainfall",
    ).to_csv(
        "submission.csv",
        index=True,
        index_label="id",
    )
)

In [55]:
%%bash

# Print the first five lines of the generated submission file as a sanity check.
cat submission.csv | head -n 5

id,rainfall
2190,1.0
2191,1.0
2192,1.0
2193,0.5094701405050921
