In [1]:
import pathlib

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import compose, dummy, impute, metrics, pipeline, preprocessing


# Kaggle input directory holding train.csv, test.csv and sample_submission.csv.
DATA_DIR = pathlib.Path("/kaggle/input/rainfall-probability-cs-209-spring-2026")
# Seeded NumPy random generator that is handed to estimators via `random_state`.
# NOTE(review): a RandomState instance is stateful, so re-running a cell that draws
# from it may not reproduce a fresh-kernel run — confirm this is intended.
RANDOM_STATE = np.random.RandomState(42)


# Load the Data

In [2]:
%%bash

# List the files shipped with the competition dataset.
ls /kaggle/input/rainfall-probability-cs-209-spring-2026

sample_submission.csv
test.csv
train.csv


In [3]:
%%bash

# Preview the header line and the first four rows of the training data.
head -n 5 /kaggle/input/rainfall-probability-cs-209-spring-2026/train.csv

id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1


In [4]:
# Name of the target column; every other column is treated as a feature.
label_name = "rainfall"

# Read the training set, using the `id` column as the row index.
train_df = pd.read_csv(DATA_DIR / "train.csv", index_col="id")

# Separate the target from the features; `train_df` itself is left unmodified.
train_labels = train_df[label_name]
train_features_df = train_df.drop(columns=[label_name])

In [5]:
%%bash

# Preview the header line and the first four rows of the test data.
head -n 5 /kaggle/input/rainfall-probability-cs-209-spring-2026/test.csv

id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6


In [6]:
# Read the test features (no `rainfall` column), using `id` as the row index.
test_csv_path = DATA_DIR / "test.csv"
test_features_df = pd.read_csv(test_csv_path, index_col="id")

# Prepare the Data for ML

This code defines two separate preprocessing pipelines in Scikit-Learn: one for *categorical features* and one for *numerical features*. These pipelines will then be used by a special kind of Scikit-Learn transformer called a `compose.ColumnTransformer` so that each column type gets the right preprocessing treatment.

## Categorical Features

What it does (step-by-step):
Step 1: SimpleImputer(strategy="most_frequent")
Fills missing values in categorical columns.
Uses the most common category in that column.
Example: if a column is ["red", "blue", None, "blue"], the missing value becomes "blue".
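Step 2: OrdinalEncoder(categories=[range(1, 365 + 1)], handle_unknown="error")
Converts each category into a single integer code: its position in the declared list of categories.
Here the declared categories are the days 1–365, so day 1 becomes 0.0, day 2 becomes 1.0, and so on.
handle_unknown="error" is important because:
If a value that is not among the declared categories appears in the val/test set (for example, day 366),
the encoder raises an error instead of silently producing a meaningless code.
Note that OrdinalEncoder does not accept handle_unknown="ignore" (that option belongs to OneHotEncoder, which encodes unknowns as all zeros); to tolerate unseen categories, use handle_unknown="use_encoded_value" together with unknown_value.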

In [7]:
# Step 1: fill a missing value with the most common value of the column.
most_frequent_imputer = impute.SimpleImputer(strategy="most_frequent")

# Step 2: map each declared category (days 1..365) to an integer code;
# a value outside the declared categories raises an error.
day_of_year_encoder = preprocessing.OrdinalEncoder(
    categories=[range(1, 365 + 1)],
    handle_unknown="error",
)

# Chain the two steps; the step names are kept stable because they are how
# the steps are addressed inside the pipeline.
categorical_features_preprocessing = pipeline.Pipeline(
    steps=[
        ("simple_imputer", most_frequent_imputer),
        ("ordinal_encoder", day_of_year_encoder),
    ],
    memory=None,
    verbose=False,
)


In [8]:
# Bare expression: Jupyter shows the pipeline's representation as this cell's output.
categorical_features_preprocessing

## Numerical Features

What it does (step-by-step):
Step 1: SimpleImputer(strategy="mean")
Fills missing numerical values using the mean of the column.
Example: [1.0, 2.0, None, 4.0] → mean is 2.33, so missing becomes 2.33.
Step 2: StandardScaler(with_mean=True, with_std=True)
Standardizes numerical features:
subtract mean → centers at 0
divide by standard deviation → scales to unit variance
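After scaling, each feature has mean 0 and standard deviation 1 on the training data; the shape of its distribution is unchanged, so it is not necessarily normal.
This is especially important for models that are sensitive to the scale of their inputs (for example, regularized linear models, support vector machines, and k-nearest neighbors).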

In [9]:
# Step 1: fill a missing value with the mean of the column.
mean_imputer = impute.SimpleImputer(strategy="mean")

# Step 2: center each column and scale it to unit variance.
standard_scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)

# Chain the two steps; the step names are kept stable because they are how
# the steps are addressed inside the pipeline.
numerical_features_preprocessing = pipeline.Pipeline(
    steps=[
        ("simple_imputer", mean_imputer),
        ("standard_scaler", standard_scaler),
    ],
    memory=None,
    verbose=False,
)

In [10]:
# Bare expression: Jupyter shows the pipeline's representation as this cell's output.
numerical_features_preprocessing

## Combining Feature Preprocessing Pipelines

In [11]:
# Columns routed to the categorical pipeline.
categorical_feature_names = ["day"]

# Columns routed to the numerical pipeline (spellings match the CSV header).
numerical_feature_names = [
    "pressure",
    "maxtemp",
    "temparature",
    "mintemp",
    "dewpoint",
    "humidity",
    "cloud",
    "sunshine",
    "winddirection",
    "windspeed",
]

# Apply each pipeline to its own columns, drop anything not listed, and
# return a pandas DataFrame that keeps the original column names.
# NOTE(review): `force_int_remainder_cols` appears to be deprecated in recent
# scikit-learn releases — confirm against the installed version.
feature_preprocessing = compose.ColumnTransformer(
    transformers=[
        (
            "categorical_features",
            categorical_features_preprocessing,
            categorical_feature_names,
        ),
        (
            "numerical_features",
            numerical_features_preprocessing,
            numerical_feature_names,
        ),
    ],
    force_int_remainder_cols=False,
    remainder="drop",
    n_jobs=2,
    verbose=False,
    verbose_feature_names_out=False,
).set_output(transform="pandas")


In [12]:
# Bare expression: Jupyter shows the transformer's representation as this cell's output.
feature_preprocessing

### Manually Preprocessing Features

In [13]:
# Fit the preprocessing on the training features and transform them in one call.
# The result is a DataFrame because the transformer was configured with
# set_output(transform="pandas").
processed_train_features_df = feature_preprocessing.fit_transform(train_features_df)

In [14]:
# Summarize dtypes and non-null counts to confirm that no missing values remain.
processed_train_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2190 entries, 0 to 2189
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   day            2190 non-null   float64
 1   pressure       2190 non-null   float64
 2   maxtemp        2190 non-null   float64
 3   temparature    2190 non-null   float64
 4   mintemp        2190 non-null   float64
 5   dewpoint       2190 non-null   float64
 6   humidity       2190 non-null   float64
 7   cloud          2190 non-null   float64
 8   sunshine       2190 non-null   float64
 9   winddirection  2190 non-null   float64
 10  windspeed      2190 non-null   float64
dtypes: float64(11)
memory usage: 205.3 KB


In [15]:
# Peek at the first rows of the transformed features.
processed_train_features_df.head()

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.0,0.671702,-0.913809,-0.642199,-0.448815,-0.199457,0.636434,0.681269,-0.729397,-0.560901,-0.465291
1,1.0,1.043116,-1.798289,-1.350846,-1.259418,-0.956001,1.662224,0.847728,-1.032804,-0.685925,0.009629
2,2.0,1.856688,-1.232222,-1.504067,-1.496667,-2.109731,-0.90225,-1.59368,1.256536,-0.435876,-0.374349
3,3.0,-0.035752,-1.462187,-1.178472,-1.041939,-0.69121,1.662224,1.069675,-1.032804,-0.560901,1.393971
4,4.0,1.449902,-0.89612,-1.063556,-1.378043,-2.05299,-3.851394,-1.704654,-0.039837,-0.81095,0.302665


# Train a Benchmark Model

## Using Manually Preprocessed Features

In [16]:
# Interactive help lookup (IPython `?` syntax) for DummyClassifier.
# NOTE(review): development aid — consider removing this cell before sharing the notebook.
dummy.DummyClassifier?

In [17]:
# Baseline model: with strategy="prior" the predictions ignore the features and
# are derived from the label distribution seen during fit.
dummy_classifier = dummy.DummyClassifier(strategy="prior", random_state=RANDOM_STATE)

# Fit on the manually preprocessed features; the returned estimator is bound
# to `_` so it is not echoed as cell output.
_ = dummy_classifier.fit(processed_train_features_df, train_labels)

## Combining Feature Preprocessing with a Model

In [18]:
# Bundle preprocessing and the model so raw features can be passed straight in.
pipeline_steps = [
    ("feature_preprocessing", feature_preprocessing),
    ("dummy_classifier", dummy_classifier),
]
classifier_pipeline = pipeline.Pipeline(steps=pipeline_steps)


In [19]:
# Bare expression: Jupyter shows the pipeline's representation as this cell's output.
classifier_pipeline

In [20]:
# Fit preprocessing and classifier together on the raw training features;
# the returned pipeline is bound to `_` so it is not echoed as cell output.
_ = classifier_pipeline.fit(train_features_df, train_labels)

In [21]:
# Persist the fitted pipeline (preprocessing + classifier) to the working directory.
# The return value is bound to `_` so it is not echoed as cell output.
_ = joblib.dump(classifier_pipeline, "dummy-classifier-pipeline.pkl")

In [22]:
%%bash

# List the working directory to confirm that the pickled pipeline was written.
ls -lh

total 148K
-rw-r--r-- 1 root root  14K Feb 12 10:25 dummy-classifier-pipeline.pkl
---------- 1 root root 130K Feb 12 10:25 __notebook__.ipynb


# Submit Predictions

In [23]:
%%bash

# Preview the header line and the first four rows of the sample submission.
head -n 5 /kaggle/input/rainfall-probability-cs-209-spring-2026/sample_submission.csv

id,rainfall
2190,0
2191,0
2192,0
2193,0


In [24]:
# Per-class probabilities for every test row, computed by the full pipeline
# (preprocessing followed by the classifier).
predicted_rainfall_probas = classifier_pipeline.predict_proba(test_features_df)


In [25]:
# Attach each prediction to the `id` of the test row it was computed for, so
# the submission is aligned by `id` and not by row position.
# Column 1 is taken as P(rainfall == 1) — presumably the labels are 0/1 and
# the classifier orders its classes ascending; verify via `classes_`.
predicted_rainfall = pd.Series(
    predicted_rainfall_probas[:, 1],
    index=test_features_df.index,
    name="rainfall",
)

# Use the sample submission as the template for the row order and file layout;
# assigning a Series aligns on the `id` index.
submission_df = pd.read_csv(
    DATA_DIR / "sample_submission.csv",
    index_col="id",
).assign(rainfall=predicted_rainfall)

# An `id` that exists in the template but not in the test set would be left
# as NaN by the alignment; fail loudly instead of writing an invalid file.
assert submission_df["rainfall"].notna().all(), (
    "sample_submission.csv contains ids that have no prediction"
)

submission_df.to_csv("submission.csv", index=True)

In [26]:
%%bash

# Preview the header line and the first four rows of the generated submission.
head -n 5 submission.csv

id,rainfall
2190,0.7534246575342466
2191,0.7534246575342466
2192,0.7534246575342466
2193,0.7534246575342466
