<a href="https://www.kaggle.com/code/dalloliogm/predicting-using-a-logistic-regression-with-regula?scriptVersionId=228916956" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Rain prediction using Logistic Regression

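This notebook predicts rainfall with an L1-regularized Logistic Regression model, compares it with XGBoost and LightGBM, and combines the three in a soft-voting ensemble for the submission.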

## Parameters and options

In [None]:
# Central configuration read by the cells below.
config = {
    # Raw input columns fed to the models (cluster indicator columns are appended later).
    # NOTE(review): 'temparature' presumably matches the dataset's own column spelling - confirm.
    "features": ['day', 
                 'pressure', 
                 'maxtemp', 
                 'temparature', 
                 'mintemp', 
                 'dewpoint', 
                 'humidity', 
                 'cloud', 
                 'sunshine', 
                 'winddirection', 
                 'windspeed'
                 ],
    # Note: the notebook https://www.kaggle.com/code/hopesb/rain-fall-prediction/notebook removed these columns:
    # ["mintemp", "temparature", "maxtemp", "winddirection"]
    # Columns passed to KMeans and the number of clusters to fit.
    "clustering_variables": ['day', 'temparature', 'sunshine', 'cloud', 'windspeed'],
    "n_clusters": 3,
    # Number of lagged copies created for each column listed in "lag_columns".
    "n_lags": 5,
    "lag_columns": ['humidity', 'temparature', 'pressure', 'cloud', 'windspeed', 'dewpoint', 'sunshine'],
    # Passed as the `device` argument of the XGBoost and LightGBM classifiers.
    "device": "cpu"
}

## Importing libraries and reading files

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import warnings
import seaborn as sns
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score


def plot_cv_scores(cv_scores, title):
    """Plot per-fold cross-validation ROC AUC scores together with their mean.

    Parameters:
        cv_scores: One score per fold; any sequence or array of numbers
                   (list, tuple, NumPy array, pandas Series).
        title: Title shown above the plot.
    """
    # Coerce to an array so plain lists/tuples, which have no .mean(), work too.
    scores = np.asarray(cv_scores, dtype=float)
    num_folds = len(scores)
    fold_numbers = range(1, num_folds + 1)
    mean_score = scores.mean()

    plt.figure(figsize=(10, 6))
    plt.plot(fold_numbers, scores, marker='o', linestyle='--', color='b')
    plt.axhline(y=mean_score, color='r', linestyle='-', label=f'Mean: {mean_score:.4f}')
    plt.xlabel('Fold Number', fontsize=12)
    plt.ylabel('ROC AUC', fontsize=12)
    plt.title(title, fontsize=14)
    # Label every second fold to keep the axis readable with many repeats.
    plt.xticks(range(1, num_folds + 1, 2))
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()

def evaluate_pipeline(pipeline, X, y, pipeline_name, n_splits=5, n_repeats=10, random_state=666):
    """
    Evaluate a pipeline using repeated stratified k-fold cross-validation and plot the results.

    Parameters:
        pipeline: The sklearn Pipeline to evaluate.
        X: Feature DataFrame.
        y: Target array or Series.
        pipeline_name: String, name of the pipeline (used for printing/plot titles).
        n_splits: Number of folds (default 5).
        n_repeats: Number of repeats (default 10).
        random_state: Random seed for reproducibility.

    Returns:
        cv_scores: Array of ROC AUC scores, one per fold and repeat
                   (n_splits * n_repeats values).
    """
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    # Use the built-in "roc_auc" scorer instead of
    # make_scorer(roc_auc_score, needs_proba=True): the `needs_proba` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6. The built-in
    # scorer ranks samples by decision_function or predict_proba, which gives
    # the same AUC because the metric only depends on the ordering of scores.
    cv_scores = cross_val_score(pipeline, X, y, cv=rskf, scoring="roc_auc")
    mean_score = np.mean(cv_scores)
    print(f"{pipeline_name} Repeated CV AUC scores:", cv_scores)
    print(f"Mean AUC {pipeline_name}: {mean_score:.4f}")
    plot_cv_scores(cv_scores, title=f"{pipeline_name}: Cross-validation fold estimates")
    return cv_scores


def plot_feature_importance(pipeline, X):
    """
    Draw a horizontal bar chart of the fitted classifier's feature importances.

    The feature names are recovered by running X through the three feature
    engineering steps of the pipeline (the imputer and the scaler are skipped);
    the importance values are read from the 'clf' step.

    Parameters:
        pipeline: A fitted sklearn Pipeline with at least the steps
                  'feature_eng', 'additional_fe', 'lag_features' and 'clf'.
        X: pandas DataFrame holding the original features.

    Raises:
        ValueError: If a required step is missing, or if the classifier exposes
                    neither `coef_` nor `feature_importances_`.
    """
    steps = pipeline.named_steps
    transformer_names = ('feature_eng', 'additional_fe', 'lag_features')

    # Fail early when the pipeline does not have the expected layout.
    missing_steps = [name for name in (*transformer_names, 'clf') if name not in steps]
    if missing_steps:
        raise ValueError(f"Pipeline is missing required steps: {missing_steps}")

    # Re-run only the feature engineering transformers to obtain the column names.
    engineered = X
    for name in transformer_names:
        engineered = steps[name].transform(engineered)
    feature_names = engineered.columns

    model = steps['clf']

    if hasattr(model, "coef_"):
        # Linear models: signed coefficients, ordered by magnitude.
        table = pd.DataFrame({
            'feature': feature_names,
            'importance': model.coef_[0]
        })
        table['abs_importance'] = table['importance'].abs()
        sort_key = 'abs_importance'
        title = "Feature Importance from Logistic Regression (Coefficients)"
    elif hasattr(model, "feature_importances_"):
        # Tree ensembles (e.g. XGBoost, LightGBM): ordered by the importance itself.
        table = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        })
        sort_key = 'importance'
        title = "Feature Importance from Model"
    else:
        raise ValueError("Classifier does not have a known feature importance attribute.")

    table = table.sort_values(sort_key, ascending=True)

    # Smallest bar at the bottom, largest at the top.
    plt.figure(figsize=(12, 12))
    plt.barh(table['feature'], table['importance'])
    plt.xlabel("Importance")
    plt.title(title)
    plt.show()


In [None]:
!head /kaggle/input/playground-series-s5e3/sample_submission.csv
!head /kaggle/input/playground-series-s5e3/train.csv
!head /kaggle/input/playground-series-s5e3/test.csv

In [None]:
# Load the training data; `id` stays a regular column (the set_index call is left disabled).
train = pd.read_csv("/kaggle/input/playground-series-s5e3/train.csv")#.set_index("id")
train.head()

In [None]:
train.describe().style.background_gradient(cmap='summer')

In [None]:
train.day.max()

In [None]:
train.day.value_counts()

In [None]:
# Load the test data; `id` stays a regular column because the submission cell reads test['id'].
test = pd.read_csv("/kaggle/input/playground-series-s5e3/test.csv")#.set_index("id")
test.head()

## Quick EDA

In [None]:
train.corr().style.background_gradient(cmap='winter')

In [None]:


#sns.pairplot(train, kind="kde")

## Handling Missing values

It seems there is just one missing value, and it is in the test dataset.

In [None]:
train.isnull().sum()


In [None]:
test.isnull().sum()

In [None]:

test[test.isnull().any(axis=1)]


## Feature Engineering

### Adding season (spring, summer, fall, winter) and cyclical features

The `day` variable goes from 1 to 365, so we only have data from one year.

To tell our model that day 1 is close to day 365 (as they are both in winter), we add cyclical features: the sine and cosine of the day of the year.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class SeasonMonthTransformer(BaseEstimator, TransformerMixin):
    """Add calendar features derived from the ``day`` column.

    ``transform`` returns a copy of the input with these extra columns:
    ``month`` (1-12), ``day_sin`` / ``day_cos`` (cyclical encoding with a
    365-day period) and the one-hot indicators ``season_spring``,
    ``season_summer`` and ``season_winter`` (autumn is the dropped reference
    level). The transformer is stateless: ``fit`` learns nothing.
    """

    # Last day of the year covered by each month, January to November
    # (non-leap year); any later day falls in December.
    _MONTH_ENDS = (31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334)

    # Every season that can be produced, in the alphabetical order pandas
    # uses for dummy columns. Fixing the list keeps the output columns
    # identical even when the data being transformed lacks some season.
    _SEASONS = ('autumn', 'spring', 'summer', 'winter')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    @staticmethod
    def _day_of_year(day):
        """Wrap a day number onto 1..365 (365 stays 365, 366 becomes 1)."""
        # A plain `day % 365` would turn day 365 into 0 and thereby move
        # 31 December into January.
        return (day - 1) % 365 + 1

    @classmethod
    def _day_to_month(cls, day):
        """Return the month number (1-12) that contains the given day."""
        day_of_year = cls._day_of_year(day)
        for month, last_day in enumerate(cls._MONTH_ENDS, start=1):
            if day_of_year <= last_day:
                return month
        return 12

    @classmethod
    def _day_to_season(cls, day):
        """Return the season name for the given day."""
        day_of_year = cls._day_of_year(day)
        if 80 <= day_of_year < 172:
            return 'spring'
        if 172 <= day_of_year < 264:
            return 'summer'
        if 264 <= day_of_year < 356:
            return 'autumn'
        return 'winter'

    def transform(self, X):
        """Return a copy of X extended with the calendar features.

        Parameters:
            X: pandas DataFrame containing a numeric ``day`` column.
        """
        # Work on a copy so the caller's frame is left untouched.
        X_trans = X.copy()

        X_trans['month'] = X_trans['day'].apply(self._day_to_month)

        seasons = X_trans['day'].apply(self._day_to_season)
        X_trans['season'] = pd.Categorical(seasons, categories=list(self._SEASONS))

        # Cyclical encoding: day 365 ends up next to day 1.
        X_trans['day_sin'] = np.sin(2 * np.pi * X_trans['day'] / 365)
        X_trans['day_cos'] = np.cos(2 * np.pi * X_trans['day'] / 365)

        # One-hot encode the season; the first category (autumn) is dropped.
        X_trans = pd.get_dummies(X_trans, columns=['season'], drop_first=True)

        return X_trans



### Additional features

Inspired by https://www.kaggle.com/code/josephnehrenz/87-9-logistic-s5e3-rainfall-probability-in-r

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

class AdditionalFeatureTransformer(BaseEstimator, TransformerMixin):
    """Derive extra weather features from the raw measurement columns.

    Each group of features is only created when all the columns it needs are
    present in the input, so the transformer also works on a reduced feature
    set. Several features (differences, rolling and exponentially weighted
    statistics) are computed along the row order of the input frame.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X):
        """Return a copy of X with the derived columns appended."""
        out = X.copy()

        def has(*names):
            # True when every named column exists in the input frame.
            return all(name in X.columns for name in names)

        def step_change(name):
            # Difference to the previous row; the first row is compared with itself.
            series = out[name]
            return series - series.shift(1, fill_value=series.iloc[0])

        # ---- Sunshine ----
        if has('sunshine', 'cloud', 'humidity'):
            out['relative_sunshine'] = out['sunshine'] / (100 - out['cloud'] + 1)
            out['sunshine_ratio'] = out['sunshine'] / (out['cloud'] + out['humidity'] + 1e-5)
        if has('sunshine', 'cloud'):
            out['cloud_sun_ratio'] = out['cloud'] / (out['sunshine'] + 1)
        if has('sunshine'):
            # Expressed as a fraction of a 24-hour day.
            out['sunshine_pct'] = out['sunshine'] / 24.0

        # ---- Cloud ----
        if has('cloud'):
            out['cloud_gradient'] = step_change('cloud')
            binned = pd.cut(out['cloud'], bins=[0,20,50,80,100],
                            labels=[0,1,2,3], include_lowest=True)
            out['cloud_category'] = binned.astype(float)
            out['sky_opacity'] = out['cloud'] / 100.0

        # ---- Temperature ----
        if has('maxtemp', 'mintemp'):
            out['temp_range'] = out['maxtemp'] - out['mintemp']
        if has('temparature'):
            out['temp_change'] = step_change('temparature')
            out['temp_ewm'] = out['temparature'].ewm(span=10, adjust=False).mean()
            if has('humidity'):
                out['temp_humidity_interaction'] = out['temparature'] + 0.2 * out['humidity']

        # ---- Pressure ----
        if has('pressure'):
            weekly = out['pressure'].rolling(window=7, min_periods=1)
            out['pressure_rolling_mean'] = weekly.mean()
            out['pressure_rolling_std'] = weekly.std()
            out['pressure_diff'] = step_change('pressure')

        # ---- Humidity ----
        if has('temparature', 'dewpoint'):
            depression = out['temparature'] - out['dewpoint']
            out['dewpoint_depression'] = depression
            out['rh_approx'] = 100 - (5 * depression)
        if has('humidity', 'cloud'):
            out['humidity_cloud_interaction'] = (out['humidity'] * out['cloud']) / 10000.0
            out['inv_humidity_cloud'] = 100 - out['humidity'] - out['cloud']

        # ---- Vapour pressure / absolute humidity ----
        if has('temparature'):
            temp = out['temparature']
            out['svp'] = 6.1078 * np.exp((17.27 * temp) / (temp + 237.3))
        if has('temparature', 'humidity'):
            temp = out['temparature']
            vapour = 6.112 * np.exp((17.67 * temp) / (temp + 243.5))
            out['abs_humidity'] = (vapour * out['humidity'] * 2.1674) / (273.15 + temp)

        return out


### Lagging features

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class LagFeatureTransformer(BaseEstimator, TransformerMixin):
    """Append lagged copies of selected columns.

    For every configured column that exists in the input, the columns
    ``<name>_lag_1`` ... ``<name>_lag_<n_lags>`` are added, each holding the
    column shifted down by that many rows. The first rows of a lagged column
    are therefore NaN. Configured columns that are absent are skipped silently.

    Parameters:
        columns: Names of the columns to lag; defaults to
                 ['humidity', 'temparature', 'pressure', 'sunshine'].
        n_lags: Number of lagged copies per column.
    """

    def __init__(self, columns=None, n_lags=5):
        if columns is None:
            columns = ['humidity', 'temparature', 'pressure', 'sunshine']
        self.columns = columns
        self.n_lags = n_lags

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X):
        """Return a copy of X extended with the lag columns."""
        lagged = X.copy()
        for name in self.columns:
            if name not in lagged.columns:
                continue
            for offset in range(1, self.n_lags + 1):
                lagged[f"{name}_lag_{offset}"] = lagged[name].shift(offset)
        return lagged


## Predicting Clusters

Thanks to this notebook: https://www.kaggle.com/code/felixleung/looks-like-there-are-3-clusters


In [None]:
train.columns

In [None]:
from sklearn.cluster import KMeans
import pandas as pd

# Cluster the rows on the configured variables. The model is fitted on the
# training data only and then reused to label the test data.
kmeans = KMeans(n_clusters=config["n_clusters"], random_state=42)

train_labels = kmeans.fit_predict(train[config["clustering_variables"]])
test_labels = kmeans.predict(test[config["clustering_variables"]])

# Store the labels as a categorical with the full list of cluster ids, so that
# one-hot encoding yields the same cluster_<k> columns for train and test even
# when a cluster happens to have no rows in one of the two frames.
cluster_ids = list(range(config["n_clusters"]))
train['cluster'] = pd.Categorical(train_labels, categories=cluster_ids)
test['cluster'] = pd.Categorical(test_labels, categories=cluster_ids)

# One-hot encode the cluster labels.
train = pd.get_dummies(train, columns=['cluster'], prefix='cluster')
test = pd.get_dummies(test, columns=['cluster'], prefix='cluster')


In [None]:
train.head()

## Prediction using Logistic Regression, XGBoost or LightGBM

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
import numpy as np
from lightgbm import LGBMClassifier


# Define a helper function to create the pipeline
def create_pipeline(lag_columns=config["lag_columns"],
                    n_lags=config["n_lags"],
                    clf=False):
    """
    Build the modelling pipeline: feature engineering, imputation, scaling, classifier.

    Parameters:
        lag_columns: Columns for which lag features are created
                     (default: config["lag_columns"]). None lets
                     LagFeatureTransformer use its own default.
        n_lags: Number of lags per column (default: config["n_lags"]).
        clf: The final estimator of the pipeline. Must be supplied; the
             default False only marks "not provided".

    Returns:
        An unfitted sklearn Pipeline with the steps 'feature_eng',
        'additional_fe', 'lag_features', 'imputer', 'scaler' and 'clf'.

    Raises:
        ValueError: If no classifier is passed.
    """
    # Fail here with a clear message rather than later inside Pipeline.fit.
    # An identity check is used because some estimators define __len__ and
    # could therefore be falsy.
    if clf is False:
        raise ValueError("create_pipeline() needs an estimator: pass it with the 'clf' argument.")

    # Copy the list so that later changes to the caller's (or config's) list
    # do not silently alter a pipeline that has already been built.
    columns = None if lag_columns is None else list(lag_columns)

    steps = [
        ('feature_eng', SeasonMonthTransformer()),
        ('additional_fe', AdditionalFeatureTransformer()),
        ('lag_features', LagFeatureTransformer(columns=columns, n_lags=n_lags)),
        # Lag/rolling features leave NaNs in the first rows; fill them before scaling.
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('clf', clf)
    ]
    return Pipeline(steps)

# Model inputs: the configured raw features plus the one-hot cluster columns added to `train` above.
features = [*config["features"], *[c for c in train.columns if c.startswith('cluster')]]

X = train[features]
y = train['rainfall']

# Create the logistic regression pipeline
pipeline_lg = create_pipeline(
    n_lags = config["n_lags"],
    lag_columns = config["lag_columns"],
    clf=LogisticRegression(
        penalty='l1',        # L1 (Lasso) regularization
        C=1.0,               # adjust for stronger/weaker regularization
        max_iter=1000,
        random_state=42,
        solver='liblinear'   # supports L1 penalty
    )
)

# Create the XGBoost pipeline (same lag configuration as above)
# NOTE(review): 10000 trees with early stopping disabled makes every CV fit expensive - confirm this is intended.
pipeline_xgb = create_pipeline(
    n_lags = config["n_lags"],
    lag_columns = config["lag_columns"],
    clf=XGBClassifier(
        device=config["device"],
        n_estimators=10000,
        learning_rate=0.1,
        max_depth=6,
#        early_stopping_rounds=100,
        alpha=0.1,
        random_state=42,
        colsample_bytree=0.9, 
        subsample=0.9,
        # NOTE(review): `use_label_encoder` appears to be deprecated in recent XGBoost releases - confirm it is still needed.
        use_label_encoder=False,    # Disable label encoder to avoid warnings
        eval_metric='auc'           # Set evaluation metric to AUC
    )
)

# Create the LightGBM pipeline (same lag configuration as above)
pipeline_lgbm = create_pipeline(
    n_lags = config["n_lags"],
    lag_columns = config["lag_columns"],
    clf=LGBMClassifier(
        device=config["device"],
        n_estimators=10000,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        verbose=-1  # This will suppress the warnings
    )
)



In [None]:
# Cross-validate LightGBM, keep the mean AUC for the ensemble weights,
# then fit on all training rows so the importances can be plotted.
cv_scores_lgbm = evaluate_pipeline(pipeline_lgbm, X, y, "LightGBM")
mean_cv_scores_lgbm = np.mean(cv_scores_lgbm)
pipeline_lgbm.fit(X, y)
plot_feature_importance(pipeline_lgbm, X)


In [None]:
# Cross-validate Logistic Regression, keep the mean AUC for the ensemble weights,
# then fit on all training rows so the coefficients can be plotted.
cv_scores_lg   = evaluate_pipeline(pipeline_lg, X, y, "Logistic Regression")
mean_cv_scores_lg = np.mean(cv_scores_lg)
pipeline_lg.fit(X, y)
plot_feature_importance(pipeline_lg, X)


In [None]:
# Cross-validate XGBoost, keep the mean AUC for the ensemble weights,
# then fit on all training rows so the importances can be plotted.
cv_scores_xgb  = evaluate_pipeline(pipeline_xgb, X, y, "XGBoost")
mean_cv_scores_xgb = np.mean(cv_scores_xgb)
pipeline_xgb.fit(X, y)
plot_feature_importance(pipeline_xgb, X)


## Choose a model

Modify this variable to choose which model to submit

In [None]:
from sklearn.ensemble import VotingClassifier

# Mean cross-validation AUROC of each candidate model.
results = {"LG": mean_cv_scores_lg, "XGB": mean_cv_scores_xgb, "LGBM": mean_cv_scores_lgbm}
print("CV results:", results)

# Each model's weight is its share of the summed scores, so the weights add up to 1.
total_score = sum(results.values())
weights = [score / total_score for score in results.values()]
print("Normalized weights:", weights)

# Soft voting: the ensemble averages the predicted probabilities of the three
# pipelines using the weights above.
pipeline = VotingClassifier(
    estimators=[
        ('LG', pipeline_lg),
        ('XGB', pipeline_xgb),
        ('LGBM', pipeline_lgbm),
    ],
    voting='soft',
    weights=weights,
)



In [None]:
## Dictionary mapping model names to their mean cross-validation scores
#results = {"LG": mean_cv_scores_lg, "XGB": mean_cv_scores_xgb, "LGBM": mean_cv_scores_lgbm}#
#
## Get the best model key and score
#best_model, best_score = max(results.items(), key=lambda x: x[1])
#print("Best model:", best_model, "with score:", best_score)

## Dictionary mapping model names to their corresponding pipelines
#pipelines = {"LG": pipeline_lg, "XGB": pipeline_xgb, "LGBM": pipeline_lgbm}

## Assign the pipeline corresponding to the best model
#pipeline = pipelines[best_model]


In [None]:
#pipeline = pipeline_lg

## Fitting model

In [None]:
# Fit the selected `pipeline` on the full training set.
pipeline.fit(X, y)


## Creating Submission file

In [None]:
# Select the same feature columns that were used for training.
X_test = test[features]
X_test

In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# The selected pipeline is normally fitted in the "Fitting model" cell.
# Training it a second time only repeats that work, so refit only when the
# fitted state cannot be confirmed (e.g. the cell was skipped).
try:
    check_is_fitted(pipeline)
except NotFittedError:
    pipeline.fit(X, y)

# Predict the probability of the positive class (rain) on the test set
test_preds = pipeline.predict_proba(X_test)[:, 1]

# Create the submission DataFrame
submission = pd.DataFrame({
    'id': test['id'],
    'rainfall': test_preds
})

# Save the submission file
submission.to_csv('submission.csv', index=False)


In [None]:
test_preds[0:10]

In [None]:
submission.head()