<a href="https://colab.research.google.com/github/fxs2596/NerdOut/blob/main/Binary_Rainfall_K_Fold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# --- Complete Machine Learning Workflow Script (Start to Finish - RF Group CV Tuning) ---

# --- Step 0: Setup and Data Loading ---

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import necessary scikit-learn modules
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GroupKFold # Import GroupKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

import time

print("Setup complete. Importing libraries.")

# Name of the label column; every later step refers to it through this variable.
target_col = 'rainfall'

# CSV file holding the labelled training rows.
file_name = 'train.csv'

# Read the training data. Nothing below can run without it, so a missing file
# is reported and then terminates the script.
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    import sys
    print(f"\nError: File '{file_name}' not found. Please check the file name and path.")
    sys.exit(f"Data file '{file_name}' not found.")
else:
    print(f"\nData '{file_name}' loaded successfully!")

print(f"Target variable defined as '{target_col}'.")

print("\nFirst 5 rows of the loaded data:")
print(df.head(5))


# --- Step 1: Data Inspection and Cleaning (Initial & Group Creation) ---
print("\n--- Step 1: Data Inspection and Cleaning (Initial & Group Creation) ---")

# Every summary below is handed to print() explicitly. A bare expression such
# as `df.describe()` is echoed only when it is the last statement of a notebook
# cell, and never when the file runs as a script, so without print() each
# header would be followed by no output at all.

print("\nColumn Info (data types and non-null counts):")
df.info()  # info() writes its own report and returns None, so it is not wrapped in print()

print("\nSummary Statistics for Numerical Columns:")
print(df.describe())

print("\nMissing Values per Column:")
print(df.isnull().sum())

print(f"\nDistribution of the target variable '{target_col}':")
print(df[target_col].value_counts())

print(f"\nPercentage Distribution of the target variable '{target_col}':")
print(df[target_col].value_counts(normalize=True) * 100)

# The data-quality checks that compared the target with related columns were
# written for a regression target; they do not apply to a binary target and
# are skipped here.

# Derive the grouping key while 'id' is still available: every run of 365
# consecutive ids shares one group value (group = id // 365).
if 'id' not in df.columns:
    print("\nWarning: 'id' column not found. Cannot create 'group' column for Group K-Fold.")
    # No fallback is implemented; the remaining steps assume that both 'id'
    # and 'group' exist.
else:
    df['group'] = df['id'].floordiv(365)
    print("\nCreated 'group' column based on id // 365.")
    print("Unique groups and counts:")
    group_sizes = df['group'].value_counts().sort_index()
    print(group_sizes)


# Columns that are removed before modelling.
columns_to_drop = ['id', 'day']

df_cleaned = df.drop(columns_to_drop, axis=1)

print(f"\nDropped columns: {columns_to_drop}")
print(f"Shape after dropping columns: {df_cleaned.shape}")
print("First 5 rows after dropping columns:")
print(df_cleaned.head(5))


# --- Step 2: EDA (Correlation Analysis - Numerical Features) ---
print("\n--- Step 2: EDA (Correlation Analysis - Numerical Features) ---")

# Numeric columns of the cleaned frame. The target and the 'group' helper
# column have not been separated yet, so they are part of this selection
# whenever they are numeric.
numerical_cols_cleaned = df_cleaned.select_dtypes(include=np.number).columns.tolist()

print("\nCorrelation Matrix for Numerical Features:")
# The full matrix is computed once and kept for inspection; only its target
# column is printed below.
correlation_matrix = df_cleaned[numerical_cols_cleaned].corr()

# The header uses target_col so it stays correct if the target is renamed.
print(f"\nCorrelation with the Target Variable ('{target_col}'):")
if target_col in df_cleaned.columns:
    # Reuse the matrix computed above instead of calling .corr() a second time.
    correlation_with_target = correlation_matrix[target_col].sort_values(ascending=False)
    print(correlation_with_target)
else:
    print(f"Error: Target column '{target_col}' not found after dropping columns.")


# --- Step 2: EDA (Categorical Feature vs. Target) ---
print("\n--- Step 2: EDA (Categorical Feature vs. Target) ---")

# Column that is treated as categorical here and one-hot encoded later.
categorical_column_name = 'winddirection'

# Frequency of each value and the column's dtype.
print(f"\nUnique values and data type for '{categorical_column_name}':")
if categorical_column_name not in df_cleaned.columns:
    print(f"Error: '{categorical_column_name}' not found in cleaned data.")
else:
    category_values = df_cleaned[categorical_column_name]
    print(category_values.value_counts())
    print(category_values.dtype)

# Mean of the target per category value, highest first.
print(f"\nRelationship between '{categorical_column_name}' and '{target_col}':")
if categorical_column_name not in df_cleaned.columns or target_col not in df_cleaned.columns:
    print(f"Error: '{categorical_column_name}' or target column not found in cleaned data.")
else:
    target_by_category = df_cleaned.groupby(categorical_column_name)[target_col]
    rainfall_by_category = target_by_category.mean().sort_values(ascending=False)
    print(rainfall_by_category)


# --- Step 3: Data Preprocessing ---
print("\n--- Step 3: Data Preprocessing ---")

# Separate the cleaned frame into labels, grouping key and model inputs.
# 'group' only identifies CV folds, so it is removed from the inputs together
# with the target.
y = df_cleaned[target_col]
groups = df_cleaned['group']
X = df_cleaned.drop(columns=[target_col, 'group'])


print(f"\nFeatures shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")
print(f"Groups shape: {groups.shape}")


# The hold-out split happens before any transformer is fitted, so the imputer,
# encoder and scaler only ever learn from training rows. `groups` is passed
# through the same call so that groups_train stays row-aligned with X_train
# for the GroupKFold search later on.
print("\n--- Splitting Data into Overall Training and Testing Sets (with Groups) ---")

# NOTE(review): this split is stratified on y and made per row, not per group,
# so rows of one group can end up in both the training and the hold-out set.
# Only the cross-validation folds are group-separated - confirm this is the
# intended evaluation design.
(
    X_train, X_test,
    y_train, y_test,
    groups_train, groups_test,
) = train_test_split(
    X,
    y,
    groups,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTrain set shape (X_train, y_train, groups_train): {X_train.shape}, {y_train.shape}, {groups_train.shape}")
print(f"Test set shape (X_test, y_test, groups_test): {X_test.shape}, {y_test.shape}, {groups_test.shape}")


# --- Identify Column Types for Preprocessing (based on X_train) ---
# The single column that gets one-hot encoded.
categorical_cols = [categorical_column_name]

# Every numeric column of X_train except the one handled as categorical. That
# column can itself be stored as a number, so it is filtered out explicitly.
numerical_cols = [
    column
    for column in X_train.select_dtypes(include=np.number).columns.tolist()
    if column != categorical_column_name
]


print("\nFeatures identified for transformation:")
print(f"Numerical: {numerical_cols}")
print(f"Categorical (for OHE): {categorical_cols}")


# --- Create Preprocessing Pipelines/Objects (Fitted on Training Data) ---
print("\n--- Creating and Fitting Preprocessing Objects (on Training Data) ---")

# All three transformers are fitted on training rows only, so no statistic of
# the hold-out or submission data reaches the model.

# 1. Median imputer for the numerical columns.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train[numerical_cols])
print("Numerical imputer fitted on training data (no missing data found, so it won't change values).")

# 2. One-hot encoder for the categorical column(s). handle_unknown='ignore'
#    makes categories unseen during fitting encode as all zeros instead of
#    raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_train[categorical_cols])
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)  # names of the generated columns
print(f"Categorical encoder fitted on training data for {categorical_cols}.")
print(f"  Encoder learned {len(encoded_feature_names)} new features.")

# 3. Standard scaler for the numerical columns. It is fitted on the *imputed*
#    training data, because that is exactly what every later transform() call
#    receives. Fitting on the raw columns would give the same statistics only
#    when no value is missing.
scaler = StandardScaler()
X_train_numeric_for_scaler = pd.DataFrame(
    imputer.transform(X_train[numerical_cols]),
    columns=numerical_cols,
    index=X_train.index,
)
scaler.fit(X_train_numeric_for_scaler)
print("Numerical scaler fitted on training data.")


# --- Apply Preprocessing Transformations ---
print("\n--- Applying Preprocessing Transformations ---")

# --- Apply to Training Data ---
# Apply Imputation (no effect here)
X_train_imputed = pd.DataFrame(imputer.transform(X_train[numerical_cols]), columns=numerical_cols, index=X_train.index)

# Apply Scaling
X_train_scaled = scaler.transform(X_train_imputed)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numerical_cols, index=X_train.index)
print(f"Training numerical features scaled. Shape: {X_train_scaled_df.shape}")

# Apply Encoding
X_train_encoded = encoder.transform(X_train[categorical_cols]) # Transform original (not imputed/scaled) categorical
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names, index=X_train.index)
print(f"Training categorical features encoded. Shape: {X_train_encoded_df.shape}")

# Combine scaled numerical and encoded categorical features for training
X_train_processed = pd.concat([X_train_scaled_df, X_train_encoded_df], axis=1)
print(f"\nFinal processed TRAIN features shape: {X_train_processed.shape}")
print("Final processed TRAIN columns (first 10):", X_train_processed.columns.tolist()[:10])


# --- Apply to Testing Data ---
# The hold-out rows are only transformed with the imputer, scaler and encoder
# that were already fitted on the training rows; nothing is re-fitted here.

# Imputation with the fitted imputer.
X_test_imputed = pd.DataFrame(imputer.transform(X_test[numerical_cols]), columns=numerical_cols, index=X_test.index)

# Scaling with the fitted scaler.
X_test_scaled = scaler.transform(X_test_imputed)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numerical_cols, index=X_test.index)
print(f"\nTesting numerical features scaled. Shape: {X_test_scaled_df.shape}")

# Encoding with the fitted encoder.
X_test_encoded = encoder.transform(X_test[categorical_cols])
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names, index=X_test.index)
print(f"Testing categorical features encoded. Shape: {X_test_encoded_df.shape}")

# Combine scaled numerical and encoded categorical features for testing.
X_test_processed = pd.concat([X_test_scaled_df, X_test_encoded_df], axis=1)

# Select the hold-out columns in exactly the training column order, so the
# model sees every feature in the position it was trained on.
if 'X_train_processed' in locals():
    X_test_processed = X_test_processed[X_train_processed.columns]
    # This is the hold-out split, not the submission file; the message used to
    # say "Submission test", copied from the submission step.
    print("Hold-out test columns reordered to match training columns.")
else:
    # Only reachable when the training transformation was not run beforehand.
    print("Warning: X_train_processed not found during test processing.")


print(f"Final processed TEST features shape: {X_test_processed.shape}")
print("Final processed TEST columns (first 10):", X_test_processed.columns.tolist()[:10])

print("\nData Preprocessing Complete!")


# --- Step 4: Model Selection (Starting Group CV Tuning for RF) ---
print("\n--- Step 4: Model Selection (Starting Group CV Tuning for RF) ---")
print("Starting Group K-Fold Cross-Validation tuning for Random Forest Classifier.")


# --- Step 5 & 6: (Previous evaluations are implicitly part of our analysis) ---


# --- Step 7: Hyperparameter Tuning (Randomized Search CV for Random Forest Classifier - Group K-Fold) ---
print("\n--- Step 7: Hyperparameter Tuning (Randomized Search CV for Random Forest Classifier - Group K-Fold) ---")

# The search space is expected to already exist (e.g. from an earlier notebook
# cell), but nothing in this script defines it, so a fresh run would fail with
# a NameError when RandomizedSearchCV is built. A small default space is
# installed only when the name is missing; an existing definition is never
# overwritten.
if 'param_distributions_rf' not in globals():
    param_distributions_rf = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
    }
    print("param_distributions_rf was not defined; using a default Random Forest search space.")

print("Initializing base Random Forest Classifier model for tuning...")
rf_base_model_for_tuning = RandomForestClassifier(random_state=42, n_jobs=-1)

# Group-aware cross-validation: all rows of one group stay in the same fold.
n_splits_gkfold = 6
print(f"Initializing GroupKFold with {n_splits_gkfold} splits.")
gkfold = GroupKFold(n_splits=n_splits_gkfold)

# Randomized search over the parameter space, scored by ROC AUC.
print("Initializing RandomizedSearchCV for Random Forest tuning with GroupKFold...")
random_search_rf_gkfold = RandomizedSearchCV(
    estimator=rf_base_model_for_tuning,
    param_distributions=param_distributions_rf,
    n_iter=10,  # number of sampled parameter combinations (time vs. thoroughness)
    cv=gkfold,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=2
)

# Fit on the processed training data. groups_train must be passed so that
# GroupKFold can keep each group inside a single fold.
print(f"Starting Randomized Search Group K-Fold Cross-Validation for Random Forest with {random_search_rf_gkfold.n_iter} iterations and {gkfold.get_n_splits()} folds ({random_search_rf_gkfold.n_iter * gkfold.get_n_splits()} total fits)...")
print("This tuning respects the 'year' grouping to prevent leakage.")
start_time_rf_tune_gkfold = time.time()

random_search_rf_gkfold.fit(X_train_processed, y_train, groups=groups_train)

end_time_rf_tune_gkfold = time.time()
print("\nRandomized Search Group K-Fold Cross-Validation for Random Forest complete.")
print(f"Random Forest Group CV Tuning took {end_time_rf_tune_gkfold - start_time_rf_tune_gkfold:.4f} seconds.")


# Best parameter combination and its mean cross-validated ROC AUC.
best_params_rf_gkfold = random_search_rf_gkfold.best_params_
best_cv_score_rf_gkfold = random_search_rf_gkfold.best_score_

print("\n--- Randomized Search Results for Random Forest (Group K-Fold Tuning) ---")
print("Best Hyperparameters found:", best_params_rf_gkfold)
print(f"Best Cross-Validated ROC AUC (Group K-Fold): {best_cv_score_rf_gkfold:.4f}")

# Estimator exposed by the search for the best parameter combination.
best_rf_model_gkfold = random_search_rf_gkfold.best_estimator_

print("\nBest tuned (Group K-Fold) Random Forest estimator is ready.")


# --- Step 8: Final Evaluation of the Tuned Model on the Hold-Out Test Set ---
print("\n--- Step 8: Final Evaluation of Tuned Random Forest Model on Hold-Out Test Set ---")

# The estimator selected by the Step 7 search is scored on the hold-out rows,
# which took no part in the cross-validation folds.
print("Evaluating the best tuned Random Forest model (from Group CV) on the UNSEEN hold-out test set...")

# Hard class labels feed the threshold-based metrics.
y_pred_tuned_rf_test_gkfold = best_rf_model_gkfold.predict(X_test_processed)

# Positive-class probability (second column of predict_proba) feeds ROC AUC.
holdout_probabilities = best_rf_model_gkfold.predict_proba(X_test_processed)
y_prob_tuned_rf_test_gkfold = holdout_probabilities[:, 1]

print("Predictions complete. Calculating metrics...")

# Label-based metrics.
accuracy_tuned_rf_gkfold = accuracy_score(y_test, y_pred_tuned_rf_test_gkfold)
precision_tuned_rf_gkfold = precision_score(y_test, y_pred_tuned_rf_test_gkfold)
recall_tuned_rf_gkfold = recall_score(y_test, y_pred_tuned_rf_test_gkfold)
f1_tuned_rf_gkfold = f1_score(y_test, y_pred_tuned_rf_test_gkfold)
# Probability-based metric.
roc_auc_tuned_rf_gkfold = roc_auc_score(y_test, y_prob_tuned_rf_test_gkfold)

# Report every metric with four decimals, in a fixed order.
print("\nTuned Random Forest Classifier Model Evaluation on Hold-Out Test Set (from Group CV Tuning):")
for metric_label, metric_value in (
    ("Accuracy", accuracy_tuned_rf_gkfold),
    ("Precision", precision_tuned_rf_gkfold),
    ("Recall", recall_tuned_rf_gkfold),
    ("F1-Score", f1_tuned_rf_gkfold),
    ("ROC AUC", roc_auc_tuned_rf_gkfold),
):
    print(f"{metric_label}: {metric_value:.4f}")

print("\nFinal evaluation of tuned (Group K-Fold) Random Forest model complete.")


# --- Step 9: Prediction on New Data / Submission ---
print("\n--- Step 9: Prediction on New Data / Submission ---")

# File with the rows that need predictions for the submission.
submission_file_name_input = 'test.csv'

# A missing file is not fatal: df_submission_test is set to None and the code
# that follows checks for it and skips building the submission.
try:
    df_submission_test = pd.read_csv(submission_file_name_input)
except FileNotFoundError:
    print(f"\nError: '{submission_file_name_input}' not found. Cannot generate submission file.")
    df_submission_test = None
else:
    print(f"\nSubmission test data '{submission_file_name_input}' loaded successfully!")


# Build the submission only when the file was loaded and contains rows.
if df_submission_test is not None and not df_submission_test.empty:
    print("\nSubmission Test Data Inspection:")
    print(df_submission_test.head())
    # info() writes its own report and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    df_submission_test.info()

    # The submission rows go through the same preprocessing as the training
    # rows, using the imputer, encoder and scaler fitted in Step 3 and the
    # column lists (numerical_cols, categorical_cols, encoded_feature_names)
    # defined there. Nothing is re-fitted on the submission data.

    df_submission_processed = df_submission_test.copy()

    # Keep the ids for the output file before 'id' is removed from the features.
    submission_ids = df_submission_processed['id']

    # Drop 'id' and 'day' from the submission features, as was done for training.
    submission_feature_cols = [col for col in df_submission_processed.columns if col not in ['id', 'day']]
    df_submission_features = df_submission_processed[submission_feature_cols]


    # --- Apply Missing Value Imputation using the *FITTED* Imputer ---
    print("\n--- Handling Missing Values in Submission Test Data ---")
    X_submission_numerical_imputed = imputer.transform(df_submission_features[numerical_cols])
    X_submission_numerical_imputed_df = pd.DataFrame(X_submission_numerical_imputed, columns=numerical_cols, index=df_submission_features.index)
    print("Missing values imputed using imputer fitted on training data.")


    # --- Apply Categorical Encoding using the *FITTED* Encoder ---
    print("\n--- Encoding Categorical Features in Submission Test Data ---")
    X_submission_encoded = encoder.transform(df_submission_features[categorical_cols])
    # Column names are the ones the encoder learned from the training data.
    X_submission_encoded_df = pd.DataFrame(X_submission_encoded, columns=encoded_feature_names, index=df_submission_features.index)
    print("Categorical features encoded using encoder fitted on training data.")
    print(f"Shape after One-Hot Encoding (Submission Test): {X_submission_encoded_df.shape}")


    # --- Apply Numerical Scaling using the *FITTED* Scaler ---
    print("\n--- Scaling Numerical Features in Submission Test Data ---")
    X_submission_scaled = scaler.transform(X_submission_numerical_imputed_df[numerical_cols])
    X_submission_scaled_df = pd.DataFrame(X_submission_scaled, columns=numerical_cols, index=df_submission_features.index)
    print("Numerical features scaled using scaler fitted on training data.")
    print(f"Shape after Scaling (Submission Test): {X_submission_scaled_df.shape}")


    # --- Combine Processed Features for Submission ---
    print("\n--- Combining Processed Submission Test Features ---")

    X_submission_processed = pd.concat([X_submission_scaled_df, X_submission_encoded_df], axis=1)
    print(f"Processed Submission Test features shape: {X_submission_processed.shape}")

    # Select the columns in exactly the training column order, so the model
    # sees every feature in the position it was trained on.
    if 'X_train_processed' in locals():
        X_submission_processed = X_submission_processed[X_train_processed.columns]
        print("Submission test columns reordered to match training columns.")
    else:
        print("Warning: X_train_processed not found during submission processing. Cannot guarantee submission column order matches training.")


    # --- Make Final Predictions using the Best Model ---
    print("\n--- Making Final Predictions ---")

    # Predict only when the tuned estimator from Step 7 actually exists.
    if 'best_rf_model_gkfold' in locals() and hasattr(best_rf_model_gkfold, 'predict'):
        print("Using the best Group K-Fold tuned model ('best_rf_model_gkfold') to generate predictions.")
        # NOTE(review): hard class labels are written here although Step 7
        # tunes for ROC AUC. If the submission is scored on AUC as well,
        # predict_proba(...)[:, 1] is probably what should be submitted -
        # confirm against the submission format before changing it.
        submission_predictions = best_rf_model_gkfold.predict(X_submission_processed)
        print("Predictions generated for submission.")

        # --- Create Submission File ---
        print("\n--- Creating Submission File ---")

        # Two columns: the original id and the prediction, named after the target.
        submission_df = pd.DataFrame({'id': submission_ids, target_col: submission_predictions})

        submission_file_name_output = 'rainfall_submission_rf_gkfold.csv'

        # index=False keeps the DataFrame index out of the CSV.
        submission_df.to_csv(submission_file_name_output, index=False)

        print(f"Submission file '{submission_file_name_output}' created successfully!")
        print(submission_df.head())

    else:
        print("Error: Best tuned model object ('best_rf_model_gkfold') not found or not fitted. Ensure training/tuning completed.")
        print("Submission file not created.")


else:
    # Tell the two reasons for skipping apart: empty file vs. file not found.
    if df_submission_test is not None:
        print("\nSubmission file not created due to empty submission test data.")
    else:
        print("\nSubmission file not created because 'test.csv' was not found.")


# --- End of Script ---

Setup complete. Importing libraries.

Data 'train.csv' loaded successfully!
Target variable defined as 'rainfall'.

First 5 rows of the loaded data:
   id  day  pressure  maxtemp  temparature  mintemp  dewpoint  humidity  \
0   0    1    1017.4     21.2         20.6     19.9      19.4      87.0   
1   1    2    1019.5     16.2         16.9     15.8      15.4      95.0   
2   2    3    1024.1     19.4         16.1     14.6       9.3      75.0   
3   3    4    1013.4     18.1         17.8     16.9      16.8      95.0   
4   4    5    1021.8     21.3         18.4     15.2       9.6      52.0   

   cloud  sunshine  winddirection  windspeed  rainfall  
0   88.0       1.1           60.0       17.2         1  
1   91.0       0.0           50.0       21.9         1  
2   47.0       8.3           70.0       18.1         1  
3   95.0       0.0           60.0       35.6         1  
4   45.0       3.6           40.0       24.8         0  

--- Step 1: Data Inspection and Cleaning (Initial & Group