In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Random seed shared by the train/validation split, SMOTE and every model built in this notebook.
SEED=95
# Number of Optuna trials run per study (passed as n_trials to each study.optimize call).
TRIALS=200

In [2]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import auc,roc_auc_score,classification_report,roc_curve,auc,f1_score
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import optuna
import warnings
from collections import Counter
warnings.simplefilter('ignore')

In [3]:
class Config:
    """Relative paths of the CSV files this notebook reads."""

    # Training table (read with 'id' as index).
    train_link = 'train.csv'
    # Test table to predict on (read with 'id' as index).
    test_link = 'test.csv'
    # Submission template.
    sub_link = 'sample_submission.csv'
    # Auxiliary rainfall dataset loaded into `original`.
    original = 'Rainfall.csv'

In [4]:
# Competition train/test tables, indexed by their 'id' column.
train = pd.read_csv(Config.train_link, index_col='id')
test = pd.read_csv(Config.test_link, index_col='id')

# Auxiliary dataset: strip whitespace from the headers, encode the yes/no
# target as 1/0, then discard rows with any missing value.
original = pd.read_csv(Config.original)
original.columns = original.columns.str.strip()

rain_map = {'yes': 1, 'no': 0}
original['rainfall'] = original['rainfall'].map(rain_map)
original = original.dropna()

# NOTE(review): with the concat below disabled, `original` is not used by the
# rest of the visible notebook.
#train = pd.concat([train, original], axis = 0, ignore_index = True)

# NOTE(review): missing training values become 0, while the test set only has
# winddirection imputed (with its median) -- confirm the asymmetry is intended.
train = train.fillna(0)
test['winddirection'] = test['winddirection'].fillna(test['winddirection'].median())

In [5]:
train.describe()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
count,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0
mean,179.948402,1013.602146,26.365799,23.953059,22.170091,20.454566,82.03653,75.721918,3.744429,104.863151,21.804703,0.753425
std,105.203592,5.655366,5.65433,5.22241,5.05912,5.288406,7.800654,18.026498,3.626327,80.002416,9.898659,0.431116
min,1.0,999.0,10.4,7.4,4.0,-0.3,39.0,2.0,0.0,10.0,4.4,0.0
25%,89.0,1008.6,21.3,19.3,17.7,16.8,77.0,69.0,0.4,40.0,14.125,1.0
50%,178.5,1013.0,27.8,25.5,23.85,22.15,82.0,83.0,2.4,70.0,20.5,1.0
75%,270.0,1017.775,31.2,28.4,26.4,25.0,88.0,88.0,6.8,200.0,27.9,1.0
max,365.0,1034.6,36.0,31.5,29.8,26.7,98.0,100.0,12.1,300.0,59.5,1.0


In [6]:
# Mean of the rainfall label for each value of `day` in train, as a dict.
# In the visible notebook it is only referenced by a commented-out line in transform_features.
rainmap=train.groupby('day')['rainfall'].mean().to_dict()

In [7]:
# Placeholder; rebound further down to the feature-name list returned by transform_features(train).
num_feats=[]

In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def transform_features(df, is_train=True, train_mapping=None):
    """Engineer weather features; for training data also split, resample and scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns read below: day, pressure, maxtemp,
        temparature, mintemp, dewpoint, humidity, cloud, sunshine,
        winddirection and windspeed, plus rainfall when ``is_train`` is true.
        The caller's frame is left untouched; all work happens on a copy.
    is_train : bool, default True
        When true, return a stratified, SMOTE-resampled and standardised
        train/hold-out split. When false, return the engineered frame.
    train_mapping : optional
        Accepted for backward compatibility; not used.

    Returns
    -------
    tuple or pandas.DataFrame
        ``is_train=True``: ``(X_train, X_test, y_train, y_test, num_feats)``
        where the X arrays are StandardScaler outputs, ``y_train`` are the
        resampled labels and ``num_feats`` is the list of feature names.
        ``is_train=False``: the engineered DataFrame (unscaled).
    """
    # Work on a copy so the input keeps its raw columns and the cell can be re-run.
    df = df.copy()

    df["dew_point_depression"] = df["temparature"] - df["dewpoint"]
    df["temp_humidity_interaction"] = df["humidity"] * df["temparature"]
    df["dew_pressure"] = df["pressure"] / (df["dewpoint"] + 1)
    df["cloud_sun_ratio"] = df["cloud"] / (df["sunshine"] + 1)
    df["cloud_humidity_interaction"] = (df["humidity"] * df["cloud"]) / 100
    # Decompose the wind vector; winddirection is converted from degrees to radians.
    wind_angle = np.radians(df["winddirection"])
    df["wind_x"] = df["windspeed"] * np.cos(wind_angle)
    df["wind_y"] = df["windspeed"] * np.sin(wind_angle)
    df["stability_index"] = df["maxtemp"] - df["mintemp"]
    df["THI"] = 0.8 * df["temparature"] + (df["humidity"] / 100) * (df["temparature"] - 14.4) + 46.4
    # Row-to-row pressure change; the first row has no predecessor and gets 0.
    df["pressure_drop"] = df["pressure"].diff().fillna(0)
    #df['rain_day']=df['day'].map(rainmap)

    season_weights = {"Winter": 0.6, "Spring": 0.7, "Summer": 0.5, "Fall": 0.9}
    season_by_month = {
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    }
    # 30-day buckets put days 361+ into a non-existent month 13, which used to
    # become NaN and inherit the previous row's season. Clamp into 1..12 so the
    # last days of the year are treated as month 12.
    month = (((df["day"] - 1) // 30) + 1).clip(lower=1, upper=12)
    df["season"] = month.map(season_by_month).map(season_weights)

    df = df.drop(columns=["dewpoint", "sunshine", "mintemp", "maxtemp", "day", "winddirection"])
    # Forward-fill remaining gaps, then fall back to the column median
    # (covers a gap in the very first row).
    df = df.ffill()
    df = df.fillna(df.median())

    if not is_train:
        return df

    features = df.drop(columns=['rainfall'])
    num_feats = list(features.columns)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        df['rainfall'],
                                                        test_size=0.2,
                                                        random_state=SEED,
                                                        stratify=df['rainfall'])

    # Oversample only the training part; the hold-out part keeps its original rows.
    smote = SMOTE(random_state=SEED)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Fit the scaler on the resampled training part and reuse it for the hold-out part.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, num_feats


In [9]:
# Training frame -> scaled arrays, labels and feature names; test frame -> engineered DataFrame.
X_train, X_test, y_train, y_test, num_feats = transform_features(train)
test = transform_features(test, is_train=False)

In [10]:
num_feats

['pressure',
 'temparature',
 'humidity',
 'cloud',
 'windspeed',
 'dew_point_depression',
 'temp_humidity_interaction',
 'dew_pressure',
 'cloud_sun_ratio',
 'cloud_humidity_interaction',
 'wind_x',
 'wind_y',
 'stability_index',
 'THI',
 'pressure_drop',
 'season']

In [11]:
test.head()

Unnamed: 0_level_0,pressure,temparature,humidity,cloud,windspeed,dew_point_depression,temp_humidity_interaction,dew_pressure,cloud_sun_ratio,cloud_humidity_interaction,wind_x,wind_y,stability_index,THI,pressure_drop,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2190,1019.5,15.8,96.0,99.0,24.3,0.9,1516.8,64.119497,99.0,95.04,15.619739,18.61488,4.8,60.384,0.0,0.6
2191,1016.5,16.5,97.0,99.0,35.3,1.4,1600.5,63.136646,99.0,96.03,22.690403,27.041369,1.7,61.637,-3.0,0.6
2192,1023.9,10.4,86.0,96.0,16.9,1.5,894.4,103.424242,96.0,82.56,12.946151,10.863111,1.8,51.28,7.4,0.6
2193,1022.9,17.3,75.0,45.0,50.6,7.8,1297.5,97.419048,5.555556,33.75,47.548447,17.306219,5.4,62.415,-1.0,0.6
2194,1022.2,13.8,68.0,49.0,19.4,9.5,938.4,192.867925,4.803922,33.32,18.230037,6.635191,9.7,57.032,-0.7,0.6


In [12]:
test.isna().sum().sum()

0

In [13]:
# Standardise the engineered test features column by column.
# NOTE(review): this uses the test set's own mean/std, whereas the training
# arrays were scaled inside transform_features by a StandardScaler fitted on
# the resampled training split -- train and test are therefore not scaled with
# the same statistics; confirm this is intended.
test_mean = test[num_feats].mean(axis=0)
test_std = test[num_feats].std(axis=0)

test[num_feats] = (test[num_feats] - test_mean) / test_std

In [14]:
X_train

array([[-0.29485318,  0.24129594,  1.14439751, ...,  0.36036277,
         0.07233155,  0.22085078],
       [-0.06933157, -1.48579383, -0.10649058, ..., -1.49841766,
         0.21006465, -0.48352679],
       [-1.00611365,  1.10484082, -0.23157938, ...,  1.07456058,
        -0.20313464,  0.22085078],
       ...,
       [ 2.25927414, -1.25714816, -0.180273  , ..., -1.27248033,
         2.52352969, -0.48352679],
       [-0.78763232,  1.10205228, -0.1597445 , ...,  1.08236947,
         0.56243452, -0.58815659],
       [ 0.89326689,  0.1033531 , -0.59404079, ...,  0.04335653,
         0.76372873,  1.62960592]])

In [15]:
def build_model(trial):
    """Build and compile a two-hidden-layer binary classifier sized by a trial.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int(name, low, high)``; used to pick the
        widths of the two hidden layers (16-128 and 32-256 units).

    Returns
    -------
    keras.Sequential
        Model with two ReLU hidden layers and a single sigmoid output,
        compiled with RMSprop, binary cross-entropy and an AUC metric.
    """
    num_units_1 = trial.suggest_int('num_units_1', 16, 128)
    num_units_2 = trial.suggest_int('num_units_2', 32, 256)

    model = keras.Sequential([
        keras.layers.Dense(num_units_1, activation="relu"),
        keras.layers.Dense(num_units_2, activation="relu"),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    # Pass the optimizer instance (previously created but never used) and a
    # metric object instead of a string alias, so the metric lookup does not
    # depend on how the installed Keras resolves the name 'auc'.
    model.compile(
        optimizer=keras.optimizers.RMSprop(),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name='auc')],
    )

    return model

In [16]:
'''
def objective(trial):
    model = build_model(trial)
    model.fit(X_train, y_train, epochs=30, batch_size=16, validation_data=(X_test, y_test), verbose=0)
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    return np.mean(auc)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Best Parameters:", study.best_trial.params)'''

'\ndef objective(trial):\n    model = build_model(trial)\n    model.fit(X_train, y_train, epochs=30, batch_size=16, validation_data=(X_test, y_test), verbose=0)\n    y_pred = model.predict(X_test)\n    auc = roc_auc_score(y_test, y_pred)\n    return np.mean(auc)\n\nstudy = optuna.create_study(direction=\'maximize\')\nstudy.optimize(objective, n_trials=100, show_progress_bar=True)\n\nprint("Best Parameters:", study.best_trial.params)'

In [17]:
'''
df_study = pd.DataFrame([
    {**t.params, "AUC": t.value, "Trial Number": t.number} 
    for t in study.trials
])

df_study = df_study.sort_values(by="AUC", ascending=False).reset_index(drop=True)

print(df_study.head(5))

top_model_params = [
    trial.params for trial in sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]
]

print(top_model_params)'''

'\ndf_study = pd.DataFrame([\n    {**t.params, "AUC": t.value, "Trial Number": t.number} \n    for t in study.trials\n])\n\ndf_study = df_study.sort_values(by="AUC", ascending=False).reset_index(drop=True)\n\nprint(df_study.head(5))\n\ntop_model_params = [\n    trial.params for trial in sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]\n]\n\nprint(top_model_params)'

In [18]:
# Hard-coded hidden-layer widths for five networks.
# NOTE(review): presumably pasted from the output of the disabled Optuna search cell -- confirm.
top_model_params=[{'num_units_1': 41, 'num_units_2': 41}, {'num_units_1': 16, 'num_units_2': 174}, {'num_units_1': 52, 'num_units_2': 35}, {'num_units_1': 23, 'num_units_2': 150}, {'num_units_1': 23, 'num_units_2': 129}]

In [19]:
'''
model_predictions = []

for params in top_model_params:
    model = keras.Sequential([
        layers.Dense(params['num_units_1'], activation="relu"),
        layers.Dense(params['num_units_2'], activation="relu"),
        layers.Dense(1, activation="sigmoid")  # Sigmoid gives probability between 0 and 1
    ])

    opt = keras.optimizers.RMSprop()
    model.compile(optimizer='rmsprop', loss="binary_crossentropy", metrics=['AUC'])

    model.fit(X_train, y_train, epochs=30, batch_size=16, verbose=0)

    preds = model.predict(test).flatten() 
    model_predictions.append(preds)

model_predictions = np.array(model_predictions)

final_probs = np.mean(model_predictions, axis=0)  


sub = pd.read_csv(Config.sub_link)
sub['rainfall'] = final_probs  
sub.to_csv('submissionDL.csv', index=False)'''

'\nmodel_predictions = []\n\nfor params in top_model_params:\n    model = keras.Sequential([\n        layers.Dense(params[\'num_units_1\'], activation="relu"),\n        layers.Dense(params[\'num_units_2\'], activation="relu"),\n        layers.Dense(1, activation="sigmoid")  # Sigmoid gives probability between 0 and 1\n    ])\n\n    opt = keras.optimizers.RMSprop()\n    model.compile(optimizer=\'rmsprop\', loss="binary_crossentropy", metrics=[\'AUC\'])\n\n    model.fit(X_train, y_train, epochs=30, batch_size=16, verbose=0)\n\n    preds = model.predict(test).flatten() \n    model_predictions.append(preds)\n\nmodel_predictions = np.array(model_predictions)\n\nfinal_probs = np.mean(model_predictions, axis=0)  \n\n\nsub = pd.read_csv(Config.sub_link)\nsub[\'rainfall\'] = final_probs  \nsub.to_csv(\'submissionDL.csv\', index=False)'

In [20]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import optuna

In [21]:
def objective_lgbm(trial):
    """Optuna objective for LightGBM: fit on the training split, return hold-out ROC AUC.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``SEED``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.

    Returns
    -------
    float
        Area under the ROC curve of the class-1 probabilities on ``X_test``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # parameter names and ranges are unchanged, so existing best_params keys still apply.
    # NOTE(review): LightGBM documents colsample_bytree as an alias of feature_fraction
    # (and subsample as an alias of bagging_fraction), so two of the tuned names may
    # control a single setting -- confirm which value takes effect.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', 70, 95),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 20, 64),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 1, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 5, 15),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 1e-2, 1.0, log=True),
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }

    model = lgb.LGBMClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)

    return roc_auc

# Seed the sampler explicitly: without it the search (and therefore best_params)
# changes on every Restart & Run All even though the models themselves use SEED.
study_lgbm = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_lgbm.optimize(objective_lgbm, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for LightGBM:", study_lgbm.best_params)


[I 2025-03-02 16:47:10,773] A new study created in memory with name: no-name-87da5c16-22cd-4679-b524-2e71ac6782e3


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-03-02 16:47:10,985] Trial 0 finished with value: 0.901571268237935 and parameters: {'n_estimators': 87, 'boosting_type': 'gbdt', 'num_leaves': 21, 'learning_rate': 0.005738914205740304, 'min_child_samples': 84, 'subsample': 0.8650508155402625, 'colsample_bytree': 0.5424743745574522, 'lambda_l1': 0.015959091728336518, 'lambda_l2': 0.028235094597217285, 'feature_fraction': 0.5078532792437747, 'bagging_freq': 8, 'min_gain_to_split': 0.5871995972781434, 'extra_trees': False, 'max_depth': 6}. Best is trial 0 with value: 0.901571268237935.
[I 2025-03-02 16:47:11,028] Trial 1 finished with value: 0.9000280583613918 and parameters: {'n_estimators': 70, 'boosting_type': 'gbdt', 'num_leaves': 43, 'learning_rate': 0.04936271535936353, 'min_child_samples': 10, 'subsample': 0.5767372820710719, 'colsample_bytree': 0.5344451061467247, 'lambda_l1': 0.47157151796290536, 'lambda_l2': 0.004036847645855894, 'feature_fraction': 0.5096088759589695, 'bagging_freq': 10, 'min_gain_to_split': 0.23748215

In [22]:
def objective_xgb(trial):
    """Optuna objective for XGBoost: fit on the training split, return hold-out ROC AUC.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``SEED``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.

    Returns
    -------
    float
        Area under the ROC curve of the class-1 probabilities on ``X_test``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # parameter names and ranges are unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 70, 95),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        # NOTE(review): legacy flag; check whether the installed XGBoost still accepts it.
        'use_label_encoder': False,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.5, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 0.5, log=True),
        'max_leaves': trial.suggest_int('max_leaves', 30, 100),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }

    model = xgb.XGBClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)

    return roc_auc

# Seed the sampler explicitly: without it the search (and therefore best_params)
# changes on every Restart & Run All even though the models themselves use SEED.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_xgb.optimize(objective_xgb, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for XGBoost:", study_xgb.best_params)


[I 2025-03-02 16:47:36,108] A new study created in memory with name: no-name-4e5b6d90-40aa-4ca4-9caa-bee2e1f4a6a4


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-03-02 16:47:36,273] Trial 0 finished with value: 0.9011503928170596 and parameters: {'n_estimators': 83, 'max_depth': 7, 'learning_rate': 0.0010230501027666608, 'min_child_weight': 8, 'subsample': 0.5848569892663676, 'colsample_bytree': 0.7089995414695441, 'gamma': 0.010803477249724473, 'reg_alpha': 0.05140226171741003, 'reg_lambda': 0.06445435656534054, 'max_leaves': 63, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.9011503928170596.
[I 2025-03-02 16:47:36,371] Trial 1 finished with value: 0.9027216610549944 and parameters: {'n_estimators': 82, 'max_depth': 9, 'learning_rate': 0.002466323022189553, 'min_child_weight': 15, 'subsample': 0.590493288158056, 'colsample_bytree': 0.5576880098376481, 'gamma': 0.016544373240642115, 'reg_alpha': 0.24228574813252765, 'reg_lambda': 0.4397196417964742, 'max_leaves': 100, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.9027216610549944.
[I 2025-03-02 16:47:36,441] Trial 2 finished with value: 0.9015151515151515 and p

In [23]:
def objective_rf(trial):
    """Optuna objective for a random forest: fit on the training split, return hold-out ROC AUC.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``SEED``.
    """
    # Keyword order matters only in that it fixes the order of the suggest_* calls.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 128, 512),
        max_depth=trial.suggest_int('max_depth', 5, 50),
        min_samples_split=trial.suggest_int('min_samples_split', 5, 30),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 2, 15),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
        class_weight=trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample']),
    )

    forest = RandomForestClassifier(random_state=SEED, **search_space)
    forest.fit(X_train, y_train)

    # Score with the class-1 probability on the hold-out split.
    rain_scores = forest.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, rain_scores)
    return auc(false_pos_rate, true_pos_rate)

# Seed the sampler explicitly: without it the search (and therefore best_params)
# changes on every Restart & Run All even though the models themselves use SEED.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_rf.optimize(objective_rf, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for Random Forest:", study_rf.best_params)


[I 2025-03-02 16:48:08,157] A new study created in memory with name: no-name-4a5b34ea-43ef-47e1-8a9d-c3cba257cca3


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-03-02 16:48:10,845] Trial 0 finished with value: 0.8967452300785634 and parameters: {'n_estimators': 482, 'max_depth': 5, 'min_samples_split': 30, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': True, 'class_weight': None}. Best is trial 0 with value: 0.8967452300785634.
[I 2025-03-02 16:48:13,528] Trial 1 finished with value: 0.8762065095398429 and parameters: {'n_estimators': 201, 'max_depth': 49, 'min_samples_split': 13, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.8967452300785634.
[I 2025-03-02 16:48:15,038] Trial 2 finished with value: 0.8888047138047138 and parameters: {'n_estimators': 188, 'max_depth': 28, 'min_samples_split': 25, 'min_samples_leaf': 11, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': None}. Best is trial 0 with value: 0.8967452300785634.
[I 2025-03-02 16:48:18,706] Trial 3 finished with value: 0.888327721661055 and parameters: {'n_estimators': 331,

In [24]:
print("\n=== LightGBM Results ===")

# Refit LightGBM with the best trial's parameters and score the hold-out split.
lgbm_model = lgb.LGBMClassifier(random_state=SEED, **study_lgbm.best_params)
lgbm_model.fit(X_train, y_train)
y_proba = lgbm_model.predict_proba(X_test)[:, 1]

# Weighted F1 for every candidate threshold; keep the first threshold that
# reaches the maximum. (0.5, 0) stays as the fallback if nothing scores above 0.
candidate_thresholds = np.arange(0.1, 0.9, 0.05)
f1_by_threshold = [
    f1_score(y_test, (y_proba >= cut).astype(int), average='weighted')
    for cut in candidate_thresholds
]
best_threshold, best_f1 = 0.5, 0
best_idx = int(np.argmax(f1_by_threshold))
if f1_by_threshold[best_idx] > best_f1:
    best_threshold = candidate_thresholds[best_idx]
    best_f1 = f1_by_threshold[best_idx]

lgbm_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_lgbm.best_params}")
#print("Classification Report:\n", classification_report(y_test, lgbm_pred))


=== LightGBM Results ===
Best weighted F1-score: 0.8506 at threshold 0.50
Best parameters: {'n_estimators': 82, 'boosting_type': 'dart', 'num_leaves': 50, 'learning_rate': 0.0021508352968901295, 'min_child_samples': 47, 'subsample': 0.644493230050457, 'colsample_bytree': 0.5514943925341776, 'lambda_l1': 0.05668387006714283, 'lambda_l2': 0.003973939253541027, 'feature_fraction': 0.6657362441524557, 'bagging_freq': 9, 'min_gain_to_split': 0.01479800354142247, 'extra_trees': False, 'max_depth': 7}


In [25]:
print("\n=== XGBoost Results ===")

# Refit XGBoost with the best trial's parameters and score the hold-out split.
xgb_model = xgb.XGBClassifier(random_state=SEED, **study_xgb.best_params)
xgb_model.fit(X_train, y_train)
y_proba = xgb_model.predict_proba(X_test)[:, 1]

# Weighted F1 for every candidate threshold; keep the first threshold that
# reaches the maximum. (0.5, 0) stays as the fallback if nothing scores above 0.
candidate_thresholds = np.arange(0.1, 0.9, 0.05)
f1_by_threshold = [
    f1_score(y_test, (y_proba >= cut).astype(int), average='weighted')
    for cut in candidate_thresholds
]
best_threshold, best_f1 = 0.5, 0
best_idx = int(np.argmax(f1_by_threshold))
if f1_by_threshold[best_idx] > best_f1:
    best_threshold = candidate_thresholds[best_idx]
    best_f1 = f1_by_threshold[best_idx]

xgb_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_xgb.best_params}")
#print("Classification Report:\n", classification_report(y_test, xgb_pred))


=== XGBoost Results ===
Best weighted F1-score: 0.8659 at threshold 0.45
Best parameters: {'n_estimators': 90, 'max_depth': 6, 'learning_rate': 0.01249158692561221, 'min_child_weight': 20, 'subsample': 0.831158113862705, 'colsample_bytree': 0.5002145516964018, 'gamma': 0.0601508720284904, 'reg_alpha': 0.049133351280181634, 'reg_lambda': 0.03298442349590514, 'max_leaves': 83, 'grow_policy': 'depthwise'}


In [26]:
print("\n=== Random Forest Results ===")

# Refit the random forest with the best trial's parameters and score the hold-out split.
rf_model = RandomForestClassifier(random_state=SEED, **study_rf.best_params)
rf_model.fit(X_train, y_train)
y_proba = rf_model.predict_proba(X_test)[:, 1]

# Weighted F1 for every candidate threshold; keep the first threshold that
# reaches the maximum. (0.5, 0) stays as the fallback if nothing scores above 0.
candidate_thresholds = np.arange(0.1, 0.9, 0.05)
f1_by_threshold = [
    f1_score(y_test, (y_proba >= cut).astype(int), average='weighted')
    for cut in candidate_thresholds
]
best_threshold, best_f1 = 0.5, 0
best_idx = int(np.argmax(f1_by_threshold))
if f1_by_threshold[best_idx] > best_f1:
    best_threshold = candidate_thresholds[best_idx]
    best_f1 = f1_by_threshold[best_idx]

rf_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_rf.best_params}")
#print("Classification Report:\n", classification_report(y_test, rf_pred))


=== Random Forest Results ===
Best weighted F1-score: 0.8713 at threshold 0.35
Best parameters: {'n_estimators': 262, 'max_depth': 5, 'min_samples_split': 25, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'bootstrap': True, 'class_weight': 'balanced'}


In [27]:
print("\n=== Voting Classifier Results ===")

# Both ensembles are built from the same three tuned base models.
base_estimators = [('lgbm', lgbm_model), ('xgb', xgb_model), ('rf', rf_model)]
voting_soft = VotingClassifier(estimators=list(base_estimators), voting='soft')
voting_hard = VotingClassifier(estimators=list(base_estimators), voting='hard')

# Soft Voting
voting_soft.fit(X_train, y_train)
soft_proba = voting_soft.predict_proba(X_test)[:, 1]

# Weighted F1 for every candidate threshold; keep the first threshold that
# reaches the maximum. (0.5, 0) stays as the fallback if nothing scores above 0.
candidate_thresholds = np.arange(0.1, 0.9, 0.05)
f1_by_threshold = [
    f1_score(y_test, (soft_proba >= cut).astype(int), average='weighted')
    for cut in candidate_thresholds
]
best_threshold_soft, best_f1_soft = 0.5, 0
best_idx = int(np.argmax(f1_by_threshold))
if f1_by_threshold[best_idx] > best_f1_soft:
    best_threshold_soft = candidate_thresholds[best_idx]
    best_f1_soft = f1_by_threshold[best_idx]

soft_pred = (soft_proba >= best_threshold_soft).astype(int)

print("\nSoft Voting:")
print(f"Best weighted F1-score: {best_f1_soft:.4f} at threshold {best_threshold_soft:.2f}")
#print("Classification Report:\n", classification_report(y_test, soft_pred))

# Hard Voting: predicted labels only, so there is no threshold to tune.
voting_hard.fit(X_train, y_train)
hard_pred = voting_hard.predict(X_test)

print("\nHard Voting:")
print(f"Weighted F1-score: {f1_score(y_test, hard_pred, average='weighted'):.4f}")
#print("Classification Report:\n", classification_report(y_test, hard_pred))


=== Voting Classifier Results ===

Soft Voting:
Best weighted F1-score: 0.8655 at threshold 0.45

Hard Voting:
Weighted F1-score: 0.8502


In [28]:
# Class-1 probabilities for the competition test set from the soft-voting ensemble.
soft_probabilities = voting_soft.predict_proba(test)[:, 1]

# NOTE(review): despite the name, this is an unweighted mean of the class-1
# probabilities of the hard-voting ensemble's fitted members, not a majority
# vote -- it likely duplicates the soft-voting output; confirm.
member_probabilities = [member.predict_proba(test)[:, 1] for member in voting_hard.estimators_]
hard_probabilities = np.mean(member_probabilities, axis=0)

print("Soft Probabilities Shape:", soft_probabilities.shape)
print("Hard Probabilities Shape:", hard_probabilities.shape)


Soft Probabilities Shape: (730,)
Hard Probabilities Shape: (730,)


In [29]:
# Submission template; use the path from Config instead of repeating the literal file name.
submission = pd.read_csv(Config.sub_link)

In [30]:
# LightGBM-only submission. Item assignment is used because attribute assignment
# would create a plain attribute instead of a column if 'rainfall' were absent.
submission['rainfall'] = lgbm_model.predict_proba(test)[:, 1]
submission.to_csv('submission_lgbm.csv', index=False)

In [31]:
# XGBoost-only submission. Item assignment is used because attribute assignment
# would create a plain attribute instead of a column if 'rainfall' were absent.
submission['rainfall'] = xgb_model.predict_proba(test)[:, 1]
submission.to_csv('submission_xgb.csv', index=False)

In [32]:
# Random-forest-only submission. Item assignment is used because attribute assignment
# would create a plain attribute instead of a column if 'rainfall' were absent.
submission['rainfall'] = rf_model.predict_proba(test)[:, 1]
submission.to_csv('submission_rf.csv', index=False)

In [33]:
# Soft-voting ensemble submission. Item assignment is used because attribute assignment
# would create a plain attribute instead of a column if 'rainfall' were absent.
submission['rainfall'] = soft_probabilities
submission.to_csv('submission_soft.csv', index=False)

In [34]:
# Submission from the averaged probabilities of the hard-voting ensemble's members.
# Item assignment is used because attribute assignment would create a plain
# attribute instead of a column if 'rainfall' were absent.
submission['rainfall'] = hard_probabilities
submission.to_csv('submission_hard.csv', index=False)

In [35]:
# Equal-weight blend of the three individually fitted models' class-1 probabilities.
# Item assignment is used because attribute assignment would create a plain
# attribute instead of a column if 'rainfall' were absent.
blend_members = (lgbm_model, xgb_model, rf_model)
submission['rainfall'] = sum(member.predict_proba(test)[:, 1] for member in blend_members) / 3
submission.to_csv('submission.csv', index=False)