In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc,roc_auc_score,classification_report,roc_curve,auc,f1_score
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import optuna
import warnings
from collections import Counter
warnings.simplefilter('ignore')


SEED=95
TRIALS=100
TREES_LOWER=32
TREES_UPPER=64


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
class Config:
    train_link = "train.csv"
    test_link = "test.csv"
    sub_link  = "sample_submission.csv"
    original = "Rainfall.csv"

In [3]:
train = pd.read_csv(Config.train_link, index_col = 'id')
test = pd.read_csv(Config.test_link, index_col = 'id')
original = pd.read_csv(Config.original)

original.columns = [col.strip() for col in original.columns]

rain_map = {'yes':1,
           'no':0}

original['rainfall'] = original['rainfall'].map(rain_map)

original.dropna(inplace = True)

#train = pd.concat([train, original], axis = 0, ignore_index = True)

train = train.fillna(0)

test.winddirection=test.winddirection.fillna(test.winddirection.median())

In [4]:
train.describe()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
count,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0
mean,179.948402,1013.602146,26.365799,23.953059,22.170091,20.454566,82.03653,75.721918,3.744429,104.863151,21.804703,0.753425
std,105.203592,5.655366,5.65433,5.22241,5.05912,5.288406,7.800654,18.026498,3.626327,80.002416,9.898659,0.431116
min,1.0,999.0,10.4,7.4,4.0,-0.3,39.0,2.0,0.0,10.0,4.4,0.0
25%,89.0,1008.6,21.3,19.3,17.7,16.8,77.0,69.0,0.4,40.0,14.125,1.0
50%,178.5,1013.0,27.8,25.5,23.85,22.15,82.0,83.0,2.4,70.0,20.5,1.0
75%,270.0,1017.775,31.2,28.4,26.4,25.0,88.0,88.0,6.8,200.0,27.9,1.0
max,365.0,1034.6,36.0,31.5,29.8,26.7,98.0,100.0,12.1,300.0,59.5,1.0


In [5]:
train['month']=((train["day"] - 1) // 30) + 1
train["season"] = train["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    })

In [6]:
rainmap=train.groupby('day')['rainfall'].mean().to_dict()
season_weights = train.groupby('season')['rainfall'].mean().to_dict()

In [7]:
num_feats=[]

In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def transform_features(df, is_train=True, train_mapping=None):
    '''
    df["wind_x"] = df["windspeed"] * np.cos(np.radians(df["winddirection"]))
    df["wind_y"] = df["windspeed"] * np.sin(np.radians(df["winddirection"]))
    df["stability_index"] = df["maxtemp"] - df["mintemp"]
    df["pressure_drop"] = df["pressure"].diff().fillna(0)
    '''
    df["month"] = ((df["day"] - 1) // 30) + 1
    df["season"] = df["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    }).map(season_weights)

    #df["dew_point_depression"] = df["temparature"] - df["dewpoint"]
    #df["dew_pressure"] = df["pressure"] / (df["dewpoint"] + 1)
    #df["cloud_sun_ratio"] = df["cloud"] / (df["sunshine"] + 1)
    #df['rain_day']=df['day'].map(rainmap)
    
    df.drop(columns=["month", 'day'], inplace=True)
    df.fillna(method="ffill", inplace=True)
    df.fillna(df.median(), inplace=True)

    if is_train:
        X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['rainfall']), 
                                                            df['rainfall'], 
                                                            test_size=0.2, 
                                                            random_state=SEED,
                                                            stratify=df['rainfall'])

        num_feats=list(X_train.columns)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        return X_train, X_test, y_train, y_test,num_feats

    else:
        df.fillna(method="ffill", inplace=True)
        df.fillna(df.median(), inplace=True)
        return df


In [9]:
X_train,X_test,y_train,y_test,num_feats=transform_features(train)
test=transform_features(test,False)

In [10]:
num_feats

['pressure',
 'maxtemp',
 'temparature',
 'mintemp',
 'dewpoint',
 'humidity',
 'cloud',
 'sunshine',
 'winddirection',
 'windspeed',
 'season']

In [11]:
test.head()

Unnamed: 0_level_0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2190,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3,0.729779
2191,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3,0.729779
2192,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9,0.729779
2193,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6,0.729779
2194,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4,0.729779


In [12]:
test.isna().sum().sum()

0

In [13]:
test_mean = test[num_feats].mean(axis =0)
test_std = test[num_feats].std(axis =0)

test[num_feats] -= test_mean
test[num_feats] /= test_std

In [14]:
X_train

array([[-0.27405669,  0.24695983,  0.28717666, ..., -0.4311077 ,
         0.36833592,  1.5309898 ],
       [-0.04302011, -1.31273536, -1.47156354, ..., -0.80452413,
         1.15737724, -0.41479973],
       [-1.00271049,  1.20404552,  1.16654676, ...,  0.56466944,
        -1.24009448,  1.5309898 ],
       ...,
       [-0.36291691,  0.72550267,  0.65039475, ..., -0.68005198,
         0.72239292,  0.00745489],
       [ 0.95221436, -0.85191632, -1.14657893, ..., -0.68005198,
        -0.88603747, -0.41479973],
       [-0.96716641,  0.42419792,  0.32541015, ...,  1.68491871,
        -0.57244412,  1.5309898 ]])

In [15]:
submission=pd.read_csv('sample_submission.csv')

In [16]:
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

def create_model(trial):
    model = Sequential()
    model.add(Dense(units=trial.suggest_int('units_1', 64, 256, log=True),
                    activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=trial.suggest_uniform('dropout_1', 0.2, 0.5)))
    model.add(Dense(units=trial.suggest_int('units_2', 32, 128, log=True),
                    activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(rate=trial.suggest_uniform('dropout_2', 0.2, 0.5)))
    model.add(Dense(units=16, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(units=1, activation='sigmoid'))
    
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

In [17]:
'''
def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)
    score = model.evaluate(X_test, y_test, verbose=1)
    return score[1]
    
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print('Best trial:')
best_trial = study.best_trial
print(f'  Value: {best_trial.value}')
print('  Params: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')'''

"\ndef objective(trial):\n    model = create_model(trial)\n    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)\n    score = model.evaluate(X_test, y_test, verbose=1)\n    return score[1]\n    \nstudy = optuna.create_study(direction='maximize')\nstudy.optimize(objective, n_trials=100, show_progress_bar=True)\n\nprint('Best trial:')\nbest_trial = study.best_trial\nprint(f'  Value: {best_trial.value}')\nprint('  Params: ')\nfor key, value in best_trial.params.items():\n    print(f'    {key}: {value}')"

In [18]:
'''
df_study = pd.DataFrame([
    {**t.params, "AUC": t.value, "Trial Number": t.number} 
    for t in study.trials
])

df_study = df_study.sort_values(by="AUC", ascending=False).reset_index(drop=True)

print(df_study.head(5))

top_model_params = [
    trial.params for trial in sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]
]

print(top_model_params)'''

'\ndf_study = pd.DataFrame([\n    {**t.params, "AUC": t.value, "Trial Number": t.number} \n    for t in study.trials\n])\n\ndf_study = df_study.sort_values(by="AUC", ascending=False).reset_index(drop=True)\n\nprint(df_study.head(5))\n\ntop_model_params = [\n    trial.params for trial in sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]\n]\n\nprint(top_model_params)'

In [19]:
top_model_params=[{'units_1': 92, 'dropout_1': 0.36537034589997436, 'units_2': 66, 'dropout_2': 0.31031514284970424}, {'units_1': 115, 'dropout_1': 0.35976665986533063, 'units_2': 61, 'dropout_2': 0.21245866054640758}, {'units_1': 235, 'dropout_1': 0.42604377318637393, 'units_2': 69, 'dropout_2': 0.2333035285439651}, {'units_1': 118, 'dropout_1': 0.4158563273591014, 'units_2': 51, 'dropout_2': 0.3560913945173838}, {'units_1': 203, 'dropout_1': 0.32328676753873037, 'units_2': 64, 'dropout_2': 0.20090305793683494}]

In [20]:
'''
model_predictions = []

for params in top_model_params[:3]:
    model = Sequential()
    model.add(Dense(units=params['units_1'], activation='relu', kernel_initializer='he_normal', 
                    input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=params['dropout_1']))
    model.add(Dense(units=params['units_2'], activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(rate=params['dropout_2']))
    model.add(Dense(units=16, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(units=1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)

    preds = model.predict(test).flatten()  
    model_predictions.append(preds)

model_predictions = np.array(model_predictions)

final_probs = np.mean(model_predictions, axis=0)  

sub = pd.read_csv(Config.sub_link)
sub['rainfall'] = final_probs  
sub.to_csv('submissionDL.csv', index=False)'''

"\nmodel_predictions = []\n\nfor params in top_model_params[:3]:\n    model = Sequential()\n    model.add(Dense(units=params['units_1'], activation='relu', kernel_initializer='he_normal', \n                    input_shape=(X_train.shape[1],)))\n    model.add(Dropout(rate=params['dropout_1']))\n    model.add(Dense(units=params['units_2'], activation='relu', kernel_initializer='he_normal'))\n    model.add(Dropout(rate=params['dropout_2']))\n    model.add(Dense(units=16, activation='relu', kernel_initializer='he_normal'))\n    model.add(Dense(units=1, activation='sigmoid'))\n\n    optimizer = Adam(learning_rate=0.001)\n    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])\n\n    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)\n    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)\n\n    preds = model.predict(test).flatten()  \n    mode

In [21]:
train = pd.read_csv(Config.train_link, index_col = 'id')
test = pd.read_csv(Config.test_link, index_col = 'id')
original = pd.read_csv(Config.original)

original.columns = [col.strip() for col in original.columns]

rain_map = {'yes':1,
           'no':0}

original['rainfall'] = original['rainfall'].map(rain_map)

original.dropna(inplace = True)

#train = pd.concat([train, original], axis = 0, ignore_index = True)

train = train.fillna(0)

test.winddirection=test.winddirection.fillna(test.winddirection.median())

train['month']=((train["day"] - 1) // 30) + 1
train["season"] = train["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    })

def extract_features(day):
    base_date = pd.Timestamp('2023-01-01')
    date = base_date + pd.Timedelta(days=day - 1)
    month = date.month
    day_of_week = date.dayofweek
    is_weekend = 1 if day_of_week >= 5 else 0
    return month, day_of_week, is_weekend



def transform_features(df, is_train=True, train_mapping=None):
    df[['month', 'day_of_week', 'is_weekend']] = df['day'].apply(lambda x: pd.Series(extract_features(x)))
    # Temperature features
    df['temp_range'] = df['maxtemp'] - df['mintemp']
    df['avg_temp'] = (df['maxtemp'] + df['mintemp']) / 2
    df['temp_deviation'] = df['temparature'] - df['avg_temp']
    
    # Dew point depression
    df['dew_point_depression'] = df['temparature'] - df['dewpoint']
    
    # Wind direction - sine and cosine transformation
    df['wind_dir_rad'] = np.deg2rad(df['winddirection'])
    df['wind_dir_sin'] = np.sin(df['wind_dir_rad'])
    df['wind_dir_cos'] = np.cos(df['wind_dir_rad'])
    df.drop(columns=['wind_dir_rad'], inplace=True)
    
    # Wind chill factor (simplified version)
    df['wind_chill'] = 13.12 + 0.6215 * df['temparature'] - 11.37 * (df['windspeed']**0.16) + 0.3965 * df['temparature'] * (df['windspeed']**0.16)
    
    # Interaction features
    df['humidity_temp'] = df['humidity'] * df['temparature']
    df['cloud_sunshine'] = df['cloud'] * df['sunshine']
    
    # Rolling statistical features
    df['rolling_temp_mean'] = df['avg_temp'].rolling(window=7).mean()
    df['rolling_wind_mean'] = df['windspeed'].rolling(window=7).mean()
    df['rolling_humidity_mean'] = df['humidity'].rolling(window=7).mean()
    
    # Lag features
    df['temp_lag_1'] = df['avg_temp'].shift(1)
    df['humidity_lag_1'] = df['humidity'].shift(1)
    df['windspeed_lag_1'] = df['windspeed'].shift(1)
    
    # Pressure-Temperature interaction
    df['pressure_temp_interaction'] = df['pressure'] * df['avg_temp']
    # Wind-Speed-Temperature interaction
    df['windspeed_temp_interaction'] = df['windspeed'] * df['avg_temp']
    
    # Sunshine-Cloud interaction
    df['sunshine_cloud_interaction'] = df['sunshine'] * df['cloud']
    
    # Season feature
    df['season'] = df['month'].apply(lambda x: 'Spring' if 3 <= x <= 5 else
                                      'Summer' if 6 <= x <= 8 else
                                      'Autumn' if 9 <= x <= 11 else 'Winter')

    for c in ['pressure', 'maxtemp', 'temparature', 'humidity']:
        for gap in [1]:
            df[c+f"_shift{gap}"] = df[c].shift(gap)
            df[c+f"_diff{gap}"] = df[c].diff(gap)

    # Binary encoding for season
    df = pd.get_dummies(df, columns=['season'], drop_first=True)

    # Additional features from transform_features
    df["wind_x"] = df["windspeed"] * np.cos(np.radians(df["winddirection"]))
    df["wind_y"] = df["windspeed"] * np.sin(np.radians(df["winddirection"]))
    df["stability_index"] = df["maxtemp"] - df['mintemp']
    df["pressure_drop"] = df['pressure'].diff().fillna(0)
    df["dew_pressure"] = df["pressure"] / (df["dewpoint"] + 1)
    df["cloud_sun_ratio"] = df["cloud"] / (df["sunshine"] + 1)
    df['rain_day'] = df['day'].map(rainmap)

    df.drop(columns=["month", "day"], inplace=True)
    df.fillna(method="ffill", inplace=True)
    df.fillna(df.median(), inplace=True)
    #print(df.isna().sum())

    if is_train:
        X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['rainfall']),
                                                            df['rainfall'],
                                                            test_size=0.3,
                                                            random_state=SEED,
                                                            stratify=df['rainfall'])

        num_feats = list(X_train.columns)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        return X_train, X_test, y_train, y_test, num_feats

    else:
        return df


X_train,X_test,y_train,y_test,num_feats=transform_features(train)
test=transform_features(test,False)

test_mean = test[num_feats].mean(axis =0)
test_std = test[num_feats].std(axis =0)

test[num_feats] -= test_mean
test[num_feats] /= test_std

In [22]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import optuna

In [23]:
def objective_extra_trees(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),  
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 30),  
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 8, 20), 
        'max_features': trial.suggest_uniform('max_features', 0.1, 0.7),  
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    model = ExtraTreesClassifier(**params, random_state=SEED)
    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)

    return roc_auc


study_extra_trees = optuna.create_study(direction='maximize')
study_extra_trees.optimize(objective_extra_trees, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for ExtraTreesClassifier:", study_extra_trees.best_params)

[I 2025-03-04 10:11:12,043] A new study created in memory with name: no-name-b5514a25-703c-4179-85e3-636d53e7f1aa


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-04 10:11:12,179] Trial 0 finished with value: 0.9313380720788129 and parameters: {'n_estimators': 39, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 17, 'min_samples_leaf': 8, 'max_features': 0.3184529798401361, 'bootstrap': False}. Best is trial 0 with value: 0.9313380720788129.
[I 2025-03-04 10:11:12,318] Trial 1 finished with value: 0.9342187305150268 and parameters: {'n_estimators': 54, 'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 21, 'min_samples_leaf': 8, 'max_features': 0.6856527854118981, 'bootstrap': False}. Best is trial 1 with value: 0.9342187305150268.
[I 2025-03-04 10:11:12,396] Trial 2 finished with value: 0.928856465893503 and parameters: {'n_estimators': 52, 'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 20, 'max_features': 0.3515418931803922, 'bootstrap': False}. Best is trial 1 with value: 0.9342187305150268.
[I 2025-03-04 10:11:12,491] Trial 3 finished with value: 0.9325102880658436 and para

In [24]:
print("\n=== ExtraTreesClassifier Results ===")
extra_trees_model = ExtraTreesClassifier(**study_extra_trees.best_params, random_state=SEED)
extra_trees_model.fit(X_train, y_train)
y_proba_extra = extra_trees_model.predict_proba(X_test)[:, 1]

best_threshold_extra = 0.5
best_f1_extra = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba_extra >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1_extra:
        best_f1_extra = f1
        best_threshold_extra = t

extra_trees_pred = (y_proba_extra >= best_threshold_extra).astype(int)

print(f"Best weighted F1-score: {best_f1_extra:.4f} at threshold {best_threshold_extra:.2f}")
print(f"Best parameters: {study_extra_trees.best_params}")
# print("Classification Report:\n", classification_report(y_test, extra_trees_pred))

fpr, tpr, _ = roc_curve(y_train, extra_trees_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, extra_trees_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall = extra_trees_model.predict_proba(test)[:, 1]
submission.to_csv('submission_extra_trees.csv', index=False)


=== ExtraTreesClassifier Results ===
Best weighted F1-score: 0.8862 at threshold 0.60
Best parameters: {'n_estimators': 61, 'criterion': 'entropy', 'max_depth': 6, 'min_samples_split': 29, 'min_samples_leaf': 8, 'max_features': 0.6295604422375194, 'bootstrap': False}
(0.9532811104239676, 0.9365506921062476)


In [25]:
def objective_lgbm(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 40),  
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.05),  
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 120),  
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),  
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-2, 2),  
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-2, 2), 
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 0.9), 
        'bagging_freq': trial.suggest_int('bagging_freq', 5, 10),  
        'min_gain_to_split': trial.suggest_loguniform('min_gain_to_split', 1e-2, 0.5),  
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'max_depth': trial.suggest_int('max_depth', 3, 8)
    }
    
    model = lgb.LGBMClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    
    return roc_auc

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for LightGBM:", study_lgbm.best_params)


[I 2025-03-04 10:11:26,942] A new study created in memory with name: no-name-2e118d1f-b804-45de-9001-9098eedc36dc


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-04 10:11:27,091] Trial 0 finished with value: 0.9175832398054621 and parameters: {'n_estimators': 63, 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.012257421321940541, 'min_child_samples': 82, 'subsample': 0.8882863492869049, 'colsample_bytree': 0.7780717675880353, 'lambda_l1': 0.38303132535005674, 'lambda_l2': 0.14807666451215273, 'feature_fraction': 0.8775383196912327, 'bagging_freq': 10, 'min_gain_to_split': 0.05689790762765406, 'extra_trees': True, 'max_depth': 5}. Best is trial 0 with value: 0.9175832398054621.
[I 2025-03-04 10:11:27,127] Trial 1 finished with value: 0.9082928045891008 and parameters: {'n_estimators': 49, 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.0012855301685135915, 'min_child_samples': 111, 'subsample': 0.7621107111494121, 'colsample_bytree': 0.6230185042157688, 'lambda_l1': 1.1178849705586253, 'lambda_l2': 0.022924004307375706, 'feature_fraction': 0.8291989122260552, 'bagging_freq': 8, 'min_gain_to_split': 0.243697

In [26]:
print("\n=== LightGBM Results ===")

lgbm_model = lgb.LGBMClassifier(**study_lgbm.best_params, random_state=SEED)
lgbm_model.fit(X_train, y_train)

y_proba = lgbm_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

lgbm_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_lgbm.best_params}")
#print("Classification Report:\n", classification_report(y_test, lgbm_pred))

fpr, tpr, _ = roc_curve(y_train, lgbm_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, lgbm_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall=lgbm_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_lgbm.csv',index=False)


=== LightGBM Results ===
Best weighted F1-score: 0.8833 at threshold 0.60
Best parameters: {'n_estimators': 60, 'boosting_type': 'dart', 'num_leaves': 28, 'learning_rate': 0.036899523795865224, 'min_child_samples': 25, 'subsample': 0.8999211293797946, 'colsample_bytree': 0.7913423386970816, 'lambda_l1': 0.03164717619340073, 'lambda_l2': 0.08862909588863763, 'feature_fraction': 0.6253427250788586, 'bagging_freq': 10, 'min_gain_to_split': 0.02473364844750706, 'extra_trees': False, 'max_depth': 4}
(0.9533818914771295, 0.931525127821424)


In [27]:
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.05),
        'min_child_weight': trial.suggest_int('min_child_weight', 10, 30),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),
        'gamma': trial.suggest_loguniform('gamma', 0.05, 1.5),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.05, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.05, 1.0),
        'max_leaves': trial.suggest_int('max_leaves', 20, 80),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    
    model = xgb.XGBClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    
    return roc_auc

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for XGBoost:", study_xgb.best_params)

[I 2025-03-04 10:11:36,812] A new study created in memory with name: no-name-681ca768-0b25-486c-84f3-648c29708647


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-04 10:11:36,871] Trial 0 finished with value: 0.9195972066342438 and parameters: {'n_estimators': 45, 'max_depth': 7, 'learning_rate': 0.013484555791678622, 'min_child_weight': 27, 'subsample': 0.7104189766617999, 'colsample_bytree': 0.7578324654538978, 'gamma': 1.329276262218789, 'reg_alpha': 0.25656460135965425, 'reg_lambda': 0.16902362684712188, 'max_leaves': 21, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.9195972066342438.
[I 2025-03-04 10:11:36,905] Trial 1 finished with value: 0.9204264870931538 and parameters: {'n_estimators': 47, 'max_depth': 4, 'learning_rate': 0.0127244308396913, 'min_child_weight': 28, 'subsample': 0.7038377860672178, 'colsample_bytree': 0.8343935817425032, 'gamma': 1.4289000321108924, 'reg_alpha': 0.1754606730211744, 'reg_lambda': 0.1043992682111005, 'max_leaves': 40, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.9204264870931538.
[I 2025-03-04 10:11:36,969] Trial 2 finished with value: 0.9224030427734132 and parameter

In [28]:
print("\n=== XGBoost Results ===")

xgb_model = xgb.XGBClassifier(**study_xgb.best_params, random_state=SEED)
xgb_model.fit(X_train, y_train)

y_proba = xgb_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

xgb_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_xgb.best_params}")
#print("Classification Report:\n", classification_report(y_test, xgb_pred))

fpr, tpr, _ = roc_curve(y_train, xgb_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, xgb_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall=xgb_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_xgb.csv',index=False)


=== XGBoost Results ===
Best weighted F1-score: 0.8824 at threshold 0.55
Best parameters: {'n_estimators': 59, 'max_depth': 8, 'learning_rate': 0.041964532599507026, 'min_child_weight': 16, 'subsample': 0.7964325150575641, 'colsample_bytree': 0.6909321843893933, 'gamma': 0.15513755378774668, 'reg_alpha': 0.07620488881369394, 'reg_lambda': 0.4054712897472833, 'max_leaves': 79, 'grow_policy': 'lossguide'}
(0.9479328431709385, 0.9347175458286571)


In [29]:
def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 40),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample'])
    }
    
    model = RandomForestClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)

    return roc_auc

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for Random Forest:", study_rf.best_params)

[I 2025-03-04 10:11:46,545] A new study created in memory with name: no-name-da8de005-4dfc-4f9b-803a-7b40aa12c6dd


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-04 10:11:46,921] Trial 0 finished with value: 0.9188302780895373 and parameters: {'n_estimators': 52, 'max_depth': 11, 'min_samples_split': 14, 'min_samples_leaf': 12, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.9188302780895373.
[I 2025-03-04 10:11:47,201] Trial 1 finished with value: 0.9194163860830527 and parameters: {'n_estimators': 52, 'max_depth': 12, 'min_samples_split': 27, 'min_samples_leaf': 16, 'max_features': 'log2', 'bootstrap': False, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.9194163860830527.
[I 2025-03-04 10:11:47,539] Trial 2 finished with value: 0.9193665045516898 and parameters: {'n_estimators': 52, 'max_depth': 19, 'min_samples_split': 38, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.9194163860830527.
[I 2025-03-04 10:11:47,739] Trial 3 finished with value: 0.9220102257139294 and 

In [30]:
print("\n=== Random Forest Results ===")

rf_model = RandomForestClassifier(**study_rf.best_params, random_state=SEED)
rf_model.fit(X_train, y_train)

y_proba = rf_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

rf_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_rf.best_params}")
#print("Classification Report:\n", classification_report(y_test, rf_pred))

fpr, tpr, _ = roc_curve(y_train, rf_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall=rf_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_rf.csv',index=False)


=== Random Forest Results ===
Best weighted F1-score: 0.8842 at threshold 0.35
Best parameters: {'n_estimators': 56, 'max_depth': 7, 'min_samples_split': 18, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'class_weight': 'balanced'}
(0.9695916076868458, 0.9236563162489088)


In [31]:
print("\n=== Voting Classifier Results ===")

voting_soft = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('ef', extra_trees_model)
    ],
    voting='soft'
)

voting_hard = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('ef', extra_trees_model)
    ],
    voting='hard'
)

# Soft Voting
voting_soft.fit(X_train, y_train)
soft_proba = voting_soft.predict_proba(X_test)[:, 1]

best_threshold_soft = 0.5
best_f1_soft = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (soft_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1_soft:
        best_f1_soft = f1
        best_threshold_soft = t

soft_pred = (soft_proba >= best_threshold_soft).astype(int)

print("\nSoft Voting:")
print(f"Best weighted F1-score: {best_f1_soft:.4f} at threshold {best_threshold_soft:.2f}")
#print("Classification Report:\n", classification_report(y_test, soft_pred))

fpr, tpr, _ = roc_curve(y_train, voting_soft.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, voting_soft.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

# Hard Voting
voting_hard.fit(X_train, y_train)
hard_pred = voting_hard.predict(X_test)

print("\nHard Voting:")
print(f"Weighted F1-score: {f1_score(y_test, hard_pred, average='weighted'):.4f}")
#print("Classification Report:\n", classification_report(y_test, hard_pred))


fpr, tpr, _ = roc_curve(y_train, np.mean([clf.predict_proba(X_train)[:, 1] for clf in voting_hard.estimators_], axis=0))
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, np.mean([clf.predict_proba(X_test)[:, 1] for clf in voting_hard.estimators_], axis=0))
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))


=== Voting Classifier Results ===

Soft Voting:
Best weighted F1-score: 0.8868 at threshold 0.45
(0.9610985134794658, 0.9341813193665045)

Hard Voting:
Weighted F1-score: 0.8819
(0.9610985134794658, 0.9341813193665045)


In [32]:
soft_probabilities = voting_soft.predict_proba(test)[:, 1]  

hard_probabilities = np.mean([clf.predict_proba(test)[:, 1] for clf in voting_hard.estimators_], axis=0)

print("Soft Probabilities Shape:", soft_probabilities.shape)
print("Hard Probabilities Shape:", hard_probabilities.shape)


Soft Probabilities Shape: (730,)
Hard Probabilities Shape: (730,)


In [33]:
submission.rainfall=soft_probabilities
submission.to_csv('submission_soft.csv',index=False)

In [34]:
submission.rainfall=hard_probabilities
submission.to_csv('submission_hard.csv',index=False)

In [35]:
submission.rainfall=(lgbm_model.predict_proba(test)[:, 1]+xgb_model.predict_proba(test)[:, 1]+rf_model.predict_proba(test)[:, 1]+soft_probabilities+hard_probabilities+extra_trees_model.predict_proba(test)[:, 1])/6
submission.to_csv('submission.csv',index=False)