In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc,roc_auc_score,classification_report,roc_curve,auc,f1_score
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import optuna
import warnings
from collections import Counter
warnings.simplefilter('ignore')


SEED=95
TRIALS=100
TREES_LOWER=32
TREES_UPPER=64


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [19]:
class Config:
    train_link = "train.csv"
    test_link = "test.csv"
    sub_link  = "sample_submission.csv"
    original = "Rainfall.csv"

In [20]:
train = pd.read_csv(Config.train_link, index_col = 'id')
test = pd.read_csv(Config.test_link, index_col = 'id')
original = pd.read_csv(Config.original)

original.columns = [col.strip() for col in original.columns]

rain_map = {'yes':1,
           'no':0}

original['rainfall'] = original['rainfall'].map(rain_map)

original.dropna(inplace = True)

train = pd.concat([train, original], axis = 0, ignore_index = True)

train = train.fillna(0)

test.winddirection=test.winddirection.fillna(test.winddirection.median())

In [21]:
train.describe()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
count,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0
mean,156.495108,1013.625362,26.338708,23.921918,22.128885,20.385832,81.768689,75.062622,3.840039,104.383679,21.766458,0.742857
std,113.131808,5.768099,5.701105,5.282829,5.139142,5.396821,8.187895,18.681667,3.67917,80.242979,9.921727,0.437144
min,1.0,998.5,7.1,4.9,3.1,-0.4,36.0,0.0,0.0,10.0,4.4,0.0
25%,44.5,1008.6,21.3,19.3,17.6,16.8,77.0,68.0,0.4,40.0,14.1,0.0
50%,148.0,1013.0,27.8,25.5,23.8,22.1,81.0,83.0,2.4,70.0,20.5,1.0
75%,255.0,1017.8,31.2,28.4,26.5,25.0,87.0,88.0,7.0,200.0,27.9,1.0
max,365.0,1034.6,36.3,32.4,30.0,26.7,98.0,100.0,12.1,350.0,59.5,1.0


In [22]:
train['month']=((train["day"] - 1) // 30) + 1
train["season"] = train["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    })

In [23]:
rainmap=train.groupby('day')['rainfall'].mean().to_dict()
season_weights = train.groupby('season')['rainfall'].mean().to_dict()

In [24]:
num_feats=[]

In [25]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def transform_features(df, is_train=True, train_mapping=None):
    '''
    df["wind_x"] = df["windspeed"] * np.cos(np.radians(df["winddirection"]))
    df["wind_y"] = df["windspeed"] * np.sin(np.radians(df["winddirection"]))
    df["stability_index"] = df["maxtemp"] - df["mintemp"]
    df["pressure_drop"] = df["pressure"].diff().fillna(0)
    '''
    df["month"] = ((df["day"] - 1) // 30) + 1
    df["season"] = df["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    }).map(season_weights)

    #df["dew_point_depression"] = df["temparature"] - df["dewpoint"]
    #df["dew_pressure"] = df["pressure"] / (df["dewpoint"] + 1)
    #df["cloud_sun_ratio"] = df["cloud"] / (df["sunshine"] + 1)
    df['rain_day']=df['day'].map(rainmap)
    
    df.drop(columns=["month", 'day'], inplace=True)
    #df.drop(columns=["month"], inplace=True)
    df.fillna(method="ffill", inplace=True)
    df.fillna(df.median(), inplace=True)

    if is_train:
        X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['rainfall']), 
                                                            df['rainfall'], 
                                                            test_size=0.2, 
                                                            random_state=SEED,
                                                            stratify=df['rainfall'])

        num_feats=list(X_train.columns)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        return X_train, X_test, y_train, y_test,num_feats

    else:
        df.fillna(method="ffill", inplace=True)
        df.fillna(df.median(), inplace=True)
        return df


In [26]:
X_train,X_test,y_train,y_test,num_feats=transform_features(train)
test=transform_features(test,False)

In [27]:
num_feats

['pressure',
 'maxtemp',
 'temparature',
 'mintemp',
 'dewpoint',
 'humidity',
 'cloud',
 'sunshine',
 'winddirection',
 'windspeed',
 'season',
 'rain_day']

In [28]:
test.head()

Unnamed: 0_level_0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,season,rain_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2190,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3,0.709571,0.666667
2191,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3,0.709571,0.833333
2192,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9,0.709571,0.789474
2193,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6,0.709571,0.684211
2194,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4,0.709571,0.684211


In [29]:
test.isna().sum().sum()

0

In [30]:
test_mean = test[num_feats].mean(axis =0)
test_std = test[num_feats].std(axis =0)

test[num_feats] -= test_mean
test[num_feats] /= test_std

In [31]:
X_train

array([[ 0.47772884,  0.14879078,  0.27883038, ..., -1.36517409,
         0.21991264,  0.43239296],
       [ 0.52967375,  0.06073462, -0.15722876, ..., -0.50233368,
         1.73870339, -0.14706745],
       [ 1.55125699, -1.18966277, -1.38956982, ...,  0.72169573,
        -0.60282571, -0.37885162],
       ...,
       [ 0.16605937, -1.68277724, -1.59811985, ..., -0.07091348,
        -0.60282571,  1.24363755],
       [-2.34461131,  1.29352079,  1.16990777, ...,  0.69159665,
        -0.97454715,  0.43239296],
       [-0.76894902, -0.0097103 , -0.11931058, ..., -0.11104559,
        -0.60282571,  0.43239296]])

In [32]:
submission=pd.read_csv('sample_submission.csv')

In [33]:
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

def create_model(trial):
    model = Sequential()
    model.add(Dense(units=trial.suggest_int('units_1', 110, 256, log=True),
                    activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
    model.add(Dropout(.35))
    model.add(Dense(units=trial.suggest_int('units_2', 53, 128, log=True),
                    activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(.35))
    model.add(Dense(units=32, activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(.3))
    model.add(Dense(units=16, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(units=1, activation='sigmoid'))
    
    
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

In [None]:
def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)
    score = model.evaluate(X_test, y_test, verbose=1)
    return score[1]
    
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print('Best trial:')
best_trial = study.best_trial
print(f'  Value: {best_trial.value}')
print('  Params: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')

[I 2025-03-05 10:16:23,657] A new study created in memory with name: no-name-18f22709-ae6b-4201-85b8-05af563a115e


  0%|          | 0/100 [00:00<?, ?it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9140 - loss: 0.2510 
[I 2025-03-05 10:16:32,304] Trial 0 finished with value: 0.9041095972061157 and parameters: {'units_1': 144, 'units_2': 53}. Best is trial 0 with value: 0.9041095972061157.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9172 - loss: 0.2627 
[I 2025-03-05 10:16:44,571] Trial 1 finished with value: 0.9119373559951782 and parameters: {'units_1': 112, 'units_2': 65}. Best is trial 1 with value: 0.9119373559951782.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9133 - loss: 0.2565 
[I 2025-03-05 10:16:51,804] Trial 2 finished with value: 0.9001957178115845 and parameters: {'units_1': 119, 'units_2': 101}. Best is trial 1 with value: 0.9119373559951782.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9109 - loss: 0.2605 
[I 2025-03-05 10:17:01,647] Trial 3 finished wit

In [None]:
df_study = pd.DataFrame([
    {**t.params, "AUC": t.value, "Trial Number": t.number} 
    for t in study.trials
])

df_study = df_study.sort_values(by="AUC", ascending=False).reset_index(drop=True)

print(df_study.head(5))

top_model_params = [
    trial.params for trial in sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]
]

print(top_model_params)

   units_1  dropout_1  units_2  dropout_2       AUC  Trial Number
0      227   0.277655       46   0.424483  0.878995             5
1      240   0.370481       50   0.277941  0.878995            65
2      202   0.453337       46   0.323742  0.878995            63
3       86   0.473484      102   0.330211  0.878995            30
4      238   0.424672       34   0.353323  0.876712            56
[{'units_1': 227, 'dropout_1': 0.27765503657786833, 'units_2': 46, 'dropout_2': 0.4244830377657848}, {'units_1': 86, 'dropout_1': 0.47348442354111114, 'units_2': 102, 'dropout_2': 0.3302108865667155}, {'units_1': 202, 'dropout_1': 0.4533367053750445, 'units_2': 46, 'dropout_2': 0.32374184345307977}, {'units_1': 240, 'dropout_1': 0.3704805214196765, 'units_2': 50, 'dropout_2': 0.277941167149058}, {'units_1': 71, 'dropout_1': 0.23605053728138425, 'units_2': 49, 'dropout_2': 0.3041498413928818}]


In [None]:
#top_model_params=[{'units_1': 92, 'dropout_1': 0.36537034589997436, 'units_2': 66, 'dropout_2': 0.31031514284970424}, {'units_1': 115, 'dropout_1': 0.35976665986533063, 'units_2': 61, 'dropout_2': 0.21245866054640758}, {'units_1': 235, 'dropout_1': 0.42604377318637393, 'units_2': 69, 'dropout_2': 0.2333035285439651}, {'units_1': 118, 'dropout_1': 0.4158563273591014, 'units_2': 51, 'dropout_2': 0.3560913945173838}, {'units_1': 203, 'dropout_1': 0.32328676753873037, 'units_2': 64, 'dropout_2': 0.20090305793683494}]

In [None]:
model_predictions = []

for params in top_model_params[:3]:
    model = Sequential()
    model.add(Dense(units=params['units_1'], activation='relu', kernel_initializer='he_normal', 
                    input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=params['dropout_1']))
    model.add(Dense(units=params['units_2'], activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(rate=params['dropout_2']))
    model.add(Dense(units=16, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(units=1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)

    preds = model.predict(test).flatten()  
    model_predictions.append(preds)

model_predictions = np.array(model_predictions)

final_probs = np.mean(model_predictions, axis=0)  

sub = pd.read_csv(Config.sub_link)
sub['rainfall'] = final_probs  
sub.to_csv('submissionDL.csv', index=False)

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [None]:
train = pd.read_csv(Config.train_link, index_col = 'id')
test = pd.read_csv(Config.test_link, index_col = 'id')
original = pd.read_csv(Config.original)

original.columns = [col.strip() for col in original.columns]

rain_map = {'yes':1,
           'no':0}

original['rainfall'] = original['rainfall'].map(rain_map)

original.dropna(inplace = True)

#train = pd.concat([train, original], axis = 0, ignore_index = True)

train = train.fillna(0)

test.winddirection=test.winddirection.fillna(test.winddirection.median())

train['month']=((train["day"] - 1) // 30) + 1
train["season"] = train["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    })

def extract_features(day):
    base_date = pd.Timestamp('2023-01-01')
    date = base_date + pd.Timedelta(days=day - 1)
    month = date.month
    day_of_week = date.dayofweek
    is_weekend = 1 if day_of_week >= 5 else 0
    return month, day_of_week, is_weekend



def transform_features(df, is_train=True, train_mapping=None):
    df[['month', 'day_of_week', 'is_weekend']] = df['day'].apply(lambda x: pd.Series(extract_features(x)))
    # Temperature features
    df['temp_range'] = df['maxtemp'] - df['mintemp']
    df['avg_temp'] = (df['maxtemp'] + df['mintemp']) / 2
    df['temp_deviation'] = df['temparature'] - df['avg_temp']
    
    # Dew point depression
    df['dew_point_depression'] = df['temparature'] - df['dewpoint']
    
    # Wind direction - sine and cosine transformation
    df['wind_dir_rad'] = np.deg2rad(df['winddirection'])
    df['wind_dir_sin'] = np.sin(df['wind_dir_rad'])
    df['wind_dir_cos'] = np.cos(df['wind_dir_rad'])
    df.drop(columns=['wind_dir_rad'], inplace=True)
    
    # Wind chill factor (simplified version)
    df['wind_chill'] = 13.12 + 0.6215 * df['temparature'] - 11.37 * (df['windspeed']**0.16) + 0.3965 * df['temparature'] * (df['windspeed']**0.16)
    
    # Interaction features
    df['humidity_temp'] = df['humidity'] * df['temparature']
    df['cloud_sunshine'] = df['cloud'] * df['sunshine']
    
    # Rolling statistical features
    df['rolling_temp_mean'] = df['avg_temp'].rolling(window=7).mean()
    df['rolling_wind_mean'] = df['windspeed'].rolling(window=7).mean()
    df['rolling_humidity_mean'] = df['humidity'].rolling(window=7).mean()
    
    # Lag features
    df['temp_lag_1'] = df['avg_temp'].shift(1)
    df['humidity_lag_1'] = df['humidity'].shift(1)
    df['windspeed_lag_1'] = df['windspeed'].shift(1)
    
    # Pressure-Temperature interaction
    df['pressure_temp_interaction'] = df['pressure'] * df['avg_temp']
    # Wind-Speed-Temperature interaction
    df['windspeed_temp_interaction'] = df['windspeed'] * df['avg_temp']
    
    # Sunshine-Cloud interaction
    df['sunshine_cloud_interaction'] = df['sunshine'] * df['cloud']
    
    # Season feature
    df['season'] = df['month'].apply(lambda x: 'Spring' if 3 <= x <= 5 else
                                      'Summer' if 6 <= x <= 8 else
                                      'Autumn' if 9 <= x <= 11 else 'Winter')

    for c in ['pressure', 'maxtemp', 'temparature', 'humidity']:
        for gap in [1]:
            df[c+f"_shift{gap}"] = df[c].shift(gap)
            df[c+f"_diff{gap}"] = df[c].diff(gap)

    # Binary encoding for season
    df = pd.get_dummies(df, columns=['season'], drop_first=True)

    # Additional features from transform_features
    df["wind_x"] = df["windspeed"] * np.cos(np.radians(df["winddirection"]))
    df["wind_y"] = df["windspeed"] * np.sin(np.radians(df["winddirection"]))
    df["stability_index"] = df["maxtemp"] - df['mintemp']
    df["pressure_drop"] = df['pressure'].diff().fillna(0)
    df["dew_pressure"] = df["pressure"] / (df["dewpoint"] + 1)
    df["cloud_sun_ratio"] = df["cloud"] / (df["sunshine"] + 1)
    df['rain_day'] = df['day'].map(rainmap)

    df.drop(columns=["month", "day"], inplace=True)
    df.fillna(method="ffill", inplace=True)
    df.fillna(df.median(), inplace=True)
    #print(df.isna().sum())

    if is_train:
        X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['rainfall']),
                                                            df['rainfall'],
                                                            test_size=0.3,
                                                            random_state=SEED,
                                                            stratify=df['rainfall'])

        num_feats = list(X_train.columns)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        return X_train, X_test, y_train, y_test, num_feats

    else:
        return df


X_train,X_test,y_train,y_test,num_feats=transform_features(train)
test=transform_features(test,False)

test_mean = test[num_feats].mean(axis =0)
test_std = test[num_feats].std(axis =0)

test[num_feats] -= test_mean
test[num_feats] /= test_std

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import optuna

In [None]:
def objective_extra_trees(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),  
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 30),  
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 8, 20), 
        'max_features': trial.suggest_uniform('max_features', 0.1, 0.7),  
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    model = ExtraTreesClassifier(**params, random_state=SEED)
    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)

    return roc_auc


study_extra_trees = optuna.create_study(direction='maximize')
study_extra_trees.optimize(objective_extra_trees, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for ExtraTreesClassifier:", study_extra_trees.best_params)

[I 2025-03-05 10:01:49,241] A new study created in memory with name: no-name-97ac9c9e-e82a-404b-aed5-c57b40cfb1c7


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-05 10:01:49,327] Trial 0 finished with value: 0.919553560294301 and parameters: {'n_estimators': 39, 'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 29, 'min_samples_leaf': 11, 'max_features': 0.34816782611146035, 'bootstrap': True}. Best is trial 0 with value: 0.919553560294301.
[I 2025-03-05 10:01:49,506] Trial 1 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 58, 'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 21, 'min_samples_leaf': 8, 'max_features': 0.5971168924255719, 'bootstrap': False}. Best is trial 1 with value: 0.9333333333333333.
[I 2025-03-05 10:01:49,591] Trial 2 finished with value: 0.9256016959720664 and parameters: {'n_estimators': 44, 'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 26, 'min_samples_leaf': 17, 'max_features': 0.3478798453205936, 'bootstrap': True}. Best is trial 1 with value: 0.9333333333333333.
[I 2025-03-05 10:01:49,706] Trial 3 finished with value: 0.9313754832273351 and parame

In [None]:
print("\n=== ExtraTreesClassifier Results ===")
extra_trees_model = ExtraTreesClassifier(**study_extra_trees.best_params, random_state=SEED)
extra_trees_model.fit(X_train, y_train)
y_proba_extra = extra_trees_model.predict_proba(X_test)[:, 1]

best_threshold_extra = 0.5
best_f1_extra = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba_extra >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1_extra:
        best_f1_extra = f1
        best_threshold_extra = t

extra_trees_pred = (y_proba_extra >= best_threshold_extra).astype(int)

print(f"Best weighted F1-score: {best_f1_extra:.4f} at threshold {best_threshold_extra:.2f}")
print(f"Best parameters: {study_extra_trees.best_params}")
# print("Classification Report:\n", classification_report(y_test, extra_trees_pred))

fpr, tpr, _ = roc_curve(y_train, extra_trees_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, extra_trees_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall = extra_trees_model.predict_proba(test)[:, 1]
submission.to_csv('submission_extra_trees.csv', index=False)


=== ExtraTreesClassifier Results ===
Best weighted F1-score: 0.8911 at threshold 0.60
Best parameters: {'n_estimators': 48, 'criterion': 'entropy', 'max_depth': 6, 'min_samples_split': 16, 'min_samples_leaf': 11, 'max_features': 0.6202975287517906, 'bootstrap': False}
(0.9523099475480428, 0.9370993889512408)


In [None]:
def objective_lgbm(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 40),  
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.05),  
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 120),  
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),  
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-2, 2),  
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-2, 2), 
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 0.9), 
        'bagging_freq': trial.suggest_int('bagging_freq', 5, 10),  
        'min_gain_to_split': trial.suggest_loguniform('min_gain_to_split', 1e-2, 0.5),  
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'max_depth': trial.suggest_int('max_depth', 3, 8)
    }
    
    model = lgb.LGBMClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    
    return roc_auc

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for LightGBM:", study_lgbm.best_params)


[I 2025-03-05 10:02:03,600] A new study created in memory with name: no-name-138f537e-60dc-43d8-885f-4427222fd61b


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-05 10:02:03,829] Trial 0 finished with value: 0.9240803092654946 and parameters: {'n_estimators': 59, 'boosting_type': 'dart', 'num_leaves': 24, 'learning_rate': 0.021244996559397187, 'min_child_samples': 24, 'subsample': 0.873684095574951, 'colsample_bytree': 0.7190506832903549, 'lambda_l1': 0.9226301860410131, 'lambda_l2': 0.1441863833588765, 'feature_fraction': 0.8894384308220955, 'bagging_freq': 5, 'min_gain_to_split': 0.03183489261255121, 'extra_trees': True, 'max_depth': 4}. Best is trial 0 with value: 0.9240803092654946.
[I 2025-03-05 10:02:03,891] Trial 1 finished with value: 0.9224591594961965 and parameters: {'n_estimators': 51, 'boosting_type': 'gbdt', 'num_leaves': 17, 'learning_rate': 0.006570228826817527, 'min_child_samples': 44, 'subsample': 0.843192803295734, 'colsample_bytree': 0.7880795494640844, 'lambda_l1': 0.3887650035558468, 'lambda_l2': 0.10415533879635444, 'feature_fraction': 0.6191981676407432, 'bagging_freq': 5, 'min_gain_to_split': 0.14724496197260

In [None]:
print("\n=== LightGBM Results ===")

lgbm_model = lgb.LGBMClassifier(**study_lgbm.best_params, random_state=SEED)
lgbm_model.fit(X_train, y_train)

y_proba = lgbm_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

lgbm_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_lgbm.best_params}")
#print("Classification Report:\n", classification_report(y_test, lgbm_pred))

fpr, tpr, _ = roc_curve(y_train, lgbm_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, lgbm_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall=lgbm_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_lgbm.csv',index=False)


=== LightGBM Results ===
Best weighted F1-score: 0.8838 at threshold 0.60
Best parameters: {'n_estimators': 59, 'boosting_type': 'gbdt', 'num_leaves': 20, 'learning_rate': 0.016492297662071314, 'min_child_samples': 23, 'subsample': 0.8037280465914254, 'colsample_bytree': 0.7896410285639833, 'lambda_l1': 0.6886822491284842, 'lambda_l2': 0.03075640902321159, 'feature_fraction': 0.6255395520674955, 'bagging_freq': 10, 'min_gain_to_split': 0.04222690844794642, 'extra_trees': False, 'max_depth': 5}
(0.9590370828466067, 0.931687242798354)


In [None]:
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.05),
        'min_child_weight': trial.suggest_int('min_child_weight', 10, 30),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),
        'gamma': trial.suggest_loguniform('gamma', 0.05, 1.5),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.05, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.05, 1.0),
        'max_leaves': trial.suggest_int('max_leaves', 20, 80),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    
    model = xgb.XGBClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    
    return roc_auc

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for XGBoost:", study_xgb.best_params)

[I 2025-03-05 10:02:12,809] A new study created in memory with name: no-name-39ba382f-9da7-4145-b24d-79565bdd04cf


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-05 10:02:12,893] Trial 0 finished with value: 0.9217421124828532 and parameters: {'n_estimators': 37, 'max_depth': 3, 'learning_rate': 0.020706342082573326, 'min_child_weight': 12, 'subsample': 0.6891324931434786, 'colsample_bytree': 0.8420931490807548, 'gamma': 0.7746381994244146, 'reg_alpha': 0.16090949269765917, 'reg_lambda': 0.8870231673180343, 'max_leaves': 37, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.9217421124828532.
[I 2025-03-05 10:02:12,966] Trial 1 finished with value: 0.9230951490210749 and parameters: {'n_estimators': 49, 'max_depth': 8, 'learning_rate': 0.004488778168154083, 'min_child_weight': 18, 'subsample': 0.8519030145115905, 'colsample_bytree': 0.6266563141925081, 'gamma': 0.3956651032627102, 'reg_alpha': 0.28709220808687447, 'reg_lambda': 0.30389223410157185, 'max_leaves': 45, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.9230951490210749.
[I 2025-03-05 10:02:13,012] Trial 2 finished with value: 0.9249906472128694 and param

In [None]:
print("\n=== XGBoost Results ===")

xgb_model = xgb.XGBClassifier(**study_xgb.best_params, random_state=SEED)
xgb_model.fit(X_train, y_train)

y_proba = xgb_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

xgb_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_xgb.best_params}")
#print("Classification Report:\n", classification_report(y_test, xgb_pred))

fpr, tpr, _ = roc_curve(y_train, xgb_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, xgb_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall=xgb_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_xgb.csv',index=False)


=== XGBoost Results ===
Best weighted F1-score: 0.8803 at threshold 0.45
Best parameters: {'n_estimators': 62, 'max_depth': 6, 'learning_rate': 0.04983345876271907, 'min_child_weight': 12, 'subsample': 0.8865994773068875, 'colsample_bytree': 0.7371378644563915, 'gamma': 0.37212171541308064, 'reg_alpha': 0.11094113891751262, 'reg_lambda': 0.20376979100289452, 'max_leaves': 55, 'grow_policy': 'lossguide'}
(0.9636409445933255, 0.9344806085546825)


In [None]:
def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', TREES_LOWER, TREES_UPPER),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 40),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample'])
    }
    
    model = RandomForestClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)

    return roc_auc

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for Random Forest:", study_rf.best_params)

[I 2025-03-05 10:02:22,810] A new study created in memory with name: no-name-9a91e650-ca76-4fe4-902b-fd73955363f4


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-05 10:02:23,182] Trial 0 finished with value: 0.9201271979049757 and parameters: {'n_estimators': 60, 'max_depth': 16, 'min_samples_split': 23, 'min_samples_leaf': 14, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.9201271979049757.
[I 2025-03-05 10:02:23,499] Trial 1 finished with value: 0.9190173338321487 and parameters: {'n_estimators': 45, 'max_depth': 14, 'min_samples_split': 14, 'min_samples_leaf': 12, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.9201271979049757.
[I 2025-03-05 10:02:23,695] Trial 2 finished with value: 0.9190796857463525 and parameters: {'n_estimators': 33, 'max_depth': 15, 'min_samples_split': 17, 'min_samples_leaf': 14, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.9201271979049757.
[I 2025-03-05 10:02:23,926] Trial 3 finished with value: 0.9176081805711436 and parameters

In [None]:
print("\n=== Random Forest Results ===")

rf_model = RandomForestClassifier(**study_rf.best_params, random_state=SEED)
rf_model.fit(X_train, y_train)

y_proba = rf_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

rf_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_rf.best_params}")
#print("Classification Report:\n", classification_report(y_test, rf_pred))

fpr, tpr, _ = roc_curve(y_train, rf_model.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

submission.rainfall=rf_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_rf.csv',index=False)


=== Random Forest Results ===
Best weighted F1-score: 0.8822 at threshold 0.55
Best parameters: {'n_estimators': 54, 'max_depth': 7, 'min_samples_split': 33, 'min_samples_leaf': 11, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': None}
(0.9681050871527062, 0.9245167726649208)


In [None]:
print("\n=== Voting Classifier Results ===")

voting_soft = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('ef', extra_trees_model)
    ],
    voting='soft'
)

voting_hard = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('ef', extra_trees_model)
    ],
    voting='hard'
)

# Soft Voting
voting_soft.fit(X_train, y_train)
soft_proba = voting_soft.predict_proba(X_test)[:, 1]

best_threshold_soft = 0.5
best_f1_soft = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (soft_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1_soft:
        best_f1_soft = f1
        best_threshold_soft = t

soft_pred = (soft_proba >= best_threshold_soft).astype(int)

print("\nSoft Voting:")
print(f"Best weighted F1-score: {best_f1_soft:.4f} at threshold {best_threshold_soft:.2f}")
#print("Classification Report:\n", classification_report(y_test, soft_pred))

fpr, tpr, _ = roc_curve(y_train, voting_soft.predict_proba(X_train)[:, 1])
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, voting_soft.predict_proba(X_test)[:, 1])
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))

# Hard Voting
voting_hard.fit(X_train, y_train)
hard_pred = voting_hard.predict(X_test)

print("\nHard Voting:")
print(f"Weighted F1-score: {f1_score(y_test, hard_pred, average='weighted'):.4f}")
#print("Classification Report:\n", classification_report(y_test, hard_pred))


fpr, tpr, _ = roc_curve(y_train, np.mean([clf.predict_proba(X_train)[:, 1] for clf in voting_hard.estimators_], axis=0))
roc_auc_train = auc(fpr, tpr)

fpr, tpr, _ = roc_curve(y_test, np.mean([clf.predict_proba(X_test)[:, 1] for clf in voting_hard.estimators_], axis=0))
roc_auc_test = auc(fpr, tpr)

print((roc_auc_train,roc_auc_test))


=== Voting Classifier Results ===

Soft Voting:
Best weighted F1-score: 0.8847 at threshold 0.55
(0.9636432350718065, 0.9347300162114976)

Hard Voting:
Weighted F1-score: 0.8839
(0.9636432350718065, 0.9347300162114976)


In [None]:
soft_probabilities = voting_soft.predict_proba(test)[:, 1]  

hard_probabilities = np.mean([clf.predict_proba(test)[:, 1] for clf in voting_hard.estimators_], axis=0)

print("Soft Probabilities Shape:", soft_probabilities.shape)
print("Hard Probabilities Shape:", hard_probabilities.shape)


Soft Probabilities Shape: (730,)
Hard Probabilities Shape: (730,)


In [None]:
submission.rainfall=soft_probabilities
submission.to_csv('submission_soft.csv',index=False)

In [None]:
submission.rainfall=hard_probabilities
submission.to_csv('submission_hard.csv',index=False)

In [None]:
submission.rainfall=(lgbm_model.predict_proba(test)[:, 1]+xgb_model.predict_proba(test)[:, 1]+rf_model.predict_proba(test)[:, 1]+soft_probabilities+hard_probabilities+extra_trees_model.predict_proba(test)[:, 1])/6
submission.to_csv('submission.csv',index=False)