In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
SEED=95
TRIALS=200

In [2]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import auc,roc_auc_score,classification_report,roc_curve,auc,f1_score
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import optuna
import warnings
from collections import Counter
warnings.simplefilter('ignore')

In [3]:
class Config:
    train_link = "train.csv"
    test_link = "test.csv"
    sub_link  = "sample_submission.csv"
    original = "Rainfall.csv"

In [4]:
train = pd.read_csv(Config.train_link, index_col = 'id')
test = pd.read_csv(Config.test_link, index_col = 'id')
original = pd.read_csv(Config.original)

original.columns = [col.strip() for col in original.columns]

rain_map = {'yes':1,
           'no':0}

original['rainfall'] = original['rainfall'].map(rain_map)

original.dropna(inplace = True)

train = pd.concat([train, original], axis = 0, ignore_index = True)

train = train.fillna(0)

test.winddirection=test.winddirection.fillna(test.winddirection.median())

In [5]:
train.describe()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
count,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0,2555.0
mean,156.495108,1013.625362,26.338708,23.921918,22.128885,20.385832,81.768689,75.062622,3.840039,104.383679,21.766458,0.742857
std,113.131808,5.768099,5.701105,5.282829,5.139142,5.396821,8.187895,18.681667,3.67917,80.242979,9.921727,0.437144
min,1.0,998.5,7.1,4.9,3.1,-0.4,36.0,0.0,0.0,10.0,4.4,0.0
25%,44.5,1008.6,21.3,19.3,17.6,16.8,77.0,68.0,0.4,40.0,14.1,0.0
50%,148.0,1013.0,27.8,25.5,23.8,22.1,81.0,83.0,2.4,70.0,20.5,1.0
75%,255.0,1017.8,31.2,28.4,26.5,25.0,87.0,88.0,7.0,200.0,27.9,1.0
max,365.0,1034.6,36.3,32.4,30.0,26.7,98.0,100.0,12.1,350.0,59.5,1.0


In [6]:
train['month']=((train["day"] - 1) // 30) + 1
train["season"] = train["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    })

In [7]:
rainmap=train.groupby('day')['rainfall'].mean().to_dict()
season_weights = train.groupby('season')['rainfall'].mean().to_dict()

In [8]:
num_feats=[]

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def transform_features(df, is_train=True, train_mapping=None):
    df["wind_x"] = df["windspeed"] * np.cos(np.radians(df["winddirection"]))
    df["wind_y"] = df["windspeed"] * np.sin(np.radians(df["winddirection"]))
    df["stability_index"] = df["maxtemp"] - df["mintemp"]
    df["pressure_drop"] = df["pressure"].diff().fillna(0)
    df["month"] = ((df["day"] - 1) // 30) + 1
    df["season"] = df["month"].map({
        1: "Winter", 2: "Winter", 3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer", 9: "Fall", 10: "Fall",
        11: "Fall", 12: "Winter"
    }).map(season_weights)

    #df["dew_point_depression"] = df["temparature"] - df["dewpoint"]
    #df["dew_pressure"] = df["pressure"] / (df["dewpoint"] + 1)
    #df["cloud_sun_ratio"] = df["cloud"] / (df["sunshine"] + 1)
    #df['rain_day']=df['day'].map(rainmap)
    
    df.drop(columns=["month", "mintemp", "maxtemp", 'day', 'winddirection','windspeed'], inplace=True)
    df.fillna(method="ffill", inplace=True)
    df.fillna(df.median(), inplace=True)

    if is_train:
        X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['rainfall']), 
                                                            df['rainfall'], 
                                                            test_size=0.2, 
                                                            random_state=SEED,
                                                            stratify=df['rainfall'])

        num_feats=list(X_train.columns)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        return X_train, X_test, y_train, y_test,num_feats

    else:
        df.fillna(method="ffill", inplace=True)
        df.fillna(df.median(), inplace=True)
        return df


In [10]:
X_train,X_test,y_train,y_test,num_feats=transform_features(train)
test=transform_features(test,False)

In [11]:
num_feats

['pressure',
 'temparature',
 'dewpoint',
 'humidity',
 'cloud',
 'sunshine',
 'season',
 'wind_x',
 'wind_y',
 'stability_index',
 'pressure_drop']

In [12]:
test.head()

Unnamed: 0_level_0,pressure,temparature,dewpoint,humidity,cloud,sunshine,wind_x,wind_y,stability_index,pressure_drop,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2190,1019.5,15.8,14.9,96.0,99.0,0.0,15.619739,18.61488,4.8,0.0,0.709571
2191,1016.5,16.5,15.1,97.0,99.0,0.0,22.690403,27.041369,1.7,-3.0,0.709571
2192,1023.9,10.4,8.9,86.0,96.0,0.0,12.946151,10.863111,1.8,7.4,0.709571
2193,1022.9,17.3,9.5,75.0,45.0,7.1,47.548447,17.306219,5.4,-1.0,0.709571
2194,1022.2,13.8,4.3,68.0,49.0,9.2,18.230037,6.635191,9.7,-0.7,0.709571


In [13]:
test.isna().sum().sum()

0

In [14]:
test_mean = test[num_feats].mean(axis =0)
test_std = test[num_feats].std(axis =0)

test[num_feats] -= test_mean
test[num_feats] /= test_std

In [15]:
X_train

array([[ 4.77728838e-01,  2.78830383e-01,  4.62590393e-01, ...,
        -2.10691635e-01, -6.66295866e-01, -3.59233062e-01],
       [ 5.29673749e-01, -1.57228762e-01,  1.28400444e-01, ...,
         1.43096322e-01,  1.17649306e-01, -1.10516662e-03],
       [ 1.55125699e+00, -1.38956982e+00, -2.24806142e+00, ...,
         1.27538892e+00,  4.44293128e-01,  1.32121322e+00],
       ...,
       [ 1.66059374e-01, -1.59811985e+00, -9.11301621e-01, ...,
         3.19990301e-01, -1.31958351e+00,  7.15150624e-01],
       [-2.34461131e+00,  1.16990777e+00,  8.15346451e-01, ...,
         1.17419128e+00,  5.74950656e-01, -2.49039863e-01],
       [-7.68949017e-01, -1.19310576e-01,  2.76929310e-01, ...,
        -7.67621613e-01,  5.09621892e-01, -6.62264358e-01]])

In [16]:
submission=pd.read_csv('sample_submission.csv')

In [17]:
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

def create_model(trial):
    model = Sequential()
    model.add(Dense(units=trial.suggest_int('units_1', 64, 256, log=True),
                    activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=trial.suggest_uniform('dropout_1', 0.2, 0.5)))
    model.add(Dense(units=trial.suggest_int('units_2', 32, 128, log=True),
                    activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(rate=trial.suggest_uniform('dropout_2', 0.2, 0.5)))
    model.add(Dense(units=16, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(units=1, activation='sigmoid'))
    
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

In [None]:
def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)
    score = model.evaluate(X_test, y_test, verbose=1)
    return score[1]
    
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print('Best trial:')
best_trial = study.best_trial
print(f'  Value: {best_trial.value}')
print('  Params: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')

[I 2025-03-03 09:59:33,724] A new study created in memory with name: no-name-7e9326c9-d398-4efa-a6a8-1874f03e0d08


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
df_study = pd.DataFrame([
    {**t.params, "AUC": t.value, "Trial Number": t.number} 
    for t in study.trials
])

df_study = df_study.sort_values(by="AUC", ascending=False).reset_index(drop=True)

print(df_study.head(5))

top_model_params = [
    trial.params for trial in sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]
]

print(top_model_params)

In [None]:
#top_model_params=[{'num_units_1': 41, 'num_units_2': 41}, {'num_units_1': 16, 'num_units_2': 174}, {'num_units_1': 52, 'num_units_2': 35}, {'num_units_1': 23, 'num_units_2': 150}, {'num_units_1': 23, 'num_units_2': 129}]

In [None]:
model_predictions = []

for params in top_model_params:
    model = Sequential()
    model.add(Dense(units=params['units_1'], activation='relu', kernel_initializer='he_normal', 
                    input_shape=(X_train_scaled.shape[1],)))
    model.add(Dropout(rate=params['dropout_1']))
    model.add(Dense(units=params['units_2'], activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(rate=params['dropout_2']))
    model.add(Dense(units=16, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(units=1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)

    preds = model.predict(test).flatten()  
    model_predictions.append(preds)

model_predictions = np.array(model_predictions)

final_probs = np.mean(model_predictions, axis=0)  

sub = pd.read_csv(Config.sub_link)
sub['rainfall'] = final_probs  
sub.to_csv('submissionDL.csv', index=False)

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import optuna

In [None]:
def objective_lgbm(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', 128,512),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 20, 64),  
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),  
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),  
        'subsample': trial.suggest_uniform('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.9),  
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 1),  
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 1), 
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 0.9), 
        'bagging_freq': trial.suggest_int('bagging_freq', 5, 15),  
        'min_gain_to_split': trial.suggest_loguniform('min_gain_to_split', 1e-2, 1.0),  
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }
    
    model = lgb.LGBMClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    
    return roc_auc

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for LightGBM:", study_lgbm.best_params)


In [None]:
print("\n=== LightGBM Results ===")

lgbm_model = lgb.LGBMClassifier(**study_lgbm.best_params, random_state=SEED)
lgbm_model.fit(X_train, y_train)

y_proba = lgbm_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

lgbm_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_lgbm.best_params}")
#print("Classification Report:\n", classification_report(y_test, lgbm_pred))

submission.rainfall=lgbm_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_lgbm.csv',index=False)

In [None]:
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 128,512),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'subsample': trial.suggest_uniform('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.9),
        'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 0.5),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 0.5),
        'max_leaves': trial.suggest_int('max_leaves', 30, 100),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    
    model = xgb.XGBClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    
    return roc_auc

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for XGBoost:", study_xgb.best_params)


In [None]:
print("\n=== XGBoost Results ===")

xgb_model = xgb.XGBClassifier(**study_xgb.best_params, random_state=SEED)
xgb_model.fit(X_train, y_train)

y_proba = xgb_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

xgb_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_xgb.best_params}")
#print("Classification Report:\n", classification_report(y_test, xgb_pred))

submission.rainfall=xgb_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_xgb.csv',index=False)

In [None]:
def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 128,1024),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 5, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 15),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample'])
    }
    
    model = RandomForestClassifier(**params, random_state=SEED)

    model.fit(X_train, y_train)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)

    return roc_auc

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=TRIALS, show_progress_bar=True)

print("Best parameters for Random Forest:", study_rf.best_params)


In [None]:
print("\n=== Random Forest Results ===")

rf_model = RandomForestClassifier(**study_rf.best_params, random_state=SEED)
rf_model.fit(X_train, y_train)

y_proba = rf_model.predict_proba(X_test)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

rf_pred = (y_proba >= best_threshold).astype(int)

print(f"Best weighted F1-score: {best_f1:.4f} at threshold {best_threshold:.2f}")
print(f"Best parameters: {study_rf.best_params}")
#print("Classification Report:\n", classification_report(y_test, rf_pred))

submission.rainfall=rf_model.predict_proba(test)[:, 1]  
submission.to_csv('submission_rf.csv',index=False)

In [None]:
print("\n=== Voting Classifier Results ===")

voting_soft = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    voting='soft'
)

voting_hard = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    voting='hard'
)

# Soft Voting
voting_soft.fit(X_train, y_train)
soft_proba = voting_soft.predict_proba(X_test)[:, 1]

best_threshold_soft = 0.5
best_f1_soft = 0

for t in np.arange(0.1, 0.9, 0.05):
    y_pred_temp = (soft_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_temp, average='weighted')
    if f1 > best_f1_soft:
        best_f1_soft = f1
        best_threshold_soft = t

soft_pred = (soft_proba >= best_threshold_soft).astype(int)

print("\nSoft Voting:")
print(f"Best weighted F1-score: {best_f1_soft:.4f} at threshold {best_threshold_soft:.2f}")
#print("Classification Report:\n", classification_report(y_test, soft_pred))

# Hard Voting
voting_hard.fit(X_train, y_train)
hard_pred = voting_hard.predict(X_test)

print("\nHard Voting:")
print(f"Weighted F1-score: {f1_score(y_test, hard_pred, average='weighted'):.4f}")
#print("Classification Report:\n", classification_report(y_test, hard_pred))

In [None]:
soft_probabilities = voting_soft.predict_proba(test)[:, 1]  

hard_probabilities = np.mean([clf.predict_proba(test)[:, 1] for clf in voting_hard.estimators_], axis=0)

print("Soft Probabilities Shape:", soft_probabilities.shape)
print("Hard Probabilities Shape:", hard_probabilities.shape)


In [None]:
submission.rainfall=soft_probabilities
submission.to_csv('submission_soft.csv',index=False)

In [None]:
submission.rainfall=hard_probabilities
submission.to_csv('submission_hard.csv',index=False)

In [None]:
submission.rainfall=(lgbm_model.predict_proba(test)[:, 1]+xgb_model.predict_proba(test)[:, 1]+rf_model.predict_proba(test)[:, 1]+soft_probabilities+hard_probabilities)/5
submission.to_csv('submission.csv',index=False)