In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

In [51]:
# Load the labelled training set, the unlabelled test set and the sample
# submission template, in that order, from the local data/ directory.
train_df, test_df, submission_df = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/sample_submission.csv"),
)

In [52]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [53]:
# Preview the first five rows of the test data.
test_df.head(n=5)

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [54]:
# Count the missing values in each training column.
train_df.isna().sum()

id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64

In [55]:
# Count the missing values in each test column.
test_df.isna().sum()

id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64

In [56]:
# Target: the 'rainfall' label column.
y = train_df['rainfall']
# Features: every remaining column except the row identifier and the label.
X = train_df.drop(['id', 'rainfall'], axis="columns")

In [57]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [58]:
# Identify the features present in both the training and the test data.
# Walk the training columns in their original order rather than intersecting
# two sets: iteration order of a set of strings depends on per-process hash
# randomization, so list(set(...) & set(...)) could yield a different column
# order after every kernel restart and make downstream matrices irreproducible.
common_features = [col for col in X_train.columns if col in test_df.columns]

In [59]:
# Restrict the training, validation and test frames to the shared feature
# columns, all in the same column order.
X_train_filtered, X_val_filtered, X_test_filtered = (
    frame[common_features] for frame in (X_train, X_val, test_df)
)

In [60]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused for validation and test, so nothing from
# the held-out data leaks into the scaling.
# NOTE: per the scikit-learn docs, StandardScaler disregards NaNs when fitting
# and keeps them when transforming, so the missing test value is still NaN
# after this cell.
scaler = StandardScaler()
scaler.fit(X_train_filtered)
X_train_scaled_filtered = scaler.transform(X_train_filtered)
X_val_scaled_filtered = scaler.transform(X_val_filtered)
X_test_scaled_filtered = scaler.transform(X_test_filtered)

In [61]:
# Fill missing entries with per-column medians. The medians are computed on the
# (already scaled) training matrix and applied unchanged to validation and test.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train_scaled_filtered)
X_train_imputed_filtered = imputer.transform(X_train_scaled_filtered)
X_val_imputed_filtered = imputer.transform(X_val_scaled_filtered)
X_test_imputed_filtered = imputer.transform(X_test_scaled_filtered)

In [62]:
# Tune an RBF-kernel SVM: exhaustive search over C and gamma, scored by ROC AUC
# with 3-fold cross-validation on the training split, using all CPU cores.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf'],
}
# probability=True is needed so the selected model exposes predict_proba later.
svm_estimator = SVC(probability=True, random_state=42)
grid_search = GridSearchCV(
    estimator=svm_estimator,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train_imputed_filtered, y_train)

In [63]:
# Keep the estimator found by the grid search: best_estimator_ is the model
# GridSearchCV refits with the best-scoring parameter combination (refit is
# left at its default here).
best_svm_model = grid_search.best_estimator_

In [64]:
# Score the tuned model on the held-out validation split.
val_proba = best_svm_model.predict_proba(X_val_imputed_filtered)
# Column 1 holds the probability of the second class (rainfall == 1 for the
# 0/1 label shown in the training preview).
val_preds_filtered = val_proba[:, 1]
auc_score_filtered = roc_auc_score(y_val, val_preds_filtered)
print(f"Validation AUC-ROC Score: {auc_score_filtered:.4f}")

Validation AUC-ROC Score: 0.8749


In [65]:
# Predict class probabilities for the test set and keep the second column
# (the probability assigned to the second class) as the prediction.
test_proba = best_svm_model.predict_proba(X_test_imputed_filtered)
test_preds_filtered = test_proba[:, 1]

In [66]:
# Write the predictions into the submission template and save it as CSV
# (without the DataFrame index).
# NOTE(review): values are assigned by position, so this assumes the template
# rows are in the same order as test_df - confirm.
updated_submission_path = "data/updated_final_submission.csv"
submission_df["rainfall"] = test_preds_filtered
submission_df.to_csv(updated_submission_path, index=False)

print(f"Submission file saved to: {updated_submission_path}")

Submission file saved to: data/updated_final_submission.csv
