In [None]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
import time

# Load Datasets
# Read the competition's train and test CSV files and report how long it took.
start_time = time.time()
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
    )
)
print(f"Data loading time: {time.time() - start_time} seconds")

# Prepare Data
# Build the feature matrix / target, hold out 20% for validation, then fit the
# preprocessing (mean imputation + standardization) on the training split only.
start_time = time.time()
X = train_df.drop(columns=['id', 'FloodProbability'])
y = train_df['FloodProbability']

# Split the data into training and validation sets BEFORE fitting any
# preprocessing, so the held-out rows never contribute to the imputation
# means or the scaling statistics (avoids train/validation leakage).
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute missing values (column means learned from the training split)
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_valid_imputed = imputer.transform(X_valid_raw)

# Standardize the features (statistics learned from the training split)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_valid = scaler.transform(X_valid_imputed)
print(f"Data preparation time: {time.time() - start_time} seconds")

# Define and Train Models
# Fit each candidate regressor, print its validation R2 and remember the
# highest-scoring one in `best_model` / `best_score`.
start_time = time.time()

# Candidate regressors keyed by display name; insertion order is training order
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(random_state=42)

# Start below any attainable score so the first candidate always wins
best_model, best_score = None, -np.inf

for name, model in models.items():
    # Fit on the training split, then score on the held-out split
    y_pred = model.fit(X_train, y_train).predict(X_valid)
    score = r2_score(y_valid, y_pred)
    print(f'{name} R2 Score: {score}')

    # Strict comparison: on a tie the earlier candidate is kept
    if score > best_score:
        best_model, best_score = model, score

print(f"Best Model: {best_model} with R2 Score: {best_score}")
print(f"Model training and evaluation time: {time.time() - start_time} seconds")

# Make Predictions and Prepare Submission
# Push the test features through the already-fitted imputer and scaler,
# predict with the selected model and write the result as a CSV.
start_time = time.time()

X_test = test_df.drop(columns=['id'])
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)
test_predictions = best_model.predict(X_test_scaled)

# Two-column frame: the test ids followed by the predicted target
submission_df = test_df[['id']].assign(FloodProbability=test_predictions)

# Save submission file to the current directory
submission_file_path = 'best_submission.csv'
submission_df.to_csv(submission_file_path, index=False)
print(f"Prediction and submission time: {time.time() - start_time} seconds")

# Download the Submission File (Kaggle specific code)
from IPython.display import FileLink, display

# A bare expression is only rendered by the notebook when it is the last
# statement of a cell; more code follows here, so display the link explicitly.
display(FileLink(submission_file_path))

# Generate Visualizations
# Feature Importance (if applicable)
# Only drawn when the selected model exposes `feature_importances_`.
if hasattr(best_model, 'feature_importances_'):
    features = X.columns
    feature_importances = best_model.feature_importances_
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=feature_importances, y=features, ax=ax)
    ax.set_title('Feature Importance')
    plt.show()

# Predicted vs Actual
# After the training loop `y_pred` still holds the predictions of whichever
# model was fitted last, which is not necessarily the selected one. Recompute
# it from `best_model` so both diagnostics below describe the chosen model.
y_pred = best_model.predict(X_valid)

plt.figure(figsize=(10, 6))
plt.scatter(y_valid, y_pred, alpha=0.5)
plt.xlabel('Actual Flood Probability')
plt.ylabel('Predicted Flood Probability')
plt.title('Predicted vs Actual Flood Probability')
# Dashed diagonal marks where predicted == actual
plt.plot([0, 1], [0, 1], 'r--')
plt.show()

# Residuals Plot
# Residual = actual minus predicted on the validation split
residuals = y_valid - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

# Distribution of Predictions
# Histogram (with KDE overlay) of the values predicted for the test set.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(test_predictions, kde=True, ax=ax)
ax.set_title('Distribution of Test Predictions')
ax.set_xlabel('Predicted Flood Probability')
ax.set_ylabel('Frequency')
plt.show()

# Correlation Heatmap
# Pairwise correlations of every training column except the row id,
# annotated to two decimals.
corr_matrix = train_df.drop(columns=['id']).corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()
