In [2]:
# Weather Forecasting Challenge - Data Preprocessing and EDA
# Intellihack 5 Hackathon

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

print("Weather Forecasting Challenge - Data Preprocessing and EDA")
print("=" * 80)

# 1. Data Loading
print("\n1. Loading and Examining the Dataset")
print("-" * 50)

# Load the dataset
df = pd.read_csv('weather_data.csv')

# Display basic information
print("\nData Overview:")
print(f"Dataset Shape: {df.shape}")
display(df.head())

print("\nData Information:")
df.info()

print("\nSummary Statistics:")
display(df.describe())

print("\nChecking for missing values:")
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage
})
display(missing_df)

# 2. Data Preprocessing
print("\n2. Data Preprocessing")
print("-" * 50)

# Make a copy of the dataframe to preserve the original
df_processed = df.copy()

# 2.1 Convert the date column to datetime type
print("\nConverting date column to datetime...")
df_processed['date'] = pd.to_datetime(df_processed['date'])

# 2.2 Handle the target variable - convert 'Rain'/'No Rain' text to binary
print("\nConverting target variable to binary...")
# Check unique values in the target column
print(f"Unique values in rain_or_not column: {df_processed['rain_or_not'].unique()}")

# Convert to binary (1 = Rain, 0 = No Rain)
df_processed['rain_or_not_binary'] = df_processed['rain_or_not'].apply(lambda x: 1 if str(x).lower() == 'rain' else 0)

# Display the conversion
rain_counts = df_processed['rain_or_not_binary'].value_counts()
print(f"\nRain distribution after conversion: {rain_counts.to_dict()}")
print(f"Percentage of rainy days: {rain_counts[1] / len(df_processed) * 100:.2f}%")

# 2.3 Handle missing values
print("\nHandling missing values...")

# Check for columns with missing values
columns_with_missing = df_processed.columns[df_processed.isnull().any()].tolist()
print(f"Columns with missing values: {columns_with_missing}")

# For numeric columns, impute missing values with median
for col in columns_with_missing:
    if df_processed[col].dtype in ['int64', 'float64']:
        median_value = df_processed[col].median()
        df_processed[col].fillna(median_value, inplace=True)
        print(f"Filled missing values in '{col}' with median: {median_value}")

# Check if we have any remaining missing values
remaining_missing = df_processed.isnull().sum().sum()
print(f"\nRemaining missing values after imputation: {remaining_missing}")

# 2.4 Feature Engineering from date
print("\nExtracting features from date...")
df_processed['month'] = df_processed['date'].dt.month
df_processed['day_of_week'] = df_processed['date'].dt.dayofweek
df_processed['day_of_year'] = df_processed['date'].dt.dayofyear
df_processed['season'] = df_processed['month'].apply(lambda x: 
                                               'Winter' if x in [12, 1, 2] else
                                               'Spring' if x in [3, 4, 5] else
                                               'Summer' if x in [6, 7, 8] else 'Fall')

print("\nData after feature engineering:")
display(df_processed[['date', 'month', 'day_of_week', 'day_of_year', 'season']].head())

# 2.5 Check for outliers
print("\nChecking for outliers in numeric features...")
numeric_features = df_processed.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_features = [col for col in numeric_features if col != 'rain_or_not_binary']

# Create box plots for numeric features
plt.figure(figsize=(14, 10))
for i, feature in enumerate(numeric_features[:6], 1):  # Limit to 6 features for visibility
    plt.subplot(2, 3, i)
    sns.boxplot(y=df_processed[feature])
    plt.title(f'Box Plot of {feature}')
    plt.tight_layout()
plt.savefig('outliers_boxplot.png')
plt.close()

# 2.6 Normalize numerical features
print("\nNormalizing numerical features...")
from sklearn.preprocessing import StandardScaler

# Create a copy of the features to be normalized
features_to_normalize = ['avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover', 'pressure']
features_to_normalize = [f for f in features_to_normalize if f in df_processed.columns]

# Create a scaler object
scaler = StandardScaler()

# Fit and transform the selected features
df_processed[features_to_normalize] = scaler.fit_transform(df_processed[features_to_normalize])

print("\nData after normalization:")
display(df_processed[features_to_normalize].describe())

# 3. Exploratory Data Analysis (EDA)
print("\n3. Exploratory Data Analysis")
print("-" * 50)

# 3.1 Distribution of the target variable
print("\nAnalyzing the distribution of the target variable...")
plt.figure(figsize=(10, 6))
sns.countplot(x='rain_or_not_binary', data=df_processed)
plt.title('Distribution of Rain vs No Rain Days')
plt.xlabel('Rain (1) vs No Rain (0)')
plt.ylabel('Count')
plt.xticks([0, 1], ['No Rain', 'Rain'])
plt.savefig('rain_distribution.png')
plt.close()

# 3.2 Distribution of numeric features
print("\nAnalyzing the distribution of numeric features...")
plt.figure(figsize=(16, 12))
for i, feature in enumerate(features_to_normalize, 1):
    plt.subplot(3, 2, i)
    sns.histplot(df_processed[feature], kde=True)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.savefig('feature_distributions.png')
plt.close()

# 3.3 Correlation matrix
print("\nAnalyzing feature correlations...")
correlation_matrix = df_processed[features_to_normalize + ['rain_or_not_binary']].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

# Display correlation with target
print("\nCorrelation of features with rain_or_not_binary:")
target_correlation = correlation_matrix['rain_or_not_binary'].sort_values(ascending=False)
display(target_correlation)

# 3.4 Feature relationships with target
print("\nAnalyzing relationships between features and rain occurrence...")
plt.figure(figsize=(16, 12))
for i, feature in enumerate(features_to_normalize, 1):
    plt.subplot(3, 2, i)
    sns.boxplot(x='rain_or_not_binary', y=feature, data=df_processed)
    plt.title(f'{feature} vs Rain')
    plt.xlabel('Rain (1) vs No Rain (0)')
    plt.xticks([0, 1], ['No Rain', 'Rain'])
plt.tight_layout()
plt.savefig('feature_vs_rain.png')
plt.close()

# 3.5 Seasonal patterns in rainfall
print("\nAnalyzing seasonal patterns in rainfall...")
# Group by month and calculate percentage of rainy days
monthly_rain = df_processed.groupby('month')['rain_or_not_binary'].mean() * 100

plt.figure(figsize=(12, 6))
monthly_rain.plot(kind='bar', color='skyblue')
plt.title('Percentage of Rainy Days by Month')
plt.xlabel('Month')
plt.ylabel('Percentage of Rainy Days')
plt.xticks(range(12), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.savefig('monthly_rain_pattern.png')
plt.close()

# 3.6 Feature relationships
print("\nAnalyzing relationships between features...")
plt.figure(figsize=(20, 16))
sns.pairplot(df_processed[features_to_normalize + ['rain_or_not_binary']], 
             hue='rain_or_not_binary', palette={0: 'skyblue', 1: 'salmon'})
plt.savefig('feature_relationships.png')
plt.close()

# 3.7 Time series analysis
print("\nTime series analysis of weather variables...")
# Select a subset of features for time series visualization
time_series_features = ['avg_temperature', 'humidity', 'avg_wind_speed', 'rain_or_not_binary']
time_series_features = [f for f in time_series_features if f in df_processed.columns]

# Plot time series for these features
fig, axes = plt.subplots(len(time_series_features), 1, figsize=(14, 12), sharex=True)
for i, feature in enumerate(time_series_features):
    axes[i].plot(df_processed['date'], df_processed[feature], marker='o', linestyle='-', markersize=2)
    axes[i].set_title(f'{feature} Over Time')
    axes[i].set_ylabel(feature)
    if feature == 'rain_or_not_binary':
        axes[i].set_yticks([0, 1])
        axes[i].set_yticklabels(['No Rain', 'Rain'])
plt.tight_layout()
plt.savefig('time_series_analysis.png')
plt.close()

# 3.8 Rain probability by features
print("\nAnalyzing rain probability by feature ranges...")
# For each numeric feature, divide into bins and calculate rain probability
plt.figure(figsize=(16, 12))
for i, feature in enumerate(features_to_normalize[:5], 1):  # Limit to 5 features
    plt.subplot(3, 2, i)
    # Create bins for the feature
    df_processed[f'{feature}_bins'] = pd.qcut(df_processed[feature], 10, duplicates='drop')
    # Calculate rain probability for each bin
    rain_by_feature = df_processed.groupby(f'{feature}_bins')['rain_or_not_binary'].mean() * 100
    rain_by_feature.plot(kind='bar')
    plt.title(f'Rain Probability by {feature}')
    plt.xlabel(feature)
    plt.ylabel('Rain Probability (%)')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('rain_probability_by_features.png')
plt.close()

# 4. Summary of findings
print("\n4. Summary of EDA Findings")
print("-" * 50)

print("\nKey findings from exploratory data analysis:")
print("- Target Distribution: {}% of days have rain".format(round(df_processed['rain_or_not_binary'].mean() * 100, 2)))
print("- Most correlated features with rain:")
for feature, corr in target_correlation.items():
    if feature != 'rain_or_not_binary' and abs(corr) > 0.1:
        print(f"  * {feature}: {corr:.4f}")

print("\nPreprocessing and EDA completed successfully!")
print("All visualizations have been saved as PNG files.")

# Save the processed dataframe for further analysis
df_processed.to_csv('weather_data_processed.csv', index=False)
print("\nProcessed data saved to 'weather_data_processed.csv'")

# 5. Prepare data for modeling
print("\n5. Preparing Data for Modeling")
print("-" * 50)

# Define features and target
X = df_processed.drop(['date', 'rain_or_not', 'rain_or_not_binary'], axis=1)
# Remove any bin columns created during EDA
X = X.loc[:, ~X.columns.str.contains('_bins')]
y = df_processed['rain_or_not_binary']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print("\nFeatures to be used in modeling:")
display(X.columns.tolist())

# Ready for model training
print("\nData is now ready for model training!")

Weather Forecasting Challenge - Data Preprocessing and EDA

1. Loading and Examining the Dataset
--------------------------------------------------

Data Overview:
Dataset Shape: (311, 7)


Unnamed: 0,date,avg_temperature,humidity,avg_wind_speed,rain_or_not,cloud_cover,pressure
0,2023-01-01,23.745401,46.140905,7.845981,Rain,20.851051,992.965681
1,2023-01-02,30.030503,59.876587,5.382457,Rain,93.059521,1037.273025
2,2023-01-03,28.365224,51.464618,13.158008,Rain,11.63664,1034.193357
3,2023-01-04,27.550929,53.103799,5.886677,Rain,81.744971,968.610142
4,2023-01-05,23.639303,57.826186,12.248992,Rain,38.062329,1030.264331



Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             311 non-null    object 
 1   avg_temperature  296 non-null    float64
 2   humidity         296 non-null    float64
 3   avg_wind_speed   296 non-null    float64
 4   rain_or_not      311 non-null    object 
 5   cloud_cover      296 non-null    float64
 6   pressure         311 non-null    float64
dtypes: float64(5), object(2)
memory usage: 17.1+ KB

Summary Statistics:


Unnamed: 0,avg_temperature,humidity,avg_wind_speed,cloud_cover,pressure
count,296.0,296.0,296.0,296.0,311.0
mean,25.98384,55.041385,7.556636,49.834827,1001.059119
std,6.802475,19.220133,5.344683,29.009459,28.835595
min,15.0,30.0,0.06948,0.321826,951.240404
25%,20.265692,34.280826,3.550354,24.530951,975.757545
50%,27.177958,56.759806,7.326421,50.72512,1001.938586
75%,32.204599,72.189837,11.050627,76.046506,1026.578884
max,35.0,90.0,56.636041,99.834751,1049.543752



Checking for missing values:


Unnamed: 0,Missing Values,Percentage
date,0,0.0
avg_temperature,15,4.823151
humidity,15,4.823151
avg_wind_speed,15,4.823151
rain_or_not,0,0.0
cloud_cover,15,4.823151
pressure,0,0.0



2. Data Preprocessing
--------------------------------------------------

Converting date column to datetime...

Converting target variable to binary...
Unique values in rain_or_not column: ['Rain' 'No Rain']

Rain distribution after conversion: {1: 198, 0: 113}
Percentage of rainy days: 63.67%

Handling missing values...
Columns with missing values: ['avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover']
Filled missing values in 'avg_temperature' with median: 27.177958126582883
Filled missing values in 'humidity' with median: 56.75980567828731
Filled missing values in 'avg_wind_speed' with median: 7.326421214194906
Filled missing values in 'cloud_cover' with median: 50.7251204878262

Remaining missing values after imputation: 0

Extracting features from date...

Data after feature engineering:


Unnamed: 0,date,month,day_of_week,day_of_year,season
0,2023-01-01,1,6,1,Winter
1,2023-01-02,1,0,2,Winter
2,2023-01-03,1,1,3,Winter
3,2023-01-04,1,2,4,Winter
4,2023-01-05,1,3,5,Winter



Checking for outliers in numeric features...

Normalizing numerical features...

Data after normalization:


Unnamed: 0,avg_temperature,humidity,avg_wind_speed,cloud_cover,pressure
count,311.0,311.0,311.0,311.0,311.0
mean,1.028116e-16,4.569407e-16,7.425286e-17,-2.8558790000000004e-17,8.674733e-16
std,1.001612,1.001612,1.001612,1.001612,1.001612
min,-1.665345,-1.341906,-1.436151,-1.753943,-1.730466
25%,-0.836795,-1.038202,-0.736483,-0.8466853,-0.8788565
50%,0.1714184,0.08735538,-0.04209132,0.02999052,0.03054851
75%,0.877398,0.9001727,0.6438495,0.8595001,0.8864354
max,1.351192,1.86274,9.430295,1.768137,1.684126



3. Exploratory Data Analysis
--------------------------------------------------

Analyzing the distribution of the target variable...

Analyzing the distribution of numeric features...

Analyzing feature correlations...

Correlation of features with rain_or_not_binary:


rain_or_not_binary    1.000000
humidity              0.321515
avg_temperature       0.294066
avg_wind_speed        0.125911
pressure              0.090059
cloud_cover          -0.034236
Name: rain_or_not_binary, dtype: float64


Analyzing relationships between features and rain occurrence...

Analyzing seasonal patterns in rainfall...

Analyzing relationships between features...

Time series analysis of weather variables...

Analyzing rain probability by feature ranges...

4. Summary of EDA Findings
--------------------------------------------------

Key findings from exploratory data analysis:
- Target Distribution: 63.67% of days have rain
- Most correlated features with rain:
  * humidity: 0.3215
  * avg_temperature: 0.2941
  * avg_wind_speed: 0.1259

Preprocessing and EDA completed successfully!
All visualizations have been saved as PNG files.

Processed data saved to 'weather_data_processed.csv'

5. Preparing Data for Modeling
--------------------------------------------------
Feature matrix shape: (311, 9)
Target vector shape: (311,)

Features to be used in modeling:


['avg_temperature',
 'humidity',
 'avg_wind_speed',
 'cloud_cover',
 'pressure',
 'month',
 'day_of_week',
 'day_of_year',
 'season']


Data is now ready for model training!


<Figure size 2000x1600 with 0 Axes>

In [3]:
# Weather Forecasting Challenge - Model Training and Evaluation
# Intellihack 5 Hackathon

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import pickle
import warnings
import re  # Add this line to import the 're' module
warnings.filterwarnings('ignore')

# Import ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# Set style for plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

print("Weather Forecasting Challenge - Model Training and Evaluation")
print("=" * 80)

# 1. Load the processed data
print("\n1. Loading the processed data")
print("-" * 50)

# Load the processed dataset
df_processed = pd.read_csv('weather_data_processed.csv')
df_processed['date'] = pd.to_datetime(df_processed['date'])

print(f"Loaded dataset shape: {df_processed.shape}")
display(df_processed.head())

# Check if categorical features need encoding
print("\nChecking data types:")
display(df_processed.dtypes)

# If 'season' is a string, one-hot encode it
if df_processed['season'].dtype == 'object':
    print("\nOne-hot encoding the 'season' feature...")
    # Get dummies for season
    season_dummies = pd.get_dummies(df_processed['season'], prefix='season')
    df_processed = pd.concat([df_processed, season_dummies], axis=1)
    df_processed.drop('season', axis=1, inplace=True)

# 2. Prepare features and target
print("\n2. Preparing features and target")
print("-" * 50)

# Define features and target
X = df_processed.drop(['date', 'rain_or_not', 'rain_or_not_binary'], axis=1)
y = df_processed['rain_or_not_binary']

# Store the binning and season information from X
X_columns = X.columns
# Get list of binning features
binning_features = [col for col in X.columns if '_bins' in col]
# Check if season is in dataframe
if 'season_Fall' in X.columns:
  seasonal_features = ['season_Fall', 'season_Spring', 'season_Summer', 'season_Winter']
else:
  seasonal_features = []

# Convert interval columns to numeric in X
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            # Attempt to convert to numeric, coercing errors to NaN
            X[col] = pd.to_numeric(X[col], errors='coerce')
        except ValueError as e:
            print(f"Column '{col}' could not be converted to numeric: {e}")
        except Exception as e:
            print(f"Unexpected error converting column '{col}': {e}")

# Fill NaN values with 0 or a more appropriate value
X = X.fillna(0)  # Using 0 to avoid introducing new NaN values

# Display feature names
print("\nFeatures to be used in modeling:")
display(X.columns.tolist())
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# 3. Split the data
print("\n3. Splitting the data into training and testing sets")
print("-" * 50)

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set shape: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Testing set shape: X_test: {X_test.shape}, y_test: {y_test.shape}")

# 4. Model training and evaluation
print("\n4. Training and evaluating multiple models")
print("-" * 50)

# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

# Initialize results dictionary
results = {}
model_predictions = {}

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining and evaluating {name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Store predictions
    model_predictions[name] = {
        'y_pred': y_pred,
        'y_prob': y_prob
    }
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    
    # Store metrics
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc
    }
    
    # Print metrics
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  ROC AUC: {roc_auc:.4f}")
    
    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['No Rain', 'Rain'],
                yticklabels=['No Rain', 'Rain'])
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'confusion_matrix_{name.replace(" ", "_").lower()}.png')
    plt.close()

# Summarize model comparison
print("\nModel Comparison Summary:")
results_df = pd.DataFrame(results).T
display(results_df)

# Plot model comparison
plt.figure(figsize=(14, 8))
results_df.plot(kind='bar', figsize=(14, 8))
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.grid(True, axis='y')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()

# Find the best model
best_model_name = results_df['roc_auc'].idxmax()
print(f"\nBest model based on ROC AUC: {best_model_name}")
best_model = models[best_model_name]

# 5. Cross-validation of the best model
print("\n5. Cross-validation of the best model")
print("-" * 50)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='roc_auc')
print(f"\nCross-validation ROC AUC scores for {best_model_name}:")
print(cv_scores)
print(f"Mean ROC AUC: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")

# 6. Feature importance analysis
print("\n6. Feature Importance Analysis")
print("-" * 50)

# Get feature importances from the best model (if available)
if hasattr(best_model, 'feature_importances_'):
    # Get feature importances
    importances = best_model.feature_importances_
    
    # Create DataFrame for feature importances
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    
    print("\nFeature Importances:")
    display(feature_importance_df)
    
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
    plt.title(f'Feature Importances from {best_model_name}')
    plt.tight_layout()
    plt.savefig('feature_importances.png')
    plt.close()
elif best_model_name == 'Logistic Regression':
    # For logistic regression, use coefficients
    coefficients = best_model.coef_[0]
    
    # Create DataFrame for coefficients
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': coefficients
    }).sort_values(by='Coefficient', ascending=False)
    
    print("\nFeature Coefficients:")
    display(feature_importance_df)
    
    # Plot coefficients
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Coefficient', y='Feature', data=feature_importance_df)
    plt.title(f'Feature Coefficients from {best_model_name}')
    plt.tight_layout()
    plt.savefig('feature_coefficients.png')
    plt.close()

# 7. Hyperparameter tuning
print("\n7. Hyperparameter Tuning")
print("-" * 50)

# Define parameter grids for each model type
param_grids = {
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet', None],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    },
    'Decision Tree': {
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5],
        'subsample': [0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
}

# Check if the best model has a parameter grid
if best_model_name in param_grids:
    print(f"\nPerforming hyperparameter tuning for {best_model_name}...")
    
    # Get parameter grid for the best model
    param_grid = param_grids[best_model_name]
    
    # Create a new instance of the best model
    tuning_model = models[best_model_name]
    
    # Use RandomizedSearchCV for faster tuning
    random_search = RandomizedSearchCV(
        estimator=tuning_model,
        param_distributions=param_grid,
        n_iter=10,  # Number of parameter settings to try
        cv=5,
        scoring='roc_auc',
        random_state=42,
        n_jobs=-1
    )
    
    # Fit RandomizedSearchCV
    random_search.fit(X_train, y_train)
    
    # Print the best parameters and score
    print(f"\nBest parameters: {random_search.best_params_}")
    print(f"Best cross-validation score: {random_search.best_score_:.4f}")
    
    # Create the best model with optimized parameters
    best_model = random_search.best_estimator_
    
    # Evaluate the tuned model
    y_pred_tuned = best_model.predict(X_test)
    y_prob_tuned = best_model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics for the tuned model
    accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
    precision_tuned = precision_score(y_test, y_pred_tuned)
    recall_tuned = recall_score(y_test, y_pred_tuned)
    f1_tuned = f1_score(y_test, y_pred_tuned)
    roc_auc_tuned = roc_auc_score(y_test, y_prob_tuned)
    
    # Print metrics for the tuned model
    print("\nTuned Model Performance:")
    print(f"  Accuracy: {accuracy_tuned:.4f}")
    print(f"  Precision: {precision_tuned:.4f}")
    print(f"  Recall: {recall_tuned:.4f}")
    print(f"  F1 Score: {f1_tuned:.4f}")
    print(f"  ROC AUC: {roc_auc_tuned:.4f}")
    
    # Print classification report for the tuned model
    print("\nClassification Report for Tuned Model:")
    print(classification_report(y_test, y_pred_tuned))
    
    # Plot confusion matrix for the tuned model
    plt.figure(figsize=(8, 6))
    cm_tuned = confusion_matrix(y_test, y_pred_tuned)
    sns.heatmap(cm_tuned, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['No Rain', 'Rain'],
                yticklabels=['No Rain', 'Rain'])
    plt.title(f'Confusion Matrix - Tuned {best_model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'confusion_matrix_tuned_{best_model_name.replace(" ", "_").lower()}.png')
    plt.close()
    
    # Train the final model on the full dataset
    print("\nTraining the final model on the full dataset...")
    best_model.fit(X, y)
else:
    print(f"\nHyperparameter grid not defined for {best_model_name}. Skipping tuning.")
    # Train the final model on the full dataset
    print("Training the final model on the full dataset...")
    best_model.fit(X, y)

# 8. Generate predictions for the next 21 days
print("\n8. Generating predictions for the next 21 days")
print("-" * 50)

# ... (previous code remains the same until future date generation)

# Get the last date in the dataset
last_date = df_processed['date'].max()
print(f"Last date in the dataset: {last_date}")

# Generate dates for the next 21 days
future_dates = [last_date + timedelta(days=i+1) for i in range(21)]
print(f"Future dates to predict: {future_dates[0]} to {future_dates[-1]}")

# Create a DataFrame for future dates
future_df = pd.DataFrame({'date': future_dates})

# Extract features from dates
future_df['month'] = future_df['date'].dt.month
future_df['day_of_week'] = future_df['date'].dt.dayofweek
future_df['day_of_year'] = future_df['date'].dt.dayofyear

# Add season based on month
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

# Create season column and then one-hot encode it
future_df['season'] = future_df['month'].apply(get_season)
season_dummies = pd.get_dummies(future_df['season'], prefix='season')
future_df = pd.concat([future_df, season_dummies], axis=1)
future_df.drop('season', axis=1, inplace=True)

# Simulate weather features for future days
print("\nSimulating weather features for future predictions...")
last_7_days = df_processed.sort_values('date').tail(7)

# Calculate average values for numerical features
weather_features = ['avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover', 'pressure']
for feature in weather_features:
    if feature in df_processed.columns:
        avg_value = last_7_days[feature].mean()
        std_value = last_7_days[feature].std()
        # Add some random variation
        future_df[feature] = np.random.normal(avg_value, std_value/2, len(future_df))
        print(f"Average {feature} for last 7 days: {avg_value:.2f}")

# Add binning features (if they exist in original data)
binning_features = [col for col in X.columns if '_bins' in col]
for feature in binning_features:
    future_df[feature] = 0  # Initialize with 0

# Ensure all required columns are present
for col in X.columns:
    if col not in future_df.columns:
        future_df[col] = 0

# Make sure future_df has the same columns as X (in the same order)
future_X = future_df[X.columns]

# Make predictions
future_preds_binary = best_model.predict(future_X)
future_preds_prob = best_model.predict_proba(future_X)[:, 1]

# Add predictions to the future DataFrame
future_df['rain_prediction'] = future_preds_binary
future_df['rain_probability'] = future_preds_prob

# Display future predictions
print("\nPredictions for the next 21 days:")
prediction_results = future_df[['date', 'rain_prediction', 'rain_probability']].copy()
prediction_results['date'] = prediction_results['date'].dt.strftime('%Y-%m-%d')
display(prediction_results)

# Rest of the code remains the same...

# Visualize predictions
plt.figure(figsize=(14, 8))
plt.bar(range(len(future_df)), future_df['rain_probability'], color=future_df['rain_prediction'].map({0: 'skyblue', 1: 'navy'}))
plt.axhline(y=0.5, color='red', linestyle='--', label='50% Threshold')
plt.xticks(range(len(future_df)), future_df['date'].dt.strftime('%Y-%m-%d'), rotation=45)
plt.title('Rain Probability Forecast for the Next 21 Days')
plt.ylabel('Probability of Rain')
plt.xlabel('Date')
plt.grid(True, axis='y')
plt.ylim(0, 1)
plt.tight_layout()
plt.savefig('future_rain_forecast.png')
plt.close()

# 9. Save the final model
print("\n9. Saving the final model")
print("-" * 50)

# Save the model to disk
pickle.dump(best_model, open('rain_prediction_model.pkl', 'wb'))
print("Model saved as 'rain_prediction_model.pkl'")

# Save the feature list
with open('model_features.txt', 'w') as f:
    f.write('\n'.join(X.columns))
print("Feature list saved as 'model_features.txt'")

# Save future predictions
future_df[['date', 'rain_prediction', 'rain_probability']].to_csv('future_rain_predictions.csv', index=False)
print("Future predictions saved as 'future_rain_predictions.csv'")

print("\nModel training, evaluation, and prediction completed successfully!")


Weather Forecasting Challenge - Model Training and Evaluation

1. Loading the processed data
--------------------------------------------------
Loaded dataset shape: (311, 17)


Unnamed: 0,date,avg_temperature,humidity,avg_wind_speed,rain_or_not,cloud_cover,pressure,rain_or_not_binary,month,day_of_week,day_of_year,season,avg_temperature_bins,humidity_bins,avg_wind_speed_bins,cloud_cover_bins,pressure_bins
0,2023-01-01,-0.346303,-0.479808,0.057716,Rain,-1.027348,-0.281128,1,1,6,1,Winter,"(-0.655, -0.253]","(-0.813, -0.303]","(-0.0421, 0.187]","(-1.049, -0.649]","(-0.727, -0.244]"
1,2023-01-02,0.601659,0.253825,-0.415527,Rain,1.52834,1.257899,1,1,0,2,Winter,"(0.444, 0.8]","(0.0874, 0.412]","(-0.593, -0.3]","(1.337, 1.768]","(1.071, 1.326]"
2,2023-01-03,0.35049,-0.195465,1.078158,Rain,-1.353475,1.150926,1,1,1,3,Winter,"(0.171, 0.444]","(-0.303, 0.0874]","(0.838, 1.126]","(-1.418, -1.049]","(1.071, 1.326]"
3,2023-01-04,0.227672,-0.107915,-0.318667,Rain,1.127882,-1.127123,1,1,2,4,Winter,"(0.171, 0.444]","(-0.303, 0.0874]","(-0.593, -0.3]","(1.013, 1.337]","(-1.412, -1.072]"
4,2023-01-05,-0.362306,0.144312,0.903535,Rain,-0.418186,1.01445,1,1,3,5,Winter,"(-0.655, -0.253]","(0.0874, 0.412]","(0.838, 1.126]","(-0.649, -0.257]","(0.725, 1.071]"



Checking data types:


date                    datetime64[ns]
avg_temperature                float64
humidity                       float64
avg_wind_speed                 float64
rain_or_not                     object
cloud_cover                    float64
pressure                       float64
rain_or_not_binary               int64
month                            int64
day_of_week                      int64
day_of_year                      int64
season                          object
avg_temperature_bins            object
humidity_bins                   object
avg_wind_speed_bins             object
cloud_cover_bins                object
pressure_bins                   object
dtype: object


One-hot encoding the 'season' feature...

2. Preparing features and target
--------------------------------------------------

Features to be used in modeling:


['avg_temperature',
 'humidity',
 'avg_wind_speed',
 'cloud_cover',
 'pressure',
 'month',
 'day_of_week',
 'day_of_year',
 'avg_temperature_bins',
 'humidity_bins',
 'avg_wind_speed_bins',
 'cloud_cover_bins',
 'pressure_bins',
 'season_Fall',
 'season_Spring',
 'season_Summer',
 'season_Winter']

Feature matrix shape: (311, 17)
Target vector shape: (311,)

3. Splitting the data into training and testing sets
--------------------------------------------------
Training set shape: X_train: (248, 17), y_train: (248,)
Testing set shape: X_test: (63, 17), y_test: (63,)

4. Training and evaluating multiple models
--------------------------------------------------

Training and evaluating Logistic Regression...
  Accuracy: 0.6667
  Precision: 0.7021
  Recall: 0.8250
  F1 Score: 0.7586
  ROC AUC: 0.7359

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.39      0.46        23
           1       0.70      0.82      0.76        40

    accuracy                           0.67        63
   macro avg       0.63      0.61      0.61        63
weighted avg       0.65      0.67      0.65        63


Training and evaluating Decision Tree...
  Accuracy: 0.6349
  Precision: 0.6977
  Recall: 0.7500
  F1 Score: 0.7229
  ROC AUC: 0.5924

Class

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc
Logistic Regression,0.666667,0.702128,0.825,0.758621,0.73587
Decision Tree,0.634921,0.697674,0.75,0.722892,0.592391
Random Forest,0.634921,0.673469,0.825,0.741573,0.65163
Gradient Boosting,0.587302,0.659091,0.725,0.690476,0.667391
XGBoost,0.587302,0.645833,0.775,0.704545,0.617391



Best model based on ROC AUC: Logistic Regression

5. Cross-validation of the best model
--------------------------------------------------

Cross-validation ROC AUC scores for Logistic Regression:
[0.85326087 0.74886364 0.61136364 0.60423634 0.73690078]
Mean ROC AUC: 0.7109
Standard Deviation: 0.0935

6. Feature Importance Analysis
--------------------------------------------------

Feature Coefficients:


Unnamed: 0,Feature,Coefficient
1,humidity,0.708052
16,season_Winter,0.605357
2,avg_wind_speed,0.283179
4,pressure,0.181307
15,season_Summer,0.143587
0,avg_temperature,0.074158
3,cloud_cover,0.066553
6,day_of_week,0.051808
7,day_of_year,0.00275
9,humidity_bins,0.0



7. Hyperparameter Tuning
--------------------------------------------------

Performing hyperparameter tuning for Logistic Regression...

Best parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.1}
Best cross-validation score: 0.6975

Tuned Model Performance:
  Accuracy: 0.6825
  Precision: 0.7000
  Recall: 0.8750
  F1 Score: 0.7778
  ROC AUC: 0.7109

Classification Report for Tuned Model:
              precision    recall  f1-score   support

           0       0.62      0.35      0.44        23
           1       0.70      0.88      0.78        40

    accuracy                           0.68        63
   macro avg       0.66      0.61      0.61        63
weighted avg       0.67      0.68      0.66        63


Training the final model on the full dataset...

8. Generating predictions for the next 21 days
--------------------------------------------------
Last date in the dataset: 2023-11-07 00:00:00
Future dates to predict: 2023-11-08 00:00:00 to 2023-11-28 00:00:00

Simulati

Unnamed: 0,date,rain_prediction,rain_probability
0,2023-11-08,0,0.403533
1,2023-11-09,0,0.433503
2,2023-11-10,0,0.486951
3,2023-11-11,0,0.451028
4,2023-11-12,1,0.533671
5,2023-11-13,0,0.347719
6,2023-11-14,0,0.438432
7,2023-11-15,0,0.443458
8,2023-11-16,0,0.422231
9,2023-11-17,0,0.487101



9. Saving the final model
--------------------------------------------------
Model saved as 'rain_prediction_model.pkl'
Feature list saved as 'model_features.txt'
Future predictions saved as 'future_rain_predictions.csv'

Model training, evaluation, and prediction completed successfully!


<Figure size 1400x800 with 0 Axes>