# Sri Lanka Weather Prediction Model - Implementation

In [1]:
# Install required packages
!pip install pandas numpy scikit-learn xgboost lightgbm tensorflow keras plotly seaborn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import lightgbm as lgb

In [4]:
# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [5]:
import joblib
import pickle
from google.colab import files

In [6]:
class SriLankaWeatherPredictor:
    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.label_encoders = {}
        self.feature_columns = []
        self.target_columns = [
            'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean',
            'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean',
            'precipitation_sum', 'rain_sum', 'windspeed_10m_max', 'windgusts_10m_max',
            'shortwave_radiation_sum', 'et0_fao_evapotranspiration', 'precipitation_hours'
        ]

    def load_and_preprocess_data(self, file_path):
        """Load and preprocess the weather data"""
        print("Loading data...")
        df = pd.read_csv(file_path)

        # Convert time to datetime
        df['time'] = pd.to_datetime(df['time'])
        df['sunrise'] = pd.to_datetime(df['sunrise'])
        df['sunset'] = pd.to_datetime(df['sunset'])

        # Extract time features
        df['year'] = df['time'].dt.year
        df['month'] = df['time'].dt.month
        df['day'] = df['time'].dt.day
        df['day_of_year'] = df['time'].dt.dayofyear
        df['week_of_year'] = df['time'].dt.isocalendar().week
        df['season'] = df['month'].apply(self._get_season)

        # Calculate daylight hours
        df['daylight_hours'] = (df['sunset'] - df['sunrise']).dt.total_seconds() / 3600

        # Add cyclical features for better temporal understanding
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
        df['day_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        df['day_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)

        # Encode categorical variables
        le_city = LabelEncoder()
        le_country = LabelEncoder()
        df['city_encoded'] = le_city.fit_transform(df['city'])
        df['country_encoded'] = le_country.fit_transform(df['country'])

        self.label_encoders['city'] = le_city
        self.label_encoders['country'] = le_country

        # Create lag features for time series
        df = df.sort_values(['city', 'time'])
        for col in ['temperature_2m_mean', 'precipitation_sum', 'windspeed_10m_max']:
            df[f'{col}_lag1'] = df.groupby('city')[col].shift(1)
            df[f'{col}_lag7'] = df.groupby('city')[col].shift(7)
            df[f'{col}_ma7'] = df.groupby('city')[col].rolling(window=7).mean().values

        # Calculate weather stability indicators
        df['temp_range'] = df['temperature_2m_max'] - df['temperature_2m_min']
        df['apparent_temp_range'] = df['apparent_temperature_max'] - df['apparent_temperature_min']

        # Drop rows with NaN values (from lag features)
        df = df.dropna()

        print(f"Data shape after preprocessing: {df.shape}")
        return df

    def _get_season(self, month):
        """Get season based on month (Sri Lankan climate)"""
        if month in [12, 1, 2]:
            return 'NE_Monsoon'  # Northeast Monsoon
        elif month in [3, 4, 5]:
            return 'Inter_Monsoon_1'  # First Inter-monsoon
        elif month in [6, 7, 8, 9]:
            return 'SW_Monsoon'  # Southwest Monsoon
        else:
            return 'Inter_Monsoon_2'  # Second Inter-monsoon

    def prepare_features(self, df):
        """Prepare feature matrix"""
        feature_cols = [
            'weathercode', 'latitude', 'longitude', 'elevation',
            'year', 'month', 'day', 'day_of_year', 'week_of_year',
            'month_sin', 'month_cos', 'day_sin', 'day_cos',
            'city_encoded', 'country_encoded', 'daylight_hours',
            'winddirection_10m_dominant'
        ]

        # Add lag features
        lag_features = [col for col in df.columns if 'lag' in col or 'ma7' in col]
        feature_cols.extend(lag_features)

        # Add season encoding
        season_dummies = pd.get_dummies(df['season'], prefix='season')
        df = pd.concat([df, season_dummies], axis=1)
        feature_cols.extend(season_dummies.columns.tolist())

        self.feature_columns = feature_cols
        return df[feature_cols], df[self.target_columns]

    def train_ensemble_model(self, X, y):
        """Train ensemble of models for maximum accuracy"""
        print("Training ensemble models...")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, shuffle=True
        )

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        self.scalers['feature_scaler'] = scaler

        # Scale targets
        target_scaler = StandardScaler()
        y_train_scaled = target_scaler.fit_transform(y_train)
        y_test_scaled = target_scaler.transform(y_test)
        self.scalers['target_scaler'] = target_scaler

        models_performance = {}

        # 1. XGBoost Model
        print("Training XGBoost...")
        xgb_model = MultiOutputRegressor(
            xgb.XGBRegressor(
                n_estimators=300,
                max_depth=8,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1
            )
        )
        xgb_model.fit(X_train_scaled, y_train_scaled)
        self.models['xgboost'] = xgb_model

        # Evaluate XGBoost
        y_pred_xgb = xgb_model.predict(X_test_scaled)
        y_pred_xgb = target_scaler.inverse_transform(y_pred_xgb)
        models_performance['xgboost'] = self.evaluate_model(y_test, y_pred_xgb)

        # 2. LightGBM Model
        print("Training LightGBM...")
        lgb_model = MultiOutputRegressor(
            lgb.LGBMRegressor(
                n_estimators=300,
                max_depth=8,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1,
                verbose=-1
            )
        )
        lgb_model.fit(X_train_scaled, y_train_scaled)
        self.models['lightgbm'] = lgb_model

        # Evaluate LightGBM
        y_pred_lgb = lgb_model.predict(X_test_scaled)
        y_pred_lgb = target_scaler.inverse_transform(y_pred_lgb)
        models_performance['lightgbm'] = self.evaluate_model(y_test, y_pred_lgb)

        # 3. Random Forest Model
        print("Training Random Forest...")
        rf_model = MultiOutputRegressor(
            RandomForestRegressor(
                n_estimators=200,
                max_depth=15,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1
            )
        )
        rf_model.fit(X_train_scaled, y_train_scaled)
        self.models['random_forest'] = rf_model

        # Evaluate Random Forest
        y_pred_rf = rf_model.predict(X_test_scaled)
        y_pred_rf = target_scaler.inverse_transform(y_pred_rf)
        models_performance['random_forest'] = self.evaluate_model(y_test, y_pred_rf)

        # 4. Neural Network Model
        print("Training Neural Network...")
        nn_model = self.build_neural_network(X_train_scaled.shape[1], len(self.target_columns))

        early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.0001)

        history = nn_model.fit(
            X_train_scaled, y_train_scaled,
            validation_data=(X_test_scaled, y_test_scaled),
            epochs=100,
            batch_size=32,
            callbacks=[early_stopping, reduce_lr],
            verbose=0
        )

        self.models['neural_network'] = nn_model

        # Evaluate Neural Network
        y_pred_nn = nn_model.predict(X_test_scaled)
        y_pred_nn = target_scaler.inverse_transform(y_pred_nn)
        models_performance['neural_network'] = self.evaluate_model(y_test, y_pred_nn)

        # 5. Create Ensemble Model
        print("Creating ensemble model...")
        # Weight models based on their performance
        weights = self.calculate_model_weights(models_performance)

        # Ensemble prediction
        ensemble_pred = (
            weights['xgboost'] * y_pred_xgb +
            weights['lightgbm'] * y_pred_lgb +
            weights['random_forest'] * y_pred_rf +
            weights['neural_network'] * y_pred_nn
        )

        models_performance['ensemble'] = self.evaluate_model(y_test, ensemble_pred)
        self.model_weights = weights

        # Print performance comparison
        print("\n=== Model Performance Comparison ===")
        for model_name, metrics in models_performance.items():
            print(f"{model_name.upper()}:")
            print(f"  MAE: {metrics['mae']:.4f}")
            print(f"  RMSE: {metrics['rmse']:.4f}")
            print(f"  R²: {metrics['r2']:.4f}")
            print()

        return models_performance

    def build_neural_network(self, input_dim, output_dim):
        """Build neural network model"""
        model = Sequential([
            Dense(512, activation='relu', input_dim=input_dim),
            Dropout(0.3),
            Dense(256, activation='relu'),
            Dropout(0.3),
            Dense(128, activation='relu'),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dense(output_dim, activation='linear')
        ])

        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='mse',
            metrics=['mae']
        )

        return model

    def calculate_model_weights(self, performance):
        """Calculate ensemble weights based on model performance"""
        # Use R² scores to determine weights (higher R² gets higher weight)
        r2_scores = {model: metrics['r2'] for model, metrics in performance.items()}
        total_r2 = sum(r2_scores.values())

        weights = {model: score / total_r2 for model, score in r2_scores.items()}
        return weights

    def evaluate_model(self, y_true, y_pred):
        """Calculate evaluation metrics"""
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        return {
            'mae': mae,
            'rmse': rmse,
            'r2': r2
        }

    def predict_weather(self, date, city, latitude=None, longitude=None, elevation=None):
        """Predict weather for a specific date and city"""
        # Create feature vector for prediction
        date_obj = pd.to_datetime(date)

        # Get city encoding
        if city in self.label_encoders['city'].classes_:
            city_encoded = self.label_encoders['city'].transform([city])[0]
        else:
            # Use average city encoding for unknown cities
            city_encoded = 0

        # Create feature vector
        features = {
            'weathercode': 1,  # Default clear weather
            'latitude': latitude or 7.8731,  # Default to Colombo
            'longitude': longitude or 80.7718,
            'elevation': elevation or 8,
            'year': date_obj.year,
            'month': date_obj.month,
            'day': date_obj.day,
            'day_of_year': date_obj.dayofyear,
            'week_of_year': date_obj.isocalendar().week,
            'month_sin': np.sin(2 * np.pi * date_obj.month / 12),
            'month_cos': np.cos(2 * np.pi * date_obj.month / 12),
            'day_sin': np.sin(2 * np.pi * date_obj.dayofyear / 365),
            'day_cos': np.cos(2 * np.pi * date_obj.dayofyear / 365),
            'city_encoded': city_encoded,
            'country_encoded': 0,  # Sri Lanka
            'daylight_hours': 12,  # Approximate for Sri Lanka
            'winddirection_10m_dominant': 180
        }

        # Add seasonal features
        season = self._get_season(date_obj.month)
        for s in ['NE_Monsoon', 'Inter_Monsoon_1', 'SW_Monsoon', 'Inter_Monsoon_2']:
            features[f'season_{s}'] = 1 if season == s else 0

        # Add lag features (use defaults for prediction)
        lag_defaults = {
            'temperature_2m_mean_lag1': 26.0,
            'temperature_2m_mean_lag7': 26.0,
            'temperature_2m_mean_ma7': 26.0,
            'precipitation_sum_lag1': 0.0,
            'precipitation_sum_lag7': 0.0,
            'precipitation_sum_ma7': 0.0,
            'windspeed_10m_max_lag1': 10.0,
            'windspeed_10m_max_lag7': 10.0,
            'windspeed_10m_max_ma7': 10.0
        }
        features.update(lag_defaults)

        # Create feature vector in correct order
        feature_vector = np.array([features[col] for col in self.feature_columns]).reshape(1, -1)

        # Scale features
        feature_vector_scaled = self.scalers['feature_scaler'].transform(feature_vector)

        # Make ensemble prediction
        predictions = {}
        for model_name, model in self.models.items():
            if model_name == 'neural_network':
                pred_scaled = model.predict(feature_vector_scaled)
            else:
                pred_scaled = model.predict(feature_vector_scaled)

            pred = self.scalers['target_scaler'].inverse_transform(pred_scaled)
            predictions[model_name] = pred[0]

        # Calculate ensemble prediction
        ensemble_pred = np.zeros(len(self.target_columns))
        for model_name, weight in self.model_weights.items():
            ensemble_pred += weight * predictions[model_name]

        # Create result dictionary
        result = {}
        for i, col in enumerate(self.target_columns):
            result[col] = float(ensemble_pred[i])

        return result

    def save_model(self, base_filename='sri_lanka_weather_model'):
        """Save all models and preprocessors"""
        print("Saving models...")

        # Save sklearn models
        for name, model in self.models.items():
            if name != 'neural_network':
                joblib.dump(model, f'{base_filename}_{name}.pkl')

        # Save neural network
        if 'neural_network' in self.models:
            self.models['neural_network'].save(f'{base_filename}_neural_network.h5')

        # Save scalers and encoders
        joblib.dump(self.scalers, f'{base_filename}_scalers.pkl')
        joblib.dump(self.label_encoders, f'{base_filename}_label_encoders.pkl')
        joblib.dump(self.feature_columns, f'{base_filename}_feature_columns.pkl')
        joblib.dump(self.target_columns, f'{base_filename}_target_columns.pkl')
        joblib.dump(self.model_weights, f'{base_filename}_weights.pkl')

        print("Models saved successfully!")

        # Download files
        try:
            files.download(f'{base_filename}_xgboost.pkl')
            files.download(f'{base_filename}_lightgbm.pkl')
            files.download(f'{base_filename}_random_forest.pkl')
            files.download(f'{base_filename}_neural_network.h5')
            files.download(f'{base_filename}_scalers.pkl')
            files.download(f'{base_filename}_label_encoders.pkl')
            files.download(f'{base_filename}_feature_columns.pkl')
            files.download(f'{base_filename}_target_columns.pkl')
            files.download(f'{base_filename}_weights.pkl')
        except:
            print("Files saved locally. Use files.download() to download manually if needed.")

In [7]:
# Main execution
if __name__ == "__main__":
    # Initialize predictor
    predictor = SriLankaWeatherPredictor()

    # Upload and load data
    print("Please upload your CSV file...")
    uploaded = files.upload()

    file_name = list(uploaded.keys())[0]
    print(f"Processing file: {file_name}")

    # Load and preprocess data
    df = predictor.load_and_preprocess_data(file_name)

    # Prepare features
    X, y = predictor.prepare_features(df)

    # Train models
    performance = predictor.train_ensemble_model(X, y)

    # Test prediction
    print("\n=== Testing Prediction ===")
    test_prediction = predictor.predict_weather('2024-06-15', 'Colombo', 6.9271, 79.8612, 8)
    print("Weather prediction for Colombo on 2024-06-15:")
    for key, value in test_prediction.items():
        print(f"{key}: {value:.2f}")

    # Save models
    predictor.save_model()

    print("\n=== Model Training Complete! ===")
    print("All model files have been saved and are ready for download.")
    print("Use these files in your backend API for weather predictions.")

Please upload your CSV file...


Saving SriLanka_Weather_Dataset.csv to SriLanka_Weather_Dataset.csv
Processing file: SriLanka_Weather_Dataset.csv
Loading data...
Data shape after preprocessing: (147270, 48)
Training ensemble models...
Training XGBoost...
Training LightGBM...
Training Random Forest...
Training Neural Network...
[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Creating ensemble model...

=== Model Performance Comparison ===
XGBOOST:
  MAE: 0.9226
  RMSE: 1.8413
  R²: 0.9188

LIGHTGBM:
  MAE: 1.1731
  RMSE: 2.2105
  R²: 0.8827

RANDOM_FOREST:
  MAE: 1.0360
  RMSE: 2.0638
  R²: 0.8943

NEURAL_NETWORK:
  MAE: 1.2841
  RMSE: 2.3274
  R²: 0.8633

ENSEMBLE:
  MAE: 1.0471
  RMSE: 1.9917
  R²: 0.9028


=== Testing Prediction ===
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Weather prediction for Colombo on 2024-06-15:
temperature_2m_max: 30.26
temperature_2m_min: 22.34
temperature_2m_mean: 26.36
apparent_temperature_max: 35.05
apparent_temperature_min: 27.05



Models saved successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


=== Model Training Complete! ===
All model files have been saved and are ready for download.
Use these files in your backend API for weather predictions.
