In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Load and preprocess data
def load_and_preprocess_data(file_path):
    # Load dataset
    df = pd.read_csv(file_path)
    
    # Convert date column to datetime and sort
    df['fecha'] = pd.to_datetime(df['fecha'])
    df = df.sort_values('fecha')
    
    # Create precipitation classes
    conditions = [
        df['precipitacion_total'] == 0,
        (df['precipitacion_total'] > 0) & (df['precipitacion_total'] <= 2),
        (df['precipitacion_total'] > 2) & (df['precipitacion_total'] <= 15),
        df['precipitacion_total'] > 15
    ]
    choices = ['null', 'low', 'moderate', 'intense']
    df['precipitation_class'] = np.select(conditions, choices, default='null')
    
    # Handle missing values
    df = df.dropna()
    
    return df

# Prepare features and targets
def prepare_data(df):
    # Feature columns (excluding date and target columns)
    feature_cols = ['temp_max', 'temp_min', 'presion_max', 'presion_min', 
        'rocio_max', 'rocio_min', 'viento_promedio', 'radiacion']
    
    # Scale features
    scaler = StandardScaler()
    X = scaler.fit_transform(df[feature_cols])
    
    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(df['precipitation_class'])
    y = to_categorical(y)
    
    return X, y, scaler, le

# Create sequences for LSTM
def create_sequences(X, y, time_steps=30):
    X_seq, y_seq = [], []
    for i in range(time_steps, len(X)):
        X_seq.append(X[i-time_steps:i])
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

# Build LSTM model
def build_model(input_shape, num_classes):
    model = Sequential([
        LSTM(64, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(32, return_sequences=False),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer=Adam(learning_rate=0.00001),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

# Main execution
def main():
    # Load and preprocess data
    df = load_and_preprocess_data('dic1985-oct2025.csv')  # Replace with your file path
    
    # Prepare features and targets
    X, y, scaler, le = prepare_data(df)
    
    # Create sequences
    time_steps = 30  # Use 30 days of historical data
    X_seq, y_seq = create_sequences(X, y, time_steps)
    
    # Split data (maintaining temporal order)
    split_idx = int(0.8 * len(X_seq))
    X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
    y_train, y_test = y_seq[:split_idx], y_seq[split_idx:]
    y_train_labels, y_test_labels = y_seq[:split_idx], y_seq[split_idx:]
    print(y_test_labels)
    
    # Build model
    model = build_model((X_train.shape[1], X_train.shape[2]), y_train.shape[1])
    
    # Train model
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[early_stop],
        verbose=1,
        shuffle=False
    )
    
    # Evaluate model
    train_loss = model.evaluate(X_train, y_train, verbose=0)
    test_loss = model.evaluate(X_test, y_test, verbose=0)

    print(f"\nTraining Loss: {train_loss[0]:.4f}, Accuracy: {train_loss[1]:.4f}")
    print(f"Test Loss: {test_loss[0]:.4f}, Accuracy: {test_loss[1]:.4f}")

    # Make predictions
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = np.argmax(y_pred_proba, axis=1)
    
    # Convert back to original labels
    y_pred_labels = le.inverse_transform(y_pred)
    y_actual_labels = le.inverse_transform(y_test_labels)

    # Additional classification metrics that are more appropriate
    print(f"\nClassification Report:")
    print(classification_report(y_actual_labels, y_pred_labels))
    
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_actual_labels, y_pred_labels))

    # Save model and preprocessors
    #model.save('rainfall_lstm_model.h5')
    #import joblib
    #joblib.dump(scaler, 'scaler.pkl')
    #joblib.dump(le, 'label_encoder.pkl')

if __name__ == '__main__':
    main()

[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]
Epoch 1/100


  super().__init__(**kwargs)


[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5223 - loss: 1.3160 - val_accuracy: 0.3849 - val_loss: 1.3431
Epoch 2/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5538 - loss: 1.2578 - val_accuracy: 0.4003 - val_loss: 1.3005
Epoch 3/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5581 - loss: 1.1936 - val_accuracy: 0.4041 - val_loss: 1.2602
Epoch 4/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5594 - loss: 1.1366 - val_accuracy: 0.4059 - val_loss: 1.2258
Epoch 5/100
[1m316/364[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 4ms/step - accuracy: 0.5752 - loss: 1.0839

KeyboardInterrupt: 

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Load and preprocess data
def load_and_preprocess_data(file_path):
    # Load dataset
    df = pd.read_csv(file_path)
    
    # Convert date column to datetime and sort
    df['fecha'] = pd.to_datetime(df['fecha'])
    df = df.sort_values('fecha')
    
    # Create precipitation classes
    conditions = [
        df['precipitacion_total'] == 0,
        (df['precipitacion_total'] > 0) & (df['precipitacion_total'] <= 2),
        (df['precipitacion_total'] > 2) & (df['precipitacion_total'] <= 15),
        df['precipitacion_total'] > 15
    ]
    choices = ['null', 'low', 'moderate', 'intense']
    df['precipitation_class'] = np.select(conditions, choices, default='null')
    
    # Handle missing values
    df = df.dropna()
    
    return df

# Prepare features and targets
def prepare_data(df):
    # Feature columns (excluding date and target columns)
    feature_cols = ['temp_max', 'temp_min', 'presion_max', 'presion_min', 
        'rocio_max', 'rocio_min', 'viento_promedio', 'radiacion']
    
    # Scale features
    scaler = StandardScaler()
    X = scaler.fit_transform(df[feature_cols])
    
    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(df['precipitation_class'])
    y_categorical = to_categorical(y)
    
    return X, y, y_categorical, scaler, le

# Create sequences for LSTM
def create_sequences(X, y, y_categorical, time_steps=30):
    X_seq, y_seq, y_cat_seq = [], [], []
    for i in range(time_steps, len(X)):
        X_seq.append(X[i-time_steps:i])
        y_seq.append(y[i])
        y_cat_seq.append(y_categorical[i])
    return np.array(X_seq), np.array(y_seq), np.array(y_cat_seq)

# Build LSTM model
def build_model(input_shape, num_classes):
    model = Sequential([
        LSTM(64, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(32, return_sequences=False),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

# Main execution
def main():
    # Load and preprocess data
    df = load_and_preprocess_data('dic1985-oct2025.csv')  # Replace with your file path
    
    # Prepare features and targets
    X, y, y_categorical, scaler, le = prepare_data(df)
    
    # Create sequences
    time_steps = 30  # Use 30 days of historical data
    X_seq, y_seq, y_cat_seq = create_sequences(X, y, y_categorical, time_steps)
    
    # Split data (maintaining temporal order)
    split_idx = int(0.8 * len(X_seq))
    X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
    y_train, y_test = y_cat_seq[:split_idx], y_cat_seq[split_idx:]
    y_train_labels, y_test_labels = y_seq[:split_idx], y_seq[split_idx:]
    
    # Build model
    model = build_model((X_train.shape[1], X_train.shape[2]), y_train.shape[1])
    
    # Train model
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[early_stop],
        verbose=1
    )
    
    # Evaluate model
    train_loss = model.evaluate(X_train, y_train, verbose=0)
    test_loss = model.evaluate(X_test, y_test, verbose=0)

    print(f"\nTraining Loss: {train_loss[0]:.4f}, Accuracy: {train_loss[1]:.4f}")
    print(f"Test Loss: {test_loss[0]:.4f}, Accuracy: {test_loss[1]:.4f}")

    # Make predictions
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = np.argmax(y_pred_proba, axis=1)
    
    # Convert back to original labels
    y_pred_labels = le.inverse_transform(y_pred)
    y_actual_labels = le.inverse_transform(y_test_labels)
    
    # Additional classification metrics that are more appropriate
    print(f"\nClassification Report:")
    print(classification_report(y_actual_labels, y_pred_labels))
    
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_actual_labels, y_pred_labels))

    # Save model and preprocessors
    model.save('rainfall_lstm_model.h5')
    import joblib
    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(le, 'label_encoder.pkl')

if __name__ == '__main__':
    main()

Epoch 1/100


  super().__init__(**kwargs)


[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6169 - loss: 0.8731 - val_accuracy: 0.5222 - val_loss: 1.0506
Epoch 2/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6503 - loss: 0.7858 - val_accuracy: 0.5914 - val_loss: 0.9788
Epoch 3/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6652 - loss: 0.7628 - val_accuracy: 0.5938 - val_loss: 0.9564
Epoch 4/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6690 - loss: 0.7521 - val_accuracy: 0.5928 - val_loss: 0.9485
Epoch 5/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6691 - loss: 0.7474 - val_accuracy: 0.5769 - val_loss: 0.9803
Epoch 6/100
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6719 - loss: 0.7424 - val_accuracy: 0.5356 - val_loss: 1.0647
Epoch 7/100
[1m364/364[0m [32m━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
              precision    recall  f1-score   support

     intense       0.00      0.00      0.00       166
         low       0.39      0.34      0.36       775
    moderate       0.54      0.56      0.55       747
        null       0.71      0.86      0.78      1217

    accuracy                           0.59      2905
   macro avg       0.41      0.44      0.42      2905
weighted avg       0.54      0.59      0.56      2905


Confusion Matrix:
[[   0   33  130    3]
 [   0  260  183  332]
 [   0  247  416   84]
 [   0  124   47 1046]]
