# Train a separate model to mask out the water pixels

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 1: Read the CSV file into a DataFrame
csv_path = "/home/jovyan/shared-public/ml_swe_monitoring_prod/test_data_predicted_latest_2023-12-15.csv"
df = pd.read_csv(csv_path)

# Step 2: Inspect the DataFrame
print("First few rows of the dataset:")
print(df.head())

print("\nColumns in the dataset:")
print(df.columns)

# Step 2 (continued): Get basic statistics for numeric columns
numeric_stats = df.describe()
print("Basic statistics for numeric columns:")
print(numeric_stats)

# Step 3: Data Preprocessing
# Report the non-numeric columns BEFORE they are dropped. Running this check
# after the numeric-only filter (as the cell previously did) can only ever
# print an empty Index, which hides which columns were actually removed.
non_numeric_columns = df.select_dtypes(exclude=['number']).columns
print(f"\nNon-numeric columns: {non_numeric_columns}")

# Drop any columns that are not relevant for training (e.g., date columns).
# `df` is rebound to the numeric-only frame; the cell re-reads the CSV on every
# run, so re-executing it still starts from the raw data.
df = df.select_dtypes(include=['number'])  # Keep only numeric columns

First few rows of the dataset:
    lat      lon  relative_humidity_rmax  mean_vapor_pressure_deficit  \
0  49.0 -125.000                     0.0                          0.0   
1  49.0 -124.964                     0.0                          0.0   
2  49.0 -124.928                     0.0                          0.0   
3  49.0 -124.892                     0.0                          0.0   
4  49.0 -124.856                     0.0                          0.0   

   relative_humidity_rmin  precipitation_amount  wind_speed  \
0                     0.0                   0.0         0.0   
1                     0.0                   0.0         0.0   
2                     0.0                   0.0         0.0   
3                     0.0                   0.0         0.0   
4                     0.0                   0.0         0.0   

   potential_evapotranspiration  air_temperature_tmmx  air_temperature_tmmn  \
0                           0.0                   0.0                   

In [None]:

# Step 4: Define features and target
target_column = 'fsca'

# Fail fast if the target is absent from `df`, before any modelling work starts
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the DataFrame.")

# The target is the single named column; the features are every other column
target = df[target_column]
features = df.drop(columns=[target_column])

# Step 5: Split the data into training and testing sets
# (20% of the rows are held out; the fixed random_state makes the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Step 6: Initialize and train the model
# (`fit` returns the fitted estimator itself, so construction and training chain)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 7: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 8: Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
print(f"\nMean Squared Error on the test set: {mse:.4f}")

# Optional: rank the input columns by the forest's feature importances
feature_importances = model.feature_importances_
important_features = pd.DataFrame(
    {'Feature': features.columns, 'Importance': feature_importances}
)
print("\nFeature importances:")
print(important_features.sort_values(by='Importance', ascending=False))