In [0]:
%pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, month, year, avg, to_date
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline

# Obtain the Spark session for this notebook (getOrCreate returns the active
# session when one already exists).
session_builder = SparkSession.builder.appName("Evapotranspiration_MLlib")
spark = session_builder.getOrCreate()

# Read the weather CSV: the first row holds the column names and Spark infers
# the column types from the data.
csv_reader = spark.read.options(header="true", inferSchema="true")
df = csv_reader.csv("/opt/data/weatherData.csv")

print("Data loaded successfully!")
total_records = df.count()
print(f"Total records: {total_records}")
# Uncomment to inspect the inferred column types:
# df.printSchema()

In [1]:
%pyspark
# Replace the unit-suffixed CSV headers with short names that are easier to
# reference in later cells.
column_renames = {
    "precipitation_hours (h)": "precipitation_hours",
    "sunshine_duration (s)": "sunshine_duration",
    "wind_speed_10m_max (km/h)": "wind_speed",
    "et0_fao_evapotranspiration (mm)": "evapotranspiration",
}
df_clean = df
for original_name, short_name in column_renames.items():
    df_clean = df_clean.withColumnRenamed(original_name, short_name)

# Parse the `date` column with the M/d/yyyy pattern, then derive the calendar
# year and month from the parsed value.
parsed_date = to_date(col("date"), "M/d/yyyy")
df_clean = (
    df_clean
    .withColumn("date_parsed", parsed_date)
    .withColumn("year", year("date_parsed"))
    .withColumn("month", month("date_parsed"))
)

# Keep only the rows that fall in May (month = 5).
df_may = df_clean.filter(col("month") == 5)

# Three predictors plus the target; rows with a null in any of them are dropped.
df_features = (
    df_may
    .select("precipitation_hours", "sunshine_duration", "wind_speed", "evapotranspiration")
    .na.drop()
)

may_record_count = df_features.count()
print(f"May records: {may_record_count}")
df_features.describe().show()


In [2]:
%pyspark
feature_cols = ["precipitation_hours", "sunshine_duration", "wind_speed"]

# Stage 1: pack the predictor columns into a single vector column.
assembler = VectorAssembler(outputCol="features_raw", inputCols=feature_cols)
df_assembled = assembler.transform(df_features)

# Stage 2: centre each feature and scale it to unit standard deviation.
# Note that the scaler here is fitted on every row of df_assembled.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

print("Features assembled and scaled!")
scaled_preview = df_scaled.select("features", "evapotranspiration")
scaled_preview.show(5, truncate=False)

In [3]:
%pyspark
# Split data: 80% training, 20% validation.
# The split is made on the assembled (unscaled) rows and the scaler is then
# fitted on the training portion only, so the validation rows do not
# contribute to the mean/std used to build the training features.
train_assembled, test_assembled = df_assembled.randomSplit([0.8, 0.2], seed=42)

# Rebind scaler_model to the training-only fit so that any later cell that
# scales new inputs uses the same statistics the models were trained with.
scaler_model = scaler.fit(train_assembled)
train_data = scaler_model.transform(train_assembled)
test_data = scaler_model.transform(test_assembled)

print(f"Training samples: {train_data.count()}")
print(f"Validation samples: {test_data.count()}")

In [4]:
%pyspark
# Linear regression with elastic-net regularisation: regParam sets the overall
# penalty strength and elasticNetParam the L1/L2 mix (0.8 leans towards L1).
lr_settings = dict(
    featuresCol="features",
    labelCol="evapotranspiration",
    maxIter=100,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr = LinearRegression(**lr_settings)

lr_model = lr.fit(train_data)

# Report the fitted parameters and the fit quality on the training split.
print("=== Linear Regression Model ===")
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")
training_summary = lr_model.summary
print(f"R-squared (training): {training_summary.r2}")

In [5]:
%pyspark
# Random Forest for comparison with the linear model.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="evapotranspiration",
    numTrees=100,
    maxDepth=10,
    seed=42)

rf_model = rf.fit(train_data)

print("=== Random Forest Model ===")
# Importances are listed in the same order as feature_cols.
print(f"Feature Importances: {rf_model.featureImportances}")
print(f"Features: {feature_cols}")

# Gradient Boosted Trees (added for better non-linear performance).
# GBTRegressor is already imported in the notebook's import cell, so no
# additional import is needed here.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="evapotranspiration",
    maxIter=100,
    seed=42)

gbt_model = gbt.fit(train_data)
print("\n=== Gradient Boosted Trees (GBT) Model ===")
print(f"Feature Importances: {gbt_model.featureImportances}")
print(f"Features: {feature_cols}")

In [6]:
%pyspark
# Model evaluation on the held-out validation split.
evaluator_rmse = RegressionEvaluator(labelCol="evapotranspiration", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="evapotranspiration", predictionCol="prediction", metricName="r2")
evaluator_mae = RegressionEvaluator(labelCol="evapotranspiration", predictionCol="prediction", metricName="mae")


def score_predictions(predictions):
    """Return (rmse, r2, mae) for a DataFrame of model predictions.

    `predictions` must contain the "evapotranspiration" label column and the
    "prediction" column that the three evaluators above are configured to read.
    """
    return (
        evaluator_rmse.evaluate(predictions),
        evaluator_r2.evaluate(predictions),
        evaluator_mae.evaluate(predictions),
    )


# Every model is scored with the same three metrics so they can be compared
# like for like.

# Linear Regression predictions
lr_predictions = lr_model.transform(test_data)
lr_rmse, lr_r2, lr_mae = score_predictions(lr_predictions)

# Random Forest predictions
rf_predictions = rf_model.transform(test_data)
rf_rmse, rf_r2, rf_mae = score_predictions(rf_predictions)

# GBT predictions
gbt_predictions = gbt_model.transform(test_data)
gbt_rmse, gbt_r2, gbt_mae = score_predictions(gbt_predictions)

print("=== Linear Regression Evaluation ===")
print(f"RMSE: {lr_rmse:.4f}, R2: {lr_r2:.4f}, MAE: {lr_mae:.4f}")

print("\n=== Random Forest Evaluation ===")
print(f"RMSE: {rf_rmse:.4f}, R2: {rf_r2:.4f}, MAE: {rf_mae:.4f}")

print("\n=== Gradient Boosted Trees Evaluation ===")
print(f"RMSE: {gbt_rmse:.4f}")
print(f"R-2: {gbt_r2:.4f}")
print(f"MAE: {gbt_mae:.4f}")

In [7]:
%pyspark
# Profile the days whose evapotranspiration is below 1.5 mm.
is_low_et = col("evapotranspiration") < 1.5
low_et = df_features.where(is_low_et)

print("=== Conditions for Evapotranspiration < 1.5mm ===")
low_et_day_count = low_et.count()
print(f"Number of days with ET < 1.5mm: {low_et_day_count}")

# One mean per predictor, each aliased as mean_<column name>.
predictor_names = ("precipitation_hours", "sunshine_duration", "wind_speed")
mean_columns = [avg(name).alias(f"mean_{name}") for name in predictor_names]
low_et_stats = low_et.agg(*mean_columns)

print("\nMean values when evapotranspiration < 1.5mm:")
low_et_stats.show()

In [8]:
%pyspark
import random
from pyspark.sql.types import StructType, StructField, DoubleType

print("PREDICTION: Best Achievable Weather Conditions (GBT + Random Search)")
print("(Goal: Minimal Realistic ET for May using Non-Linear Model)")

# Random search configuration.
# Reference means for days with evapotranspiration < 1.5 mm, recorded by hand:
#   mean_precipitation_hours   - 22.134328358208954
#   mean_sunshine_duration (s) - 1593.0579850746274
#   mean_wind_speed            - 18.490298507462686
# NOTE(review): the original note attributed these values to March, while this
# notebook filters on May (month == 5). Re-check them against the output of
# the low-ET summary cell before relying on the ranges below.

SEARCH_SEED = 42           # Same seed value the split and tree models use, so reruns give the same answer
NUM_SAMPLES = 50000
MIN_SUNSHINE = 0.0         # The reference mean (~1593 s, about 0.44 h) is already close to 0
MAX_SUNSHINE = 3600.0      # Cap at 1 hour (3600s) to force that low-sunshine similarity
MIN_WIND = 15.0            # Range around the 18.5 km/h mean
MAX_WIND = 25.0            # Allow it to go higher to see if more wind helps even more
MIN_PRECIP = 20.0          # Force high rain (centered on 22h)
MAX_PRECIP = 24.0          # Max possible

print(f"Generating {NUM_SAMPLES} synthetic realistic weather points...")

# A dedicated, seeded generator keeps the search reproducible without touching
# the global `random` state. Each tuple is (precipitation, sunshine, wind),
# matching the field order of the schema below.
rng = random.Random(SEARCH_SEED)
synthetic_data = [
    (
        rng.uniform(MIN_PRECIP, MAX_PRECIP),
        rng.uniform(MIN_SUNSHINE, MAX_SUNSHINE),
        rng.uniform(MIN_WIND, MAX_WIND),
    )
    for _ in range(NUM_SAMPLES)
]

# Create DataFrame
schema = StructType([
    StructField("precipitation_hours", DoubleType(), True),
    StructField("sunshine_duration", DoubleType(), True),
    StructField("wind_speed", DoubleType(), True)
])

df_synthetic = spark.createDataFrame(synthetic_data, schema)

# Transform features with the same assembler and fitted scaler used for the
# training data, so the model sees inputs on the scale it was trained on.
df_synthetic_assembled = assembler.transform(df_synthetic)
df_synthetic_scaled = scaler_model.transform(df_synthetic_assembled)

# Predict using GBT
predictions = gbt_model.transform(df_synthetic_scaled)

# Find minimum: ascending sort on the prediction, then take the first row.
best_result = predictions.orderBy("prediction").first()
min_et = best_result["prediction"]

print("\nOptimization Complete.")
print(f"Lowest Predicted ET found: {min_et:.4f} mm")

recommended_stats = {
    'precip': best_result["precipitation_hours"],
    'sun': best_result["sunshine_duration"],
    'wind': best_result["wind_speed"]
}

print("\n*** Recommended Values for May 2026 ***")
print(f"Precipitation Hours: {recommended_stats['precip']:.2f} hours")
print(f"Sunshine Duration: {recommended_stats['sun']:.2f} seconds ({recommended_stats['sun']/3600:.2f} hours)")
print(f"Wind Speed: {recommended_stats['wind']:.2f} km/h")

print(f"\nModel Predicted Evapotranspiration (GBT): {min_et:.4f} mm")

if min_et < 1.5:
    print("SUCCESS: GBT found a realistic scenario < 1.5mm!")
else:
    print("NOTE: Even with GBT, the lowest realistic ET is above 1.5mm.")
    print("      This represents the absolute best-case scenario found by the model.")