## 1. Environment Setup

In [None]:
# Install PySpark (for Google Colab)
!pip install pyspark -q

In [2]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

print("✅ Libraries imported successfully")

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Initialize Spark Session with 1 executor (Sequential Baseline)
import sys

spark = SparkSession.builder \
    .appName("RandomForest_Sequential_Baseline") \
    .master("local[1]") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "4") \
    .config("spark.default.parallelism", "1") \
    .config("spark.pyspark.python", sys.executable) \
    .config("spark.pyspark.driver.python", sys.executable) \
    .getOrCreate()

print(f"✅ Spark Session initialized")
print(f"Spark Version: {spark.version}")
print(f"Master: {spark.sparkContext.master}")
print(f"Executors: 1 (Sequential Mode)")
print(f"Python executable: {sys.executable}")

✅ Spark Session initialized
Spark Version: 4.1.1
Master: local[1]
Executors: 1 (Sequential Mode)
Python executable: c:\Users\debas\workspace\mtech\sem2\FIT\lab1_env\Scripts\python.exe


## 2. Data Loading and Preprocessing

In [None]:
# Download Covertype dataset using scikit-learn (cache locally for reuse)
from sklearn.datasets import fetch_covtype
from pathlib import Path
import os

print("Preparing Covertype dataset...")

data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)
local_csv = data_dir / "covtype.csv"

if local_csv.exists():
    print(f"Loading dataset from local cache: {local_csv}")
    df_pandas = pd.read_csv(local_csv)
else:
    print("Downloading Covertype dataset...")
    covtype = fetch_covtype(as_frame=True)
    df_pandas = covtype.frame
    df_pandas.to_csv(local_csv, index=False)
    print(f"✅ Dataset cached locally: {local_csv}")

print("✅ Dataset ready")
print(f"Shape: {df_pandas.shape}")
print(f"Columns: {df_pandas.columns.tolist()[:5]}...")  # Show first 5 columns

Preparing Covertype dataset...
Loading dataset from local cache: data\covtype.csv
✅ Dataset ready
Shape: (581012, 55)
Columns: ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology']...


In [None]:
# Convert to Spark DataFrame
print("Converting to Spark DataFrame...")
spark_df = spark.createDataFrame(df_pandas)

# Verify data
print(f"✅ Spark DataFrame created")
print(f"Rows: {spark_df.count():,}")
print(f"Columns: {len(spark_df.columns)}")

spark_df.printSchema()

Converting to Spark DataFrame...
✅ Spark DataFrame created


Py4JJavaError: An error occurred while calling o64.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (LAPTOP-VQI0DOIO executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1143)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1073)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:247)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:154)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:158)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:309)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:72)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:107)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:180)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:716)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:719)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessImpl.create(Native Method)
	at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:494)
	at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:159)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1110)
	... 35 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:3122)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3122)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:3114)
	at scala.collection.immutable.List.foreach(List.scala:323)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:3114)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1303)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1303)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1303)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3397)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3328)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3317)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
Caused by: java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1143)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1073)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:247)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:154)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:158)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:309)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:72)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:107)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:180)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:716)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:719)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessImpl.create(Native Method)
	at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:494)
	at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:159)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1110)
	... 35 more


In [None]:
# Data preprocessing: Feature assembly
print("Preparing features...")

# Get feature column names (all except target)
feature_cols = [col for col in spark_df.columns if col != 'Cover_Type']
print(f"Number of features: {len(feature_cols)}")

# Assemble features into a single vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
data = assembler.transform(spark_df)

# Rename target column to 'label' (required by MLlib)
data = data.withColumnRenamed('Cover_Type', 'label')

# Select only necessary columns
data = data.select('features', 'label')

print("✅ Features assembled")
data.show(5)

In [None]:
# Train-test split (80-20)
print("Splitting data into train and test sets...")
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Partition and cache for performance
train_data = train_data.repartition(4).cache()
test_data = test_data.cache()

# Materialize the cache
train_count = train_data.count()
test_count = test_data.count()

print(f"✅ Data split complete")
print(f"Training samples: {train_count:,}")
print(f"Test samples: {test_count:,}")

## 3. Sequential Random Forest Training

Train Random Forest with different tree counts to establish baseline performance.

In [None]:
def train_and_evaluate_rf(train_data, test_data, num_trees, max_depth=10, exp_id=""):
    """
    Train Random Forest and measure performance
    
    Args:
        train_data: Spark DataFrame for training
        test_data: Spark DataFrame for testing
        num_trees: Number of trees in the forest
        max_depth: Maximum depth of each tree
        exp_id: Experiment identifier
    
    Returns:
        Dictionary containing metrics and model
    """
    print(f"\n{'='*60}")
    print(f"Experiment {exp_id}: Training with {num_trees} trees")
    print(f"{'='*60}")
    
    # Configure Random Forest
    rf = RandomForestClassifier(
        numTrees=num_trees,
        maxDepth=max_depth,
        seed=42,
        labelCol='label',
        featuresCol='features'
    )
    
    # Train model and measure time
    print(f"Training started...")
    start_time = time.time()
    model = rf.fit(train_data)
    training_time = time.time() - start_time
    print(f"✅ Training completed in {training_time:.2f} seconds")
    
    # Make predictions and measure time
    print(f"Making predictions...")
    start_time = time.time()
    predictions = model.transform(test_data)
    predictions.cache()
    pred_count = predictions.count()  # Trigger computation
    prediction_time = time.time() - start_time
    print(f"✅ Predictions completed in {prediction_time:.2f} seconds")
    
    # Evaluate accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label',
        predictionCol='prediction',
        metricName='accuracy'
    )
    accuracy = evaluator.evaluate(predictions)
    print(f"✅ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    
    # Collect metrics
    metrics = {
        'experiment_id': exp_id,
        'num_executors': 1,
        'num_trees': num_trees,
        'max_depth': max_depth,
        'training_time': training_time,
        'prediction_time': prediction_time,
        'total_time': training_time + prediction_time,
        'accuracy': accuracy,
        'train_samples': train_count,
        'test_samples': test_count
    }
    
    return {
        'metrics': metrics,
        'model': model,
        'predictions': predictions
    }

In [None]:
# Baseline Experiment 1: 50 trees
result_50 = train_and_evaluate_rf(
    train_data, test_data, 
    num_trees=50, 
    exp_id="B1"
)

In [None]:
# Baseline Experiment 2: 100 trees (standard configuration)
result_100 = train_and_evaluate_rf(
    train_data, test_data, 
    num_trees=100, 
    exp_id="B2"
)

In [None]:
# Baseline Experiment 3: 200 trees
result_200 = train_and_evaluate_rf(
    train_data, test_data, 
    num_trees=200, 
    exp_id="B3"
)

## 4. Results Summary and Export

In [None]:
# Compile all baseline results
baseline_results = pd.DataFrame([
    result_50['metrics'],
    result_100['metrics'],
    result_200['metrics']
])

print("\n" + "="*80)
print("BASELINE RESULTS SUMMARY")
print("="*80)
print(baseline_results.to_string(index=False))
print("="*80)

In [None]:
# Visualize baseline performance
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Training time vs number of trees
axes[0].bar(baseline_results['num_trees'], baseline_results['training_time'], 
            color='steelblue', alpha=0.7)
axes[0].set_xlabel('Number of Trees', fontsize=12)
axes[0].set_ylabel('Training Time (seconds)', fontsize=12)
axes[0].set_title('Sequential Baseline: Training Time', fontsize=14)
axes[0].grid(True, alpha=0.3)

# Add value labels on bars
for i, (trees, time_val) in enumerate(zip(baseline_results['num_trees'], 
                                           baseline_results['training_time'])):
    axes[0].text(trees, time_val, f'{time_val:.1f}s', 
                ha='center', va='bottom', fontsize=10)

# Plot 2: Accuracy vs number of trees
axes[1].plot(baseline_results['num_trees'], baseline_results['accuracy']*100, 
            marker='o', linewidth=2, markersize=8, color='green')
axes[1].set_xlabel('Number of Trees', fontsize=12)
axes[1].set_ylabel('Accuracy (%)', fontsize=12)
axes[1].set_title('Sequential Baseline: Accuracy', fontsize=14)
axes[1].grid(True, alpha=0.3)
axes[1].set_ylim([baseline_results['accuracy'].min()*100-5, 
                   baseline_results['accuracy'].max()*100+5])

plt.tight_layout()
plt.savefig('baseline_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Baseline performance visualization saved")

In [None]:
# Export baseline results for comparison with parallel implementation
import os

# Create results directory if it doesn't exist
os.makedirs('results/metrics', exist_ok=True)

# Save to CSV
baseline_results.to_csv('results/metrics/baseline_results.csv', index=False)
print("✅ Baseline results exported to: results/metrics/baseline_results.csv")

# Also save predictions from 100-tree model for correctness validation
predictions_100 = result_100['predictions'].select('label', 'prediction').toPandas()
predictions_100.to_csv('results/metrics/baseline_predictions_100trees.csv', index=False)
print("✅ Baseline predictions (100 trees) saved for validation")

## 5. Key Findings and Observations

In [None]:
# Analysis
print("\n" + "="*80)
print("KEY FINDINGS FROM SEQUENTIAL BASELINE")
print("="*80)

# Training time scaling
time_50 = result_50['metrics']['training_time']
time_100 = result_100['metrics']['training_time']
time_200 = result_200['metrics']['training_time']

print(f"\n1. Training Time Scaling:")
print(f"   - 50 trees:  {time_50:.2f} seconds")
print(f"   - 100 trees: {time_100:.2f} seconds ({time_100/time_50:.2f}x vs 50 trees)")
print(f"   - 200 trees: {time_200:.2f} seconds ({time_200/time_100:.2f}x vs 100 trees)")
print(f"\n   ➜ Training time scales approximately linearly with number of trees")

# Accuracy analysis
acc_50 = result_50['metrics']['accuracy']
acc_100 = result_100['metrics']['accuracy']
acc_200 = result_200['metrics']['accuracy']

print(f"\n2. Accuracy Analysis:")
print(f"   - 50 trees:  {acc_50:.4f} ({acc_50*100:.2f}%)")
print(f"   - 100 trees: {acc_100:.4f} ({acc_100*100:.2f}%)")
print(f"   - 200 trees: {acc_200:.4f} ({acc_200*100:.2f}%)")
acc_improvement = (acc_200 - acc_50) * 100
print(f"\n   ➜ Accuracy improvement from 50 to 200 trees: {acc_improvement:.2f}%")

# Baseline for speedup calculations
print(f"\n3. Baseline for Parallel Comparison:")
print(f"   - Using 100 trees as standard configuration")
print(f"   - Sequential training time: {time_100:.2f} seconds")
print(f"   - Target speedup with 4 executors: >3.0x")
print(f"   - Expected parallel time (4 executors): <{time_100/3:.2f} seconds")

print("\n" + "="*80)

## 6. Cleanup

In [None]:
# Unpersist cached data
train_data.unpersist()
test_data.unpersist()
result_50['predictions'].unpersist()
result_100['predictions'].unpersist()
result_200['predictions'].unpersist()

print("✅ Cache cleared")

# Note: Don't stop Spark session if running more notebooks in same session
# spark.stop()

## Conclusion

This notebook established the sequential baseline for Random Forest training on the Covertype dataset. Key results:

- ✅ Successfully loaded and preprocessed 581,012 samples with 54 features
- ✅ Trained Random Forest models with 50, 100, and 200 trees
- ✅ Measured training times and accuracies for all configurations
- ✅ Exported baseline metrics for parallel comparison
- ✅ Saved predictions for correctness validation

**Next Steps:**
- Proceed to `parallel_implementation.ipynb` for parallel scaling experiments
- Use these baseline times to calculate speedup and efficiency
- Validate that parallel implementation produces identical predictions