In [1]:
# === ALL-IN-ONE FINAL CODE BLOCK FOR VS CODE ===

# 1. IMPORTS
import os
import sys
from pyspark import SparkConf, SparkSession, SparkFiles
from pyspark.sql.functions import monotonically_increasing_id, col
from pyspark.ml.recommendation import ALS

print("--- Starting Spark Setup ---")

# Set up required environment variables for PySpark
# This helps ensure Python and Java can communicate properly
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

spark = None
try:
    # 2. SPARK SESSION CONFIGURATION
    conf = SparkConf()
    conf.setMaster("local[*]").setAppName("FinalAttemptApp")
    conf.set("spark.executor.memory", "4g")
    
    # This configuration is the key to avoiding the native library error on Windows
    conf.set("spark.hadoop.io.native.lib.available", "false")

    # Build the SparkSession, stopping any existing one first.
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    print(f"SparkSession created successfully. Version: {spark.version}")


    # 3. DATA LOADING
    print("--- Loading Data ---")
    GITHUB_URL = "https://raw.githubusercontent.com/farhodibr/datasets/heads/main/books_recommender/"
    
    # Download files to Spark context
    spark.sparkContext.addFile(f"{GITHUB_URL}/Ratings.csv")
    spark.sparkContext.addFile(f"{GITHUB_URL}/Books.csv")
    spark.sparkContext.addFile(f"{GITHUB_URL}/Users.csv")
    
    # Read the data into DataFrames
    ratings_df = spark.read.csv(SparkFiles.get("Ratings.csv"), header=True, inferSchema=True)
    books_df = spark.read.csv(SparkFiles.get("Books.csv"), header=True, inferSchema=True)
    users_df = spark.read.csv(SparkFiles.get("Users.csv"), header=True, inferSchema=True)
    print(f"Ratings DataFrame loaded with {ratings_df.count()} rows.")

    
    # 4. DATA PREPARATION
    print("--- Preparing Data ---")
    users = users_df.select("User-ID").distinct().coalesce(1).withColumn("userIntId", monotonically_increasing_id())
    books = ratings_df.select("ISBN").distinct().coalesce(1).withColumn("bookIntId", monotonically_increasing_id())
    
    ratings_with_ids = ratings_df.join(users, "User-ID", "left").join(books, "ISBN", "left")
    
    ratings = ratings_with_ids.select(
        col("userIntId").alias("userId"),
        col("bookIntId").alias("bookId"),
        col("Book-Rating").alias("rating")
    ).na.drop() # Drop any rows where joins might have failed
    
    print(f"Final ratings data prepared with {ratings.count()} rows.")


    # 5. MODEL TRAINING
    print("--- Training Model ---")
    # Create a fresh, local training/test split for this experiment
    (training_data, test_data) = ratings.randomSplit([0.7, 0.3], seed=42)
    
    # Cache the training data in memory for the iterative algorithm
    training_data.cache()
    print(f"Training data created with {training_data.count()} rows.")

    # Set the ALS hyperparameters
    als = ALS(userCol="userId", itemCol="bookId", ratingCol="rating", rank=10, maxIter=15, regParam=0.1,
              coldStartStrategy="drop", nonnegative=True)

    # Fit the model
    model = als.fit(training_data)
    print("✅✅✅ Model fitting complete! ✅✅✅")


finally:
    # 6. SHUTDOWN
    if spark:
        print("--- Stopping Spark Session ---")
        spark.stop()

ImportError: cannot import name 'SparkSession' from 'pyspark' (c:\CUNY_MSDS\DATA612\PROJECT5\.venv\Lib\site-packages\pyspark\__init__.py)