In [8]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def create_spark_session(app_name):
    """Build a SparkSession named ``app_name``, or return the existing one.

    Args:
        app_name: Application name passed to the session builder.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    named_builder = SparkSession.builder.appName(app_name)
    return named_builder.getOrCreate()

def setup_environment(spark, environment):
    """Pick the input and output locations for the given runtime environment.

    Args:
        spark: Active SparkSession. For 'aks' it receives the Azure storage
            settings; for an unrecognised environment it is stopped.
        environment: Either 'local' or 'aks'.

    Returns:
        A ``(input_path, output_path)`` tuple of URI strings.

    Raises:
        ValueError: If ``environment`` is neither 'local' nor 'aks'. The
            session is stopped before the error is raised.
    """
    if environment == 'local':
        return "file:///path/to/local/data.csv", "file:///path/to/local/output"

    if environment != 'aks':
        # Shut the session down before reporting the bad argument.
        spark.stop()
        raise ValueError("Unsupported environment. Please use 'local' or 'aks'")

    # Azure Blob Storage settings applied to the session configuration.
    azure_settings = (
        ("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem"),
        ("fs.azure.account.key.yourstorageaccount.blob.core.windows.net", "your_key"),
    )
    for conf_key, conf_value in azure_settings:
        spark.conf.set(conf_key, conf_value)

    return (
        "wasbs://container@yourstorageaccount.blob.core.windows.net/data.csv",
        "wasbs://container@yourstorageaccount.blob.core.windows.net/output",
    )

def read_data(spark, input_path):
    """Read the CSV at ``input_path`` with the "header" option set to "true".

    Args:
        spark: SparkSession used to build the reader.
        input_path: Location of the CSV data.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    header_aware_reader = spark.read.option("header", "true")
    return header_aware_reader.csv(input_path)

def preprocess_data(data):
    """Preprocess the data by handling missing values.

    The current example implementation only calls ``data.na.drop()`` to
    discard rows that contain null values.

    Args:
        data: Input DataFrame.

    Returns:
        The DataFrame returned by ``data.na.drop()``.
    """
    return data.na.drop()

def perform_eda1(data):
    """EDA task 1: keep the rows whose "ColumnName" value is null.

    Prints the query plan with ``explain()`` and returns the filtered
    DataFrame.
    """
    is_missing = col("ColumnName").isNull()
    null_rows = data.filter(is_missing)
    null_rows.explain()
    return null_rows

def perform_eda2(data):
    """EDA task 2: count rows per distinct "AnotherColumn" value.

    Prints the query plan with ``explain()`` and returns the counts
    DataFrame.
    """
    grouped = data.groupBy("AnotherColumn")
    counts = grouped.count()
    counts.explain()
    return counts

def perform_eda3(data):
    """EDA task 3: list the distinct values of "Column1".

    Placeholder implementation; adapt it to the actual analysis need.
    Prints the query plan with ``explain()`` and returns the DataFrame.
    """
    column_only = data.select("Column1")
    unique_values = column_only.distinct()
    unique_values.explain()
    return unique_values

def perform_eda4(data):
    """EDA task 4: compute the maximum of "Column2".

    Placeholder implementation; adapt it to the actual analysis need.
    Prints the query plan with ``explain()`` and returns the DataFrame.
    """
    aggregation_spec = {"Column2": "max"}
    maximum = data.agg(aggregation_spec)
    maximum.explain()
    return maximum

def export_results(result, output_path):
    """Write ``result`` to ``output_path`` as Parquet in "overwrite" mode.

    Args:
        result: DataFrame to export.
        output_path: Destination location for the Parquet output.
    """
    parquet_writer = result.write.format("parquet")
    overwrite_writer = parquet_writer.mode("overwrite")
    overwrite_writer.save(output_path)

def main(environment):
    """Run every pipeline step for the given environment.

    Creates the Spark session, resolves the input/output paths, reads and
    preprocesses the data, runs the four EDA tasks and exports the first
    task's result.

    Args:
        environment: Runtime environment name, forwarded to
            ``setup_environment`` ('local' or 'aks').

    Raises:
        ValueError: Propagated from ``setup_environment`` when the
            environment is not supported.
    """
    spark = create_spark_session("Olympic Data Analysis")
    try:
        input_path, output_path = setup_environment(spark, environment)
        data = read_data(spark, input_path)
        processed_data = preprocess_data(data)
        # NOTE(review): preprocess_data drops rows containing nulls, so the
        # null filter in perform_eda1 looks like it will always yield an empty
        # result here -- confirm whether it should run on the raw data instead.
        result1 = perform_eda1(processed_data)
        # The remaining tasks are run only for their explain() output; their
        # results are not exported in this example.
        perform_eda2(processed_data)
        perform_eda3(processed_data)
        perform_eda4(processed_data)
        export_results(result1, output_path)  # example: export only the first result
    finally:
        # Always release the session, even when a step fails (for example a
        # missing input path raising AnalysisException).
        spark.stop()

if __name__ == "__main__":
    # The first command-line argument selects the environment; without one,
    # fall back to 'local'.
    environment = sys.argv[1] if sys.argv[1:] else 'local'
    main(environment)


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/path/to/local/data.csv.