In [134]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,count,mean,udf,sum,when
import pandas as pd

# Create (or reuse) a local Spark session that runs on every available core.
spark = (
    SparkSession.builder
    .appName("Typhoon Analyze")
    .master("local[*]")
    .getOrCreate()
)
# The RAPIDS Accelerator on/off switch is spelled "spark.rapids.sql.enabled";
# the previously used key ("spark.rapids.sql.enable") is not a recognised
# setting, so it was silently ignored.
# NOTE(review): this flag only has an effect when the RAPIDS plugin is loaded
# into the session — confirm against the deployment.
spark.conf.set("spark.rapids.sql.enabled", "true")


# Load the yearly grade-trend and intensity-trend results. The header row
# supplies the column names; no schema is inferred here, so numeric columns
# have to be cast before use.
df_grade = spark.read.option("header", True).csv(r"../design/result/grade_trend/part-00000-7dcf4de8-72b3-42bb-bf65-4d2d581f866e-c000.csv")
df_intensity = spark.read.option("header", True).csv(r"../design/result/intensity_trend/part-00000-230f148b-8c77-4f42-bc0c-4d0236d48799-c000.csv")



In [135]:
from pyspark.sql.types import IntegerType
# Convert the CSV columns to numeric types (year -> int, both averages -> double)
# and keep only the records from 1977 onwards.
df_intensity = (
    df_intensity
    .withColumn("year", col("year").cast(IntegerType()))
    .withColumn("avg_central_pressure", col("avg_central_pressure").cast("double"))
    .withColumn("avg_wind_speed", col("avg_wind_speed").cast("double"))
    .filter(col("year") >= 1977)
)
df_intensity.show()

+----+--------------------+------------------+
|year|avg_central_pressure|    avg_wind_speed|
+----+--------------------+------------------+
|1977|   986.1454311454312|31.975546975546976|
|1978|   986.0371747211896| 34.02416356877323|
|1979|   983.2565997888067| 35.45406546990496|
|1980|    984.497641509434|  36.6627358490566|
|1981|   985.9777777777778|  30.7979797979798|
|1982|   981.8138195777351|41.602687140115165|
|1983|   985.1509433962265|34.782293178519595|
|1984|   985.1949339207049|35.401982378854626|
|1985|   989.1456815816857| 32.43496357960458|
|1986|   985.3704891740176| 34.86367281475541|
|1987|   978.7153679653679| 43.52272727272727|
|1988|     988.74715261959|  32.2380410022779|
|1989|   983.5480116391852| 37.59456838021339|
|1990|   981.2694198623402| 40.08849557522124|
|1991|   978.0804416403786| 43.71845425867508|
|1992|   981.2928679817906| 39.55993930197268|
|1993|   987.3385321100917| 33.41284403669725|
|1994|   983.7460857726345| 38.35602450646699|
|1995|   987.

In [136]:
# Wind-speed series: remove the pressure column and discard the years that
# have no average wind-speed value.
df_intensiy_wind = (
    df_intensity
    .drop('avg_central_pressure')
    .where(col("avg_wind_speed").isNotNull())
)
df_intensiy_wind.show(10)


+----+------------------+
|year|    avg_wind_speed|
+----+------------------+
|1977|31.975546975546976|
|1978| 34.02416356877323|
|1979| 35.45406546990496|
|1980|  36.6627358490566|
|1981|  30.7979797979798|
|1982|41.602687140115165|
|1983|34.782293178519595|
|1984|35.401982378854626|
|1985| 32.43496357960458|
|1986| 34.86367281475541|
+----+------------------+
only showing top 10 rows



In [137]:
# Central-pressure series: keep only the year and the average central pressure.
# Years with a missing pressure value are discarded, in the same way the
# wind-speed frame discards missing wind speeds, so the regression label
# column never contains nulls.
df_intensity_pressure = (
    df_intensity
    .filter(df_intensity.avg_central_pressure.isNotNull())
    .drop('avg_wind_speed')
)
df_intensity_pressure.show(10)
# Bare expression: renders the DataFrame schema as the cell output.
df_intensity_pressure

+----+--------------------+
|year|avg_central_pressure|
+----+--------------------+
|1977|   986.1454311454312|
|1978|   986.0371747211896|
|1979|   983.2565997888067|
|1980|    984.497641509434|
|1981|   985.9777777777778|
|1982|   981.8138195777351|
|1983|   985.1509433962265|
|1984|   985.1949339207049|
|1985|   989.1456815816857|
|1986|   985.3704891740176|
+----+--------------------+
only showing top 10 rows



DataFrame[year: int, avg_central_pressure: double]

In [138]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator


# Regression model for typhoon intensity (central pressure).
# The year is the only input; the average central pressure is the target.

# Pack the "year" column into the feature vector expected by the estimator.
assembler = VectorAssembler(outputCol="features", inputCols=["year"])
df_intensity_features = assembler.transform(df_intensity_pressure)

# Split into training (80%) and test (20%) sets; the fixed seed keeps the
# split repeatable between runs.
pressure_train, pressure_test = df_intensity_features.randomSplit(
    weights=[0.8, 0.2],
    seed=1234,
)

# Train a regularised linear regression on the training set.
pressure_model = LinearRegression(
    labelCol="avg_central_pressure",
    featuresCol="features",
    regParam=0.1,
)
lr_model_central_pressure = pressure_model.fit(pressure_train)

# Score the held-out test set.
pressure_prediction = lr_model_central_pressure.transform(pressure_test)

# Report the root-mean-square error between prediction and label.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="avg_central_pressure",
    predictionCol="prediction",
)
rmse_central_pressure = evaluator.evaluate(pressure_prediction)

print(f"RMSE for central pressure prediction: {rmse_central_pressure}")




RMSE for central pressure prediction: 3.473577220150774


In [139]:
# Forecast: apply the fitted pressure model to the years 2015 through 2032.
# zip() over a single range yields one-element tuples, i.e. one row per year.
future_years = spark.createDataFrame(list(zip(range(2015, 2033))), schema=["year"])
future_years_features = assembler.transform(future_years)
pressure_predictions = lr_model_central_pressure.transform(future_years_features)
pressure_predictions.show()

+----+--------+-----------------+
|year|features|       prediction|
+----+--------+-----------------+
|2015|[2015.0]|983.6482701040463|
|2016|[2016.0]|983.6388986396908|
|2017|[2017.0]|983.6295271753352|
|2018|[2018.0]|983.6201557109796|
|2019|[2019.0]| 983.610784246624|
|2020|[2020.0]|983.6014127822684|
|2021|[2021.0]| 983.592041317913|
|2022|[2022.0]|983.5826698535574|
|2023|[2023.0]|983.5732983892018|
|2024|[2024.0]|983.5639269248462|
|2025|[2025.0]|983.5545554604906|
|2026|[2026.0]| 983.545183996135|
|2027|[2027.0]|983.5358125317795|
|2028|[2028.0]|983.5264410674239|
|2029|[2029.0]|983.5170696030683|
|2030|[2030.0]|983.5076981387128|
|2031|[2031.0]|983.4983266743573|
|2032|[2032.0]|983.4889552100017|
+----+--------+-----------------+



In [140]:
# Pack the "year" column into the feature vector used by the wind-speed model.
assembler_wind = VectorAssembler(outputCol="features", inputCols=["year"])
df_intensiy_wind_features = assembler_wind.transform(df_intensiy_wind)

# Split into training (80%) and test (20%) sets with a fixed seed.
wind_train, wind_test = df_intensiy_wind_features.randomSplit(
    weights=[0.8, 0.2],
    seed=1234,
)

# Train a regularised linear regression on the training set.
wind_model = LinearRegression(
    labelCol="avg_wind_speed",
    featuresCol="features",
    regParam=0.1,
)
lr_model_wind_speed = wind_model.fit(wind_train)

# Score the held-out test set.
wind_prediction = lr_model_wind_speed.transform(wind_test)

# Report the root-mean-square error between prediction and label.
evaluator_wind = RegressionEvaluator(
    metricName="rmse",
    labelCol="avg_wind_speed",
    predictionCol="prediction",
)
rmse_wind_speed = evaluator_wind.evaluate(wind_prediction)

print(f"RMSE for wind speed prediction: {rmse_wind_speed}")

RMSE for wind speed prediction: 3.987021392568346


In [141]:

# Forecast: apply the fitted wind-speed model to the years 2015 through 2032.
# zip() over a single range yields one-element tuples, i.e. one row per year.
future_years = spark.createDataFrame(list(zip(range(2015, 2033))), schema=["year"])
future_years_features = assembler_wind.transform(future_years)
wind_predictions = lr_model_wind_speed.transform(future_years_features)
wind_predictions.show()

+----+--------+------------------+
|year|features|        prediction|
+----+--------+------------------+
|2015|[2015.0]|  37.5224634705423|
|2016|[2016.0]|  37.5457055616171|
|2017|[2017.0]|  37.5689476526919|
|2018|[2018.0]| 37.59218974376669|
|2019|[2019.0]| 37.61543183484149|
|2020|[2020.0]| 37.63867392591629|
|2021|[2021.0]| 37.66191601699108|
|2022|[2022.0]| 37.68515810806588|
|2023|[2023.0]| 37.70840019914068|
|2024|[2024.0]| 37.73164229021547|
|2025|[2025.0]| 37.75488438129027|
|2026|[2026.0]| 37.77812647236507|
|2027|[2027.0]|37.801368563439866|
|2028|[2028.0]| 37.82461065451466|
|2029|[2029.0]|37.847852745589456|
|2030|[2030.0]|37.871094836664255|
|2031|[2031.0]| 37.89433692773905|
|2032|[2032.0]|37.917579018813846|
+----+--------+------------------+



In [142]:
# Merge the two forecasts into one table keyed by year.
# Each side keeps only the year and its prediction, renamed so the two
# prediction columns stay distinguishable after the join.
pressure_forecast = pressure_predictions.select(
    "year",
    pressure_predictions["prediction"].alias("predicted_pressure"),
)
wind_forecast = wind_predictions.select(
    "year",
    wind_predictions["prediction"].alias("predicted_wind_speed"),
)
combined_predictions = pressure_forecast.join(wind_forecast, on="year", how="inner")

combined_predictions.show()



+----+------------------+--------------------+
|year|predicted_pressure|predicted_wind_speed|
+----+------------------+--------------------+
|2015| 983.6482701040463|    37.5224634705423|
|2016| 983.6388986396908|    37.5457055616171|
|2017| 983.6295271753352|    37.5689476526919|
|2018| 983.6201557109796|   37.59218974376669|
|2019|  983.610784246624|   37.61543183484149|
|2020| 983.6014127822684|   37.63867392591629|
|2021|  983.592041317913|   37.66191601699108|
|2022| 983.5826698535574|   37.68515810806588|
|2023| 983.5732983892018|   37.70840019914068|
|2024| 983.5639269248462|   37.73164229021547|
|2025| 983.5545554604906|   37.75488438129027|
|2026|  983.545183996135|   37.77812647236507|
|2027| 983.5358125317795|  37.801368563439866|
|2028| 983.5264410674239|   37.82461065451466|
|2029| 983.5170696030683|  37.847852745589456|
|2030| 983.5076981387128|  37.871094836664255|
|2031| 983.4983266743573|   37.89433692773905|
|2032| 983.4889552100017|  37.917579018813846|
+----+-------

                                                                                

In [143]:
# Save the combined forecast as CSV with a header row. coalesce(1) collapses
# the data to one partition before writing, and "overwrite" replaces any
# earlier output at the target path.
combined_predictions.coalesce(1).write.csv(
    "result/intensity_prediction",
    mode="overwrite",
    header=True,
)

                                                                                