In [70]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,count,mean,udf,sum,when
# Create (or reuse) a local Spark session that runs on every available core.
spark = SparkSession.builder \
    .appName("Typhoon Analyze") \
    .master("local[*]") \
    .getOrCreate()
# The RAPIDS Accelerator SQL switch is "spark.rapids.sql.enabled". The earlier
# spelling ("spark.rapids.sql.enable") is not a recognised key, so the setting
# was silently ignored. It only has an effect when the RAPIDS plugin is loaded.
spark.conf.set("spark.rapids.sql.enabled", "true")

# Load the per-observation track table. No schema is supplied, so every column
# is read as a string.
data = spark.read.option("header", True).csv("../design/result/track")
# Keep observations after 1990. "year" is a string column, so this comparison
# relies on Spark's implicit string-to-number cast.
data = data.filter(col("year") > 1990)

24/12/24 06:01:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [71]:
# Display the DataFrame repr to inspect the column names and types.
data

DataFrame[storm_id: string, grade: string, latitude: string, longitude: string, year: string, date: string, prev_latitude: string, prev_longitude: string, distance: string, time_diff: string, speed: string]

In [72]:
# Preview the first 10 track observations.
data.show(10)

+--------+-------------------+--------+---------+----+--------------------+-------------+--------------+------------------+---------+------------------+
|storm_id|              grade|latitude|longitude|year|                date|prev_latitude|prev_longitude|          distance|time_diff|             speed|
+--------+-------------------+--------+---------+----+--------------------+-------------+--------------+------------------+---------+------------------+
|       1|Tropical Depression|     8.8|    137.5|2000|2000-05-04T18:00:...|         NULL|          NULL|              NULL|     NULL|              NULL|
|       1|Tropical Depression|     9.7|    136.0|2000|2000-05-05T00:00:...|          8.8|         137.5|192.54726886145951|      6.0| 32.09121147690992|
|       1|Tropical Depression|     9.9|    135.0|2000|2000-05-05T06:00:...|          9.7|         136.0|111.91371569445359|      6.0|18.652285949075598|
|       1|Tropical Depression|    10.2|    134.4|2000|2000-05-05T12:00:...|       

In [73]:
# Total track length per storm: the sum of the per-observation "distance"
# values. `sum` here is the Spark column function imported at the top.
path_length = (
    data
    .groupBy("storm_id")
    .agg(sum("distance").alias("path_length"))
)
path_length.show(10)


+--------+------------------+
|storm_id|       path_length|
+--------+------------------+
|    1512| 6594.883858906432|
|     125| 5553.276690071122|
|    2110| 4935.734515735788|
|    1808| 4531.674801549938|
|     124|2195.8928753480845|
|    1903| 7184.293669084471|
|     307|  5685.09002049795|
|     613|  4964.94834152174|
|    1706| 4284.268123257509|
|     205| 4418.494488395747|
+--------+------------------+
only showing top 10 rows



In [74]:
# Mean of the per-observation "speed" values for each storm.
avg_speed = (
    data
    .groupBy("storm_id")
    .agg(mean("speed").alias("avg_speed"))
)
avg_speed.show(10)

+--------+------------------+
|storm_id|         avg_speed|
+--------+------------------+
|    1512|19.981565793310413|
|     125|18.147963039448108|
|    2110|26.536207073848313|
|    1808|  20.9505070487213|
|     124|16.635552085970332|
|    1903|37.418196193148276|
|     307| 22.55988103372203|
|     613| 21.19057979867417|
|    1706| 22.31389647529953|
|     205|19.303145245576346|
+--------+------------------+
only showing top 10 rows



In [75]:
from pyspark.sql.functions import variance

# Spread of each storm's latitude values. Spark documents `variance` as the
# sample variance (alias of `var_samp`).
lat_variance = (
    data
    .groupBy("storm_id")
    .agg(variance(col("latitude")).alias("lat_variance"))
)
lat_variance.show()

+--------+-------------------+
|storm_id|       lat_variance|
+--------+-------------------+
|    1512| 28.132807287093954|
|     125|   68.6150339366516|
|    2110|  30.25894153225808|
|    1808| 35.284024154589346|
|     124| 1.2469565217391305|
|    1903|  165.3475189393939|
|     307|   34.2275415282392|
|     613|  73.96095762711866|
|    1706|  9.340473484848495|
|     205|  92.83998226950366|
|    1104| 11.647642857142856|
|    1418|  85.02210196078437|
|    1008|0.11733333333333332|
|    1305| 3.1505263157894787|
|      15|  8.287125506072877|
|    1207|   45.5277207977208|
|    1304|  94.79811051693409|
|    1412|  42.95328042328047|
|    2016| 1.3995238095238143|
|     317|  40.93044534412955|
+--------+-------------------+
only showing top 20 rows



In [76]:
# Spread of each storm's longitude values. Spark documents `variance` as the
# sample variance (alias of `var_samp`).
lon_variance = (
    data
    .groupBy("storm_id")
    .agg(variance(col("longitude")).alias("lon_variance"))
)
lon_variance.show()

+--------+------------------+
|storm_id|      lon_variance|
+--------+------------------+
|    1512| 294.9700702370507|
|     125| 33.00138763197594|
|    2110|181.83415322580692|
|    1808|126.94785024154574|
|     124| 39.78166007905139|
|    1903| 374.0538257575751|
|     307|185.86012181616874|
|     613| 23.39654237288131|
|    1706|125.34695075757539|
|     205|19.989202127659667|
|    1104| 65.31107142857125|
|    1418|146.15734901960784|
|    1008|2.5728888888888886|
|    1305|20.647076023391747|
|      15| 22.53236167341435|
|    1207| 32.39387464387475|
|    1304|252.61761140819954|
|    1412| 6.807447089947085|
|    2016|23.529523809523784|
|     317| 33.94932523616738|
+--------+------------------+
only showing top 20 rows



In [77]:
from pyspark.sql.functions import covar_pop

# Population covariance between latitude and longitude for each storm.
# NOTE(review): this is the population estimator (covar_pop), while the
# variance features use Spark's `variance` -- confirm the mix is intended.
lat_lon_covariance = (
    data
    .groupBy("storm_id")
    .agg(covar_pop("latitude", "longitude").alias("lat_lon_covariance"))
)
lat_lon_covariance.show()

+--------+-------------------+
|storm_id| lat_lon_covariance|
+--------+-------------------+
|    1512| -82.67305795847763|
|     125| -18.20511834319524|
|    2110|  65.59490234375014|
|    1808| -63.95113894139881|
|     124|-6.4966351606805315|
|    1903|  234.7015518824607|
|     307| -77.54066522444573|
|     613|  7.836866666666517|
|    1706|-12.435133149678574|
|     205|-25.819947916666607|
|    1104|-22.042152777777744|
|    1418| 18.552199154171504|
|    1008|0.10059999999999629|
|    1305|-6.6450138504155065|
|      15| 1.8008086785010191|
|    1207|-29.126268861454157|
|    1304| 116.95759515570934|
|    1412|-15.009515306122454|
|    2016| -5.280444444444449|
|     317|  35.02112426035506|
+--------+-------------------+
only showing top 20 rows



In [78]:
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
from math import atan2, degrees

def combine_point(lati, lonti):
    """Build a [latitude, longitude] pair of floats from two raw values.

    Args:
        lati: Latitude as a number or numeric string, or None.
        lonti: Longitude as a number or numeric string, or None.

    Returns:
        A two-element list ``[float(lati), float(lonti)]``, or None when
        either coordinate is missing.
    """
    # A missing coordinate arrives as None; float(None) would raise TypeError,
    # so report "no point" instead.
    if lati is None or lonti is None:
        return None
    return [float(lati), float(lonti)]
from pyspark.sql.functions import sort_array, struct

# Spark UDF wrapper: (latitude, longitude) -> array<double>.
combine_point_udf = udf(combine_point, ArrayType(DoubleType()))

# Collect each storm's points into one list. collect_list gives no ordering
# guarantee after the groupBy shuffle, and calculate_direction reads the first
# and last element of the list. Each point is therefore paired with its "date"
# and the collected structs are sorted (by date first) before the points are
# pulled back out. "date" is assumed to sort chronologically as text.
typhoon_tracks = (
    data
    .withColumn("points", combine_point_udf(col("latitude"), col("longitude")))
    # Mirror collect_list's null skipping now that points travel inside a struct.
    .filter(col("points").isNotNull())
    .groupBy("storm_id")
    .agg(
        sort_array(collect_list(struct(col("date"), col("points")))).alias("ordered_track")
    )
    # Selecting a field of an array<struct> yields the array of that field.
    .withColumn("points", col("ordered_track.points"))
    .drop("ordered_track")
)

def calculate_direction(points):
    """Return the overall heading of a track in degrees.

    Only the first and last points are used; intermediate points are ignored.
    The angle is ``atan2(delta_longitude, delta_latitude)`` converted to
    degrees, so 0 means increasing latitude and 90 means increasing longitude.

    Args:
        points: Sequence of ``[latitude, longitude]`` pairs.

    Returns:
        The angle in degrees, or None when fewer than two points are given.
    """
    if len(points) >= 2:
        (start_lat, start_lon), (end_lat, end_lon) = points[0], points[-1]
        delta_lon = end_lon - start_lon
        delta_lat = end_lat - start_lat
        return degrees(atan2(delta_lon, delta_lat))
    return None

# Spark UDF wrapper around calculate_direction; the result is a nullable double.
calculate_direction_udf = udf(calculate_direction, DoubleType())

# Add the overall heading of every storm as a "direction" column.
typhoon_tracks = typhoon_tracks.withColumn(
    "direction",
    calculate_direction_udf(col("points")),
)
typhoon_tracks.show(10)

+--------+--------------------+-------------------+
|storm_id|              points|          direction|
+--------+--------------------+-------------------+
|       1|[[8.8, 137.5], [9...|  52.84646219839816|
|      10|[[11.5, 137.0], [...|-22.173213349259573|
|    1001|[[8.4, 141.7], [8...| -45.90938044919913|
|    1002|[[13.9, 132.6], [...| -75.57366363534628|
|    1003|[[15.8, 123.1], [...| -65.11551647323572|
|    1004|[[20.7, 124.2], [...| 49.944866292096826|
|    1005|[[16.4, 115.1], [...| -74.74488129694222|
|    1006|[[16.5, 118.0], [...| -48.57633437499734|
|    1007|[[18.4, 139.0], [...|  48.13246519856804|
|    1008|[[25.1, 124.3], [...| -88.72696997994332|
+--------+--------------------+-------------------+
only showing top 10 rows



In [79]:
# Features: merge all per-storm tables into one row per storm_id.
# join() without `how` is an inner join, so a storm must appear in every table.
combined_features = (
    path_length
    .join(avg_speed, on="storm_id")
    .join(lat_variance, on="storm_id")
    .join(lon_variance, on="storm_id")
    .join(lat_lon_covariance, on="storm_id")
    .join(typhoon_tracks, on="storm_id")
)
combined_features.show(10)

                                                                                

+--------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+
|storm_id|       path_length|         avg_speed|      lat_variance|      lon_variance| lat_lon_covariance|              points|         direction|
+--------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+
|    1512| 6594.883858906432|19.981565793310413|28.132807287093954| 294.9700702370507| -82.67305795847763|[[12.6, 180.7], [...|-67.30795395694041|
|     125| 5553.276690071122|18.147963039448108|  68.6150339366516| 33.00138763197594| -18.20511834319524|[[4.9, 162.3], [5...| 12.84067901209276|
|    2110| 4935.734515735788|26.536207073848313| 30.25894153225808|181.83415322580692|  65.59490234375014|[[23.8, 124.5], [...| 73.34770026964586|
|    1808| 4531.674801549938|  20.9505070487213|35.284024154589346|126.94785024154574| -63.95113894139881|[[10.1, 147.

In [80]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

# Feature columns used for clustering.
feature_columns = ["path_length", "avg_speed", "lat_variance", "lon_variance", "lat_lon_covariance","direction"]

# Pack the feature columns into a single vector column.
# handleInvalid="skip" drops storms whose features contain null/NaN -- e.g.
# calculate_direction returns None for a track with fewer than two points.
# The default ("error") aborts the whole job on the first such row.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
    handleInvalid="skip",
)
feature_vector = assembler.transform(combined_features)


In [81]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import StandardScaler

# Standardize the feature vectors.
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(feature_vector)
# Cache the scaled frame: every KMeans fit and every evaluation below scans it,
# and without caching each scan replays the whole upstream lineage (CSV read,
# joins, Python UDFs).
scaled_data = scaler_model.transform(feature_vector).cache()

# Use the silhouette score to choose the number of clusters.
evaluator = ClusteringEvaluator(featuresCol="scaled_features", metricName="silhouette", distanceMeasure="squaredEuclidean")

# (k, silhouette) for every candidate k in 2..9.
silhouette_scores = []
for k in range(2, 10):
    kmeans = KMeans(k=k, featuresCol="scaled_features", seed=1)
    model = kmeans.fit(scaled_data)
    predictions = model.transform(scaled_data)
    silhouette = evaluator.evaluate(predictions)
    silhouette_scores.append((k, silhouette))

# Pick the k with the highest silhouette (the smallest k wins a tie, because
# max() returns the first maximum).
best_k = max(silhouette_scores, key=lambda x: x[1])[0]
print(f"Best k: {best_k}")

# Final clustering with the best k.
kmeans = KMeans(k=best_k, featuresCol="scaled_features", seed=1)
model = kmeans.fit(scaled_data)
clusters = model.transform(scaled_data)
clusters.show(10)

                                                                                

Best k: 2
+--------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+--------------------+--------------------+----------+
|storm_id|       path_length|         avg_speed|      lat_variance|      lon_variance| lat_lon_covariance|              points|         direction|            features|     scaled_features|prediction|
+--------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+--------------------+--------------------+----------+
|    1512| 6594.883858906432|19.981565793310413|28.132807287093954| 294.9700702370507| -82.67305795847763|[[12.6, 180.7], [...|-67.30795395694041|[6594.88385890643...|[2.94049290293191...|         1|
|     125| 5553.276690071122|18.147963039448108|  68.6150339366516| 33.00138763197594| -18.20511834319524|[[4.9, 162.3], [5...| 12.84067901209276|[5553.27669007112...|[2.47606645462276...|  

In [99]:
# Fit k=2 explicitly, in the same way as clusters_3 and clusters_4. Aliasing
# `clusters` only gave a two-cluster result when the silhouette search happened
# to select best_k == 2.
kmeans = KMeans(k=2, featuresCol="scaled_features", seed=1)
model = kmeans.fit(scaled_data)
clusters_2 = model.transform(scaled_data)

In [100]:
# Cluster the standardized features into three groups (fixed seed).
kmeans = KMeans(
    featuresCol="scaled_features",
    k=3,
    seed=1,
)
model = kmeans.fit(scaled_data)
clusters_3 = model.transform(scaled_data)

                                                                                

In [101]:
# Cluster the standardized features into four groups (fixed seed).
kmeans = KMeans(
    featuresCol="scaled_features",
    k=4,
    seed=1,
)
model = kmeans.fit(scaled_data)
clusters_4 = model.transform(scaled_data)

In [102]:
def points_to_string(points):
    """Render a track as text of the form "(lat,lon),(lat,lon),...".

    Args:
        points: Iterable of two-element (latitude, longitude) pairs.

    Returns:
        The pairs formatted as "(lat,lon)" and joined by commas; an empty
        string for an empty track.
    """
    pieces = []
    for lat, lon in points:
        pieces.append(f"({lat},{lon})")
    return ','.join(pieces)

# Wrap the formatter as a Spark UDF (no return type given, so it yields strings).
points_to_string_udf = udf(points_to_string)

# Replace the nested "points" array with its text form in every clustering
# result. These are self-reassignments, so the cell is not idempotent:
# run it once per kernel session.
clusters_2 = clusters_2.withColumn(
    "points", points_to_string_udf(col("points"))
)
clusters_3 = clusters_3.withColumn(
    "points", points_to_string_udf(col("points"))
)
clusters_4 = clusters_4.withColumn(
    "points", points_to_string_udf(col("points"))
)
clusters_2.show(5)

+--------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+--------------------+--------------------+----------+
|storm_id|       path_length|         avg_speed|      lat_variance|      lon_variance| lat_lon_covariance|              points|         direction|            features|     scaled_features|prediction|
+--------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+--------------------+--------------------+----------+
|    1512| 6594.883858906432|19.981565793310413|28.132807287093954| 294.9700702370507| -82.67305795847763|(12.6,180.7),(13....|-67.30795395694041|[6594.88385890643...|[2.94049290293191...|         1|
|     125| 5553.276690071122|18.147963039448108|  68.6150339366516| 33.00138763197594| -18.20511834319524|(4.9,162.3),(5.0,...| 12.84067901209276|[5553.27669007112...|[2.47606645462276...|         1|


In [130]:
# Convert the "points" array to its text form for the CSV export.
# This reassigns combined_features, so run the cell once per kernel session.
combined_features = combined_features.withColumn(
    "points", points_to_string_udf(col("points"))
)
# coalesce(1) writes from a single partition; an existing output directory is
# overwritten.
(
    combined_features
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("result/clusters/features")
)

                                                                                

In [105]:
# Export each clustering result as CSV with a header row. The two vector
# columns are dropped first, and coalesce(1) writes from a single partition.
(
    clusters_2
    .drop("features", "scaled_features")
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("result/clusters/cluster2")
)
(
    clusters_3
    .drop("features", "scaled_features")
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("result/clusters/cluster3")
)
(
    clusters_4
    .drop("features", "scaled_features")
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("result/clusters/cluster4")
)


                                                                                

In [None]:
import folium
import matplotlib.pyplot as plt

def show_cluster(clusters):
    """Draw every storm track on a folium map, coloured by cluster label.

    Args:
        clusters: Spark DataFrame with a "prediction" column (cluster label)
            and a "points" column holding the text written by
            points_to_string, i.e. "(lat,lon),(lat,lon),...".

    Returns:
        The folium.Map with one polyline per storm track. Labels 0, 1 and 2
        are drawn red, blue and yellow; any other label is drawn green.
        Tracks with fewer than two points are skipped because they cannot
        form a line.
    """
    import ast

    cluster_colors = {0: 'red', 1: 'blue', 2: 'yellow'}

    # Bring the clustering result to the driver as a pandas DataFrame.
    clusters_pd = clusters.select("prediction", "points").toPandas()
    # Create the map object.
    m = folium.Map(location=[20, 130], zoom_start=3)

    # Add the tracks of each cluster.
    for cluster in clusters_pd['prediction'].unique():
        color = cluster_colors.get(int(cluster), 'green')
        cluster_points = clusters_pd[clusters_pd['prediction'] == cluster]['points']
        for points in cluster_points:
            # literal_eval only accepts Python literals, unlike eval, which
            # runs arbitrary code. Wrapping the text in brackets always gives
            # a list of (lat, lon) tuples, even for a single-point track,
            # which on its own would parse as one flat tuple.
            track = ast.literal_eval(f"[{points}]")
            if len(track) < 2:
                continue
            folium.PolyLine(track, color=color, weight=0.2).add_to(m)
    # Return the map so a notebook cell can display it.
    return m

# Draw the tracks in clusters_2; as the cell's last expression, the returned map is displayed.
show_cluster(clusters_2)

In [128]:
# Draw the tracks in clusters_4; as the cell's last expression, the returned map is displayed.
show_cluster(clusters_4)