In [80]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,count,mean,udf,sum,when
# Build (or reuse) a local Spark session that runs on every available core.
spark = (
    SparkSession.builder
    .appName("Typhoon Analyze")
    .master("local[*]")
    .getOrCreate()
)
# The RAPIDS Accelerator SQL switch is spelled `spark.rapids.sql.enabled`;
# the earlier `spark.rapids.sql.enable` spelling is not a documented key.
# NOTE(review): this presumably only matters when the RAPIDS plugin is on the
# classpath and enabled via `spark.plugins` — confirm for this environment.
spark.conf.set("spark.rapids.sql.enabled", "true")



In [81]:
# Load the typhoon track records and the per-storm info table.
# Both files are read with their first row treated as the header.
df = spark.read.csv("../typhoon_data.csv", header=True)
info = spark.read.csv("../typhoon_info.csv", header=True)

# Print the column names and types of the track data.
df.printSchema()



root
 |-- _c0: string (nullable = true)
 |-- International number ID: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- Latitude of the center: string (nullable = true)
 |-- Longitude of the center: string (nullable = true)
 |-- Central pressure: string (nullable = true)
 |-- Maximum sustained wind speed: string (nullable = true)
 |-- Direction of the longest radius of 50kt winds or greater: string (nullable = true)
 |-- The longeast radius of 50kt winds or greater: string (nullable = true)
 |-- The shortest radius of 50kt winds or greater: string (nullable = true)
 |-- Direction of the longest radius of 30kt winds or greater: string (nullable = true)
 |-- The longeast radius of 30kt winds or greater: string (nullable = true)
 |-- The shortest radius of 30kt winds or greater: string (nullable = true)
 |-- Indicator of landfall 

In [82]:
# Print summary statistics for the track data (show() defaults spelled out).
df.describe().show(n=20, truncate=True)

24/12/23 21:05:16 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , International number ID, year, month, day, hour, grade, Latitude of the center, Longitude of the center, Central pressure, Maximum sustained wind speed, Direction of the longest radius of 50kt winds or greater, The longeast radius of 50kt winds or greater, The shortest radius of 50kt winds or greater, Direction of the longest radius of 30kt winds or greater, The longeast radius of 30kt winds or greater, The shortest radius of 30kt winds or greater, Indicator of landfall or passage
 Schema: _c0, International number ID, year, month, day, hour, grade, Latitude of the center, Longitude of the center, Central pressure, Maximum sustained wind speed, Direction of the longest radius of 50kt winds or greater, The longeast radius of 50kt winds or greater, The shortest radius of 50kt winds or greater, Direction of the longest radius of 30kt winds or greater, The longeast radius of 30kt winds or greater

+-------+------------------+-----------------------+------------------+-----------------+------------------+-----------------+--------------------+----------------------+-----------------------+------------------+----------------------------+--------------------------------------------------------+--------------------------------------------+--------------------------------------------+--------------------------------------------------------+--------------------------------------------+--------------------------------------------+--------------------------------+
|summary|               _c0|International number ID|              year|            month|               day|             hour|               grade|Latitude of the center|Longitude of the center|  Central pressure|Maximum sustained wind speed|Direction of the longest radius of 50kt winds or greater|The longeast radius of 50kt winds or greater|The shortest radius of 50kt winds or greater|Direction of the longest radius of 30kt w

                                                                                

In [83]:
# Preview the first 10 rows of the track data.
df.show(n=10)

+---+-----------------------+----+-----+---+----+--------------------+----------------------+-----------------------+----------------+----------------------------+--------------------------------------------------------+--------------------------------------------+--------------------------------------------+--------------------------------------------------------+--------------------------------------------+--------------------------------------------+--------------------------------+
|_c0|International number ID|year|month|day|hour|               grade|Latitude of the center|Longitude of the center|Central pressure|Maximum sustained wind speed|Direction of the longest radius of 50kt winds or greater|The longeast radius of 50kt winds or greater|The shortest radius of 50kt winds or greater|Direction of the longest radius of 30kt winds or greater|The longeast radius of 30kt winds or greater|The shortest radius of 30kt winds or greater|Indicator of landfall or passage|
+---+-------------

24/12/23 21:05:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , International number ID, year, month, day, hour, grade, Latitude of the center, Longitude of the center, Central pressure, Maximum sustained wind speed, Direction of the longest radius of 50kt winds or greater, The longeast radius of 50kt winds or greater, The shortest radius of 50kt winds or greater, Direction of the longest radius of 30kt winds or greater, The longeast radius of 30kt winds or greater, The shortest radius of 30kt winds or greater, Indicator of landfall or passage
 Schema: _c0, International number ID, year, month, day, hour, grade, Latitude of the center, Longitude of the center, Central pressure, Maximum sustained wind speed, Direction of the longest radius of 50kt winds or greater, The longeast radius of 50kt winds or greater, The shortest radius of 50kt winds or greater, Direction of the longest radius of 30kt winds or greater, The longeast radius of 30kt winds or greater

In [84]:
# Preview the first 10 rows of the per-storm info table.
info.show(n=10)

24/12/23 21:05:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , International number ID, Tropical cyclone number ID, Flag of the last data line, Difference between the time of the last data and the time of the final analysis, Name, Latest Revision
 Schema: _c0, International number ID, Tropical cyclone number ID, Flag of the last data line, Difference between the time of the last data and the time of the final analysis, Name, Latest Revision
Expected: _c0 but found: 
CSV file: file:///home/fangjiabin/spark/typhoon_info.csv


+---+-----------------------+--------------------------+--------------------------+-------------------------------------------------------------------------------+--------------------+---------------+
|_c0|International number ID|Tropical cyclone number ID|Flag of the last data line|Difference between the time of the last data and the time of the final analysis|                Name|Latest Revision|
+---+-----------------------+--------------------------+--------------------------+-------------------------------------------------------------------------------+--------------------+---------------+
|  0|                   5101|                      NULL|               Dissipation|                                                                              6|                    |     1990-10-17|
|  1|                   5102|                      NULL|               Dissipation|                                                                              6|             GEORGIA|     2013-03

In [85]:
# Recode the landfall/passage flag: " " becomes 0 and "#" becomes 1;
# any other value is carried through unchanged.
landfall_col = "Indicator of landfall or passage"
landfall_flag = (
    when(col(landfall_col) == " ", 0)
    .when(col(landfall_col) == "#", 1)
    .otherwise(col(landfall_col))
)
df = df.withColumn(landfall_col, landfall_flag)

# Count rows per flag value to check the recoding.
df.select(landfall_col).groupBy(landfall_col).count().show()


+--------------------------------+-----+
|Indicator of landfall or passage|count|
+--------------------------------+-----+
|                               0|68379|
|                               1|  245|
+--------------------------------+-----+



### used data:
1. 台风模式分析
所需数据列：
- International number ID：台风的国际编号
- year：年份
- month：月份
- day：日期
- hour：小时
- Latitude of the center：台风中心的纬度
- Longitude of the center：台风中心的经度
- grade：台风的等级
- Central pressure：台风中心的气压
- Maximum sustained wind speed：最大持续风速
2. 台风路径聚类
所需数据列：
- International number ID：台风的国际编号
- Latitude of the center：台风中心的纬度
- Longitude of the center：台风中心的经度
3. 强度变化预测
所需数据列：
- International number ID：台风的国际编号
- year：年份
- month：月份
- day：日期
- hour：小时
- grade：台风的等级
- Central pressure：台风中心的气压
- Maximum sustained wind speed：最大持续风速
4. 区域风险评估
所需数据列：
- International number ID：台风的国际编号
- year：年份
- month：月份
- day：日期
- hour：小时
- Latitude of the center：台风中心的纬度
- Longitude of the center：台风中心的经度
- grade：台风的等级
- Indicator of landfall or passage：台风是否登陆或经过陆地


In [86]:
# Column expressions shared by the four task-specific views below. They are
# defined once so the casts and scaling cannot drift apart between views.
storm_id_col = col("International number ID").cast("integer").alias("storm_id")
time_cols = [col(name).cast("integer") for name in ("year", "month", "day", "hour")]
# Cast to double explicitly instead of relying on implicit string-to-number
# coercion in the division.
# NOTE(review): raw coordinates appear to be stored in tenths of a degree,
# hence the division by 10 — confirm against the dataset documentation.
latitude_col = (col("Latitude of the center").cast("double") / 10).alias("latitude")
longitude_col = (col("Longitude of the center").cast("double") / 10).alias("longitude")
grade_col = col("grade")
pressure_col = col("Central pressure").cast("integer")
wind_speed_col = col("Maximum sustained wind speed").cast("integer")

# Columns needed for typhoon pattern analysis.
df_mode_analysis = df.select(
    storm_id_col,
    *time_cols,
    latitude_col,
    longitude_col,
    grade_col,
    pressure_col,
    wind_speed_col,
)

# Columns needed for typhoon track clustering.
df_path_clustering = df.select(storm_id_col, latitude_col, longitude_col)

# Columns needed for intensity-change prediction.
df_intensity_prediction = df.select(
    storm_id_col,
    *time_cols,
    grade_col,
    pressure_col,
    wind_speed_col,
)

# Columns needed for regional risk assessment.
df_risk_assessment = df.select(
    storm_id_col,
    *time_cols,
    latitude_col,
    longitude_col,
    grade_col,
    col("Indicator of landfall or passage"),
)

In [87]:

# Preview the pattern-analysis view (show() defaults spelled out).
df_mode_analysis.show(n=20, truncate=True)


+--------+----+-----+---+----+--------+---------+--------------------+----------------+----------------------------+
|storm_id|year|month|day|hour|latitude|longitude|               grade|Central pressure|Maximum sustained wind speed|
+--------+----+-----+---+----+--------+---------+--------------------+----------------+----------------------------+
|    5101|1951|    2| 19|   6|    20.0|    138.5| Tropical Depression|            1010|                        NULL|
|    5101|1951|    2| 19|  12|    20.0|    138.5| Tropical Depression|            1010|                        NULL|
|    5101|1951|    2| 19|  18|    23.0|    142.1| Tropical Depression|            1000|                        NULL|
|    5101|1951|    2| 20|   0|    25.0|    146.0|Tropical Cyclone ...|             994|                        NULL|
|    5101|1951|    2| 20|   6|    27.6|    150.6|Tropical Cyclone ...|             994|                        NULL|
|    5101|1951|    2| 20|  12|    28.9|    153.3|Tropical Cyclon

In [88]:
# Preview the track-clustering view (show() defaults spelled out).
df_path_clustering.show(n=20, truncate=True)


+--------+--------+---------+
|storm_id|latitude|longitude|
+--------+--------+---------+
|    5101|    20.0|    138.5|
|    5101|    20.0|    138.5|
|    5101|    23.0|    142.1|
|    5101|    25.0|    146.0|
|    5101|    27.6|    150.6|
|    5101|    28.9|    153.3|
|    5101|    31.3|    157.5|
|    5101|    32.6|    162.1|
|    5101|    33.9|    166.0|
|    5101|    36.0|    170.0|
|    5102|     5.7|    158.3|
|    5102|     6.0|    159.4|
|    5102|     6.4|    160.4|
|    5102|     6.7|    161.4|
|    5102|     7.0|    162.5|
|    5102|     7.3|    163.5|
|    5102|     7.5|    164.7|
|    5102|     7.6|    166.2|
|    5102|     7.8|    167.4|
|    5102|     8.0|    168.5|
+--------+--------+---------+
only showing top 20 rows



In [89]:
# Preview the intensity-prediction view (show() defaults spelled out).
df_intensity_prediction.show(n=20, truncate=True)


+--------+----+-----+---+----+--------------------+----------------+----------------------------+
|storm_id|year|month|day|hour|               grade|Central pressure|Maximum sustained wind speed|
+--------+----+-----+---+----+--------------------+----------------+----------------------------+
|    5101|1951|    2| 19|   6| Tropical Depression|            1010|                        NULL|
|    5101|1951|    2| 19|  12| Tropical Depression|            1010|                        NULL|
|    5101|1951|    2| 19|  18| Tropical Depression|            1000|                        NULL|
|    5101|1951|    2| 20|   0|Tropical Cyclone ...|             994|                        NULL|
|    5101|1951|    2| 20|   6|Tropical Cyclone ...|             994|                        NULL|
|    5101|1951|    2| 20|  12|Tropical Cyclone ...|             994|                        NULL|
|    5101|1951|    2| 20|  18|Tropical Cyclone ...|             992|                        NULL|
|    5101|1951|    2

In [90]:
# Preview the risk-assessment view (show() defaults spelled out).
df_risk_assessment.show(n=20, truncate=True)

+--------+----+-----+---+----+--------+---------+--------------------+--------------------------------+
|storm_id|year|month|day|hour|latitude|longitude|               grade|Indicator of landfall or passage|
+--------+----+-----+---+----+--------+---------+--------------------+--------------------------------+
|    5101|1951|    2| 19|   6|    20.0|    138.5| Tropical Depression|                               0|
|    5101|1951|    2| 19|  12|    20.0|    138.5| Tropical Depression|                               0|
|    5101|1951|    2| 19|  18|    23.0|    142.1| Tropical Depression|                               0|
|    5101|1951|    2| 20|   0|    25.0|    146.0|Tropical Cyclone ...|                               0|
|    5101|1951|    2| 20|   6|    27.6|    150.6|Tropical Cyclone ...|                               0|
|    5101|1951|    2| 20|  12|    28.9|    153.3|Tropical Cyclone ...|                               0|
|    5101|1951|    2| 20|  18|    31.3|    157.5|Tropical Cyclon

In [93]:
# Export each task-specific view as a headered CSV. Each frame is coalesced to
# one partition first, and any existing output at the target path is overwritten.
export_targets = [
    (df_mode_analysis, "data/mode_analysis.csv"),
    (df_path_clustering, "data/path_clustering.csv"),
    (df_intensity_prediction, "data/intensity_prediction.csv"),
    (df_risk_assessment, "data/risk_assessment.csv"),
]
for frame, target_path in export_targets:
    frame.coalesce(1).write.mode("overwrite").option("header", True).csv(target_path)

In [94]:
# Stop the Spark session created at the top of the notebook.
spark.stop()