In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract,count,when,avg,lag,format_number,concat, lit,row_number,concat_ws,collect_list,expr
from pyspark.sql.functions import create_map, sort_array, struct
from pyspark.sql.window import Window
from pyspark.sql.types import StringType
import pandas as pd

In [None]:
from pyspark.sql.functions import col, regexp_extract,count,when
import pandas as pd
# Create (or reuse) the SparkSession used by every cell below.
spark = SparkSession.builder.appName("OlympicsEDA").getOrCreate()

# Location of the original comma-separated file and of the semicolon-separated
# copy that Spark actually reads.
csv_file_path = "E:\\hku\\cloud cluster\\ex4\\after1958_filled2.csv"
semicolon_csv_file_path = "E:\\hku\\cloud cluster\\ex4\\after1958_filled2_semicolon.csv"

# Load the original CSV with pandas, drop the "Unnamed..." columns (typically
# left behind by a previously saved index), and rewrite it with ';' as the
# field separator.
df = pd.read_csv(csv_file_path, encoding="UTF-8")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.to_csv(semicolon_csv_file_path, sep=';', index=False, encoding="UTF-8")

# Read the semicolon-separated copy into Spark.
# pandas escapes a quote inside a quoted field by doubling it (""), whereas
# Spark's CSV reader expects a backslash by default. Setting escape to '"'
# makes the reader agree with the writer, so values containing quotes are not
# split or mangled.
spark_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("encoding", "UTF-8")
    .option("delimiter", ";")
    .option("escape", '"')
    .csv(semicolon_csv_file_path)
)




In [None]:
# Column names and inferred types, as a tree.
spark_df.printSchema()

# A sample of the data: the first 50 rows.
spark_df.show(n=50)

# Summary statistics produced by DataFrame.describe().
spark_df.describe().show()

# Total number of rows in the DataFrame.
total_rows = spark_df.count()
print(f"Total Rows: {total_rows}")

# The same column/type information as a list of (name, type) pairs.
print("DataFrame 的列名和数据类型:")
print(spark_df.dtypes)

# 第一个EDA问题：Distribution and development trends of sports participation between male and female athletes

In [None]:
# Step 1: number of rows with a non-null ID for every (Sex, Year) pair.
# NOTE(review): count("ID") counts rows, not distinct IDs. If one ID can appear
# on several rows within a year, this is an entry count rather than a head
# count -- confirm which one is intended.
gender_trend_df = (
    spark_df
    .groupBy("Sex", "Year")
    .agg(count("ID").alias("Participant_Count"))
)

# Step 2: chronological order, then by sex within each year.
sorted_gender_trend_df = gender_trend_df.sort("Year", "Sex")

# Print the first 21 rows.
sorted_gender_trend_df.show(n=21)


In [None]:
# Step 1: number of rows with a non-null ID for every (Sex, Year) pair.
gender_trend_df = (
    spark_df
    .groupBy("Sex", "Year")
    .agg(count("ID").alias("Participant_Count"))
)

# Step 2: growth relative to the previous row of the same sex.
# The window walks each sex in Year order, so "previous" means the preceding
# year present in the data for that sex, not necessarily Year - 1.
window_spec = Window.partitionBy("Sex").orderBy("Year")
gender_trend_df = (
    gender_trend_df
    .withColumn("Prev_Year_Participant", lag("Participant_Count").over(window_spec))
    .withColumn(
        "Growth_Rate",
        # Percentage change with three decimals, rendered as text with a
        # trailing "%". The first row of each sex has no predecessor, so its
        # value stays null.
        concat(
            (
                (col("Participant_Count") - col("Prev_Year_Participant"))
                / col("Prev_Year_Participant")
                * 100
            ).cast("decimal(10,3)"),
            lit("%"),
        ),
    )
)

# Step 3: each sex's share of all rows counted for that year, formatted the
# same way (three decimals plus "%").
total_participants_df = (
    spark_df
    .groupBy("Year")
    .agg(count("ID").alias("Total_Participants"))
)
gender_ratio_df = gender_trend_df.join(total_participants_df, on="Year").withColumn(
    "Participation_Ratio",
    concat(
        (col("Participant_Count") / col("Total_Participants") * 100).cast("decimal(10,3)"),
        lit("%"),
    ),
)


In [None]:
# Print the physical plan Spark builds for gender_ratio_df.
gender_ratio_df.explain()

# Step 4: print the rows ordered by Year and then Sex, without truncating cell
# contents. The 200000 row limit is presumably far above the real row count,
# i.e. "show everything".
gender_ratio_df.sort("Year", "Sex").show(200000, truncate=False)



Inference: Although the overall trend in both graphs is upward, after 1996 there was a slight dip in the number of male participants.


# 第二个EDA问题：Identification of the sports infrastructure and traditional strong events of various countries

In [None]:
# Medals per (Year, NOC): only rows whose Medal value is non-null are counted.
# NOTE(review): this assumes "no medal" is stored as null rather than as a
# placeholder string -- confirm against the data.
medal_counts = spark_df.groupBy("Year", "NOC").agg(
    count(when(col("Medal").isNotNull(), 1)).alias("MedalCount")
)

# Rank the countries inside each year by medal count (highest first). NOC is a
# secondary sort key so that countries tied on MedalCount are ranked the same
# way on every run instead of arbitrarily.
windowSpec = Window.partitionBy("Year").orderBy(col("MedalCount").desc(), col("NOC"))
ranked_medal_counts = (
    medal_counts
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") <= 10)
)
top_10_countries = ranked_medal_counts.drop("rank")

# One row per year listing its top ten as "NOC-MedalCount, NOC-MedalCount, ...".
# collect_list gives no ordering guarantee after a groupBy, so each entry is
# collected together with its rank, the array is sorted by rank, and only then
# are the labels joined. The output is ordered by Year so the preview is stable.
result = (
    ranked_medal_counts
    .withColumn("entry", concat_ws("-", col("NOC"), col("MedalCount")))
    .groupBy("Year")
    .agg(sort_array(collect_list(struct(col("rank"), col("entry")))).alias("ranked_entries"))
    .select("Year", concat_ws(", ", col("ranked_entries.entry")).alias("Top10Countries"))
    .orderBy("Year")
)

# Print the first 20 years without truncating the list.
result.show(20, truncate=False)

In [None]:
from pyspark.sql.functions import collect_list

# Medals per (NOC, Sport): only rows whose Medal value is non-null are counted.
sport_leaders = spark_df.groupBy("NOC", "Sport").agg(
    count(when(col("Medal").isNotNull(), 1)).alias("MedalCount")
)

# Keep, for every sport, the single country with the most medals. NOC is a
# secondary sort key so that a tie for first place is resolved the same way on
# every run instead of arbitrarily.
windowSpec = Window.partitionBy("Sport").orderBy(col("MedalCount").desc(), col("NOC"))
sport_leaders = (
    sport_leaders
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") == 1)
    .drop("rank")
)

# One row per country listing every sport it leads as "Sport-MedalCount".
# collect_list gives no ordering guarantee after a groupBy, so the entries are
# sorted (alphabetically, hence by sport name) before being joined. The output
# is ordered by NOC so the preview is stable.
merged_sport_leaders = (
    sport_leaders
    .groupBy("NOC")
    .agg(
        concat_ws(
            ", ",
            sort_array(collect_list(concat_ws("-", col("Sport"), col("MedalCount")))),
        ).alias("DominantSports")
    )
    .orderBy("NOC")
)

# Print up to 40 countries without truncating the list.
merged_sport_leaders.show(40, truncate=False)

# 第四个EDA问题：Do host countries win significantly more medals compared to the Olympics before and after they host?

In [None]:
# Year -> NOC code of the host country.
# NOTE(review): the table has one host per year. If the data mixes several
# Games held in the same year, all of those rows get this single host --
# confirm that this is acceptable for the analysis.
host_countries = {
    1896: 'GRE', 1900: 'FRA', 1904: 'USA', 1908: 'GBR', 1912: 'SWE', 1920: 'BEL',
    1924: 'FRA', 1928: 'NED', 1932: 'USA', 1936: 'GER', 1948: 'GBR', 1952: 'FIN',
    1956: 'AUS', 1960: 'ITA', 1964: 'JPN', 1968: 'MEX', 1972: 'FRG', 1976: 'CAN',
    1980: 'URS', 1984: 'USA', 1988: 'KOR', 1992: 'ESP', 1996: 'USA', 2000: 'AUS',
    2004: 'GRE', 2008: 'CHN', 2012: 'GBR', 2016: 'BRA'
}
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import os

# Interpreter used by PySpark; replace with your own Python 3.10 path.
# NOTE(review): these variables are presumably only read when a Spark context
# starts, and `spark` already exists at this point, so they likely do not
# affect the current session -- confirm.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.10'  # replace with your Python 3.10 path
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3.10'  # replace with your Python 3.10 path


def get_host_country(year):
    """Return the host country's NOC code for `year`, or None if the year is not in `host_countries`."""
    return host_countries.get(year, None)


# UDF wrapper around the lookup. It is no longer used in this cell, but stays
# defined in case other cells reference it.
get_host_country_udf = udf(get_host_country, StringType())

# Add the Host_Country column with a native map lookup instead of the Python
# UDF: the table is embedded in the query as a literal map, so Spark resolves
# it without sending every row through a Python worker. Years missing from the
# map yield null, exactly as the UDF returned None. Year is cast to int so the
# lookup key matches the integer map keys whatever type was inferred.
host_country_map = create_map(
    *[lit(item) for pair in host_countries.items() for item in pair]
)
spark_df = spark_df.withColumn("Host_Country", host_country_map[col("Year").cast("int")])
spark_df.show()