In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import functions as F

# Create the Spark session for this application, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Laliga")
    .getOrCreate()
)

# Explicit schema for the match CSV files. Every column is declared nullable.
# The column descriptions below follow the football-data.co.uk column key as
# best understood -- NOTE(review): confirm them against the actual data source.
schema = StructType([
    StructField(column, spark_type(), True)
    for column, spark_type in (
        # --- Match identification ---
        ("Div", StringType),
        ("Date", StringType),       # kept as a raw string; not parsed to a date here
        ("HomeTeam", StringType),
        ("AwayTeam", StringType),
        # --- Scores and results ---
        ("FTHG", IntegerType),      # full-time home team goals
        ("FTAG", IntegerType),      # full-time away team goals
        ("FTR", StringType),        # full-time result (H, D, A)
        ("HTHG", IntegerType),      # half-time home team goals
        ("HTAG", IntegerType),      # half-time away team goals
        ("HTR", StringType),        # half-time result (H, D, A)
        # --- Match statistics (home / away) ---
        ("HS", IntegerType),        # home team shots
        ("AS", IntegerType),        # away team shots
        ("HST", IntegerType),       # home team shots on target
        ("AST", IntegerType),       # away team shots on target
        ("HF", IntegerType),        # home team fouls
        ("AF", IntegerType),        # away team fouls
        ("HC", IntegerType),        # home team corners
        ("AC", IntegerType),        # away team corners
        ("HY", IntegerType),        # home team yellow cards
        ("AY", IntegerType),        # away team yellow cards
        ("HR", IntegerType),        # home team red cards
        ("AR", IntegerType),        # away team red cards
        # --- Bookmaker 1X2 odds (home / draw / away) ---
        ("B365H", FloatType),       # Bet365 home win odds
        ("B365D", FloatType),       # Bet365 draw odds
        ("B365A", FloatType),       # Bet365 away win odds
        ("BWH", FloatType),         # presumably Bet&Win home win odds -- confirm
        ("BWD", FloatType),         # presumably Bet&Win draw odds -- confirm
        ("BWA", FloatType),         # presumably Bet&Win away win odds -- confirm
        ("IWH", FloatType),         # Interwetten home win odds
        ("IWD", FloatType),         # Interwetten draw odds
        ("IWA", FloatType),         # Interwetten away win odds
        ("LBH", FloatType),         # Ladbrokes home win odds
        ("LBD", FloatType),         # Ladbrokes draw odds
        ("LBA", FloatType),         # Ladbrokes away win odds
        ("PSH", FloatType),         # Pinnacle Sports home win odds
        ("PSD", FloatType),         # Pinnacle Sports draw odds
        ("PSA", FloatType),         # Pinnacle Sports away win odds
        ("WHH", FloatType),         # William Hill home win odds
        ("WHD", FloatType),         # William Hill draw odds
        ("WHA", FloatType),         # William Hill away win odds
        ("VCH", FloatType),         # VC Bet home win odds
        ("VCD", FloatType),         # VC Bet draw odds
        ("VCA", FloatType),         # VC Bet away win odds
        # --- Aggregated 1X2 odds ---
        # NOTE(review): Bb1X2 appears to be the number of bookmakers behind the
        # max/avg figures rather than an odds value; it is still read as float.
        ("Bb1X2", FloatType),
        ("BbMxH", FloatType),       # maximum home win odds
        ("BbAvH", FloatType),       # average home win odds
        ("BbMxD", FloatType),       # maximum draw odds
        ("BbAvD", FloatType),       # average draw odds
        ("BbMxA", FloatType),       # maximum away win odds
        ("BbAvA", FloatType),       # average away win odds
        # --- Aggregated over/under 2.5 goals odds ---
        # NOTE(review): BbOU appears to be a bookmaker count, not odds -- confirm.
        ("BbOU", FloatType),
        ("BbMx>2.5", FloatType),    # maximum odds for over 2.5 goals
        ("BbAv>2.5", FloatType),    # average odds for over 2.5 goals
        ("BbMx<2.5", FloatType),    # maximum odds for under 2.5 goals
        ("BbAv<2.5", FloatType),    # average odds for under 2.5 goals
        # --- Aggregated Asian handicap odds ---
        # NOTE(review): BbAH appears to be a bookmaker count and BbAHh the size
        # of the handicap (home team) -- confirm.
        ("BbAH", FloatType),
        ("BbAHh", FloatType),
        ("BbMxAHH", FloatType),     # maximum Asian handicap home team odds
        ("BbAvAHH", FloatType),     # average Asian handicap home team odds
        ("BbMxAHA", FloatType),     # maximum Asian handicap away team odds
        ("BbAvAHA", FloatType),     # average Asian handicap away team odds
        # --- Pinnacle Sports (PSC*) ---
        # NOTE(review): these look like Pinnacle *closing* 1X2 odds rather than
        # handicap odds -- confirm.
        ("PSCH", FloatType),        # home
        ("PSCD", FloatType),        # draw
        ("PSCA", FloatType),        # away
    )
])

# Streaming CSV source: read files from a directory using the explicit schema,
# treating the first line of each file as a header row.
df_stream = (
    spark.readStream
    .schema(schema)
    .format("csv")
    .option("header", "true")
    .load("DataStream/")  # make sure this is the directory that holds the files
)


# Keep only the matches won by the away team (FTR == 'A').
away_wins_df = df_stream.filter(F.col("FTR") == "A")

# Per away team: the number of away wins and the total shots on target (AST)
# summed over those winning matches.
result_df = away_wins_df.groupBy("AwayTeam").agg(
    F.count("FTR").alias("away_wins_count"),       # number of away wins
    F.sum("AST").alias("total_shots_on_target"),   # total shots on target
)

# Top 3 teams by number of away wins, highest first.
# AwayTeam is used as a secondary sort key purely as a tie-breaker: without it,
# which teams make the cut when several share the same win count is arbitrary
# and may differ from one micro-batch to the next.
top_3_teams_df = result_df.orderBy(
    F.col("away_wins_count").desc(),
    F.col("AwayTeam").asc(),
).limit(3)

# Write the result table to the console. "complete" output mode re-emits the
# whole aggregated table on every trigger, which the sorted aggregation needs.
query = (
    top_3_teams_df.writeStream
    .outputMode("complete")
    .format("console")
    .start()
)

# Block until the stream terminates. The finally clause stops the query if the
# wait is interrupted (e.g. a notebook kernel interrupt) or fails, so that no
# orphaned streaming query keeps running in the background and re-running this
# cell does not start a second, duplicate query.
try:
    query.awaitTermination()
finally:
    query.stop()


25/04/17 12:22:14 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/20/3zj4bhnd5gl57qd2cr57vsk40000gn/T/temporary-2fcf18ab-482a-42bc-8b03-92a7b0add774. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/17 12:22:14 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+---------------+---------------------+
|   AwayTeam|away_wins_count|total_shots_on_target|
+-----------+---------------+---------------------+
|  Barcelona|             12|                   89|
| Ath Madrid|             11|                   54|
|Real Madrid|             10|                   69|
+-----------+---------------+---------------------+

