[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fisamz/Repositorio_MCDAA/blob/main/Tarea4_5/Tarea4_5.ipynb)

# Tarea 4 y 5 — MLlib PySpark
**Alumno:** Fisam Zavala  
**Dataset:** Resultados de futbol & momios de casas de apuestas.  
**Fuente:** [European Soccer Database](https://www.kaggle.com/datasets/hugomathien/soccer)


In [3]:
#%pip install pyspark
import pyspark
pyspark.__version__

'4.1.1'


XXXXXXX


In [None]:
# (Opcional / referencia) Descarga desde Kaggle:
# !pip install kaggle
# !kaggle datasets download -d hugomathien/soccer
# !unzip soccer.zip -d data/

#import sqlite3, pandas as pd

#conn = sqlite3.connect("../data/database.sqlite")

#df_match = pd.read_sql("SELECT * FROM Match", conn)
#df_match.to_csv("../data/match.csv", index=False)

#conn.close()


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Tarea4_MLlib") \
    .master("local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")


In [5]:
from pyspark.sql import functions as F

df_match = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../data/match.csv")
)

print("Filas:", df_match.count())
df_match.printSchema()
df_match.show(5, truncate=False)


                                                                                

Filas: 27383
root
 |-- id: string (nullable = true)
 |-- country_id: string (nullable = true)
 |-- league_id: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- stage: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- match_api_id: integer (nullable = true)
 |-- home_team_api_id: integer (nullable = true)
 |-- away_team_api_id: integer (nullable = true)
 |-- home_team_goal: integer (nullable = true)
 |-- away_team_goal: integer (nullable = true)
 |-- home_player_X1: double (nullable = true)
 |-- home_player_X2: double (nullable = true)
 |-- home_player_X3: double (nullable = true)
 |-- home_player_X4: double (nullable = true)
 |-- home_player_X5: double (nullable = true)
 |-- home_player_X6: double (nullable = true)
 |-- home_player_X7: double (nullable = true)
 |-- home_player_X8: double (nullable = true)
 |-- home_player_X9: double (nullable = true)
 |-- home_player_X10: double (nullable = true)
 |-- home_player_X11: double (nullable = true)
 |--

In [6]:
from pyspark.sql.functions import col

# ============ 1. Crear label ============
df = df_match.withColumn(
    "label",
    (col("home_team_goal") > col("away_team_goal")).cast("int")
)

# ============ 2. Función prob implícita ============
def implied_prob(df, h, d, a, prefix):

    df = df.withColumn(f"{prefix}_ph", 1/col(h)) \
           .withColumn(f"{prefix}_pd", 1/col(d)) \
           .withColumn(f"{prefix}_pa", 1/col(a))

    df = df.withColumn(
        f"{prefix}_sum",
        col(f"{prefix}_ph")+col(f"{prefix}_pd")+col(f"{prefix}_pa")
    )

    df = df.withColumn(f"{prefix}_ph", col(f"{prefix}_ph")/col(f"{prefix}_sum")) \
           .withColumn(f"{prefix}_pd", col(f"{prefix}_pd")/col(f"{prefix}_sum")) \
           .withColumn(f"{prefix}_pa", col(f"{prefix}_pa")/col(f"{prefix}_sum"))

    return df


# ============ 3. Aplicar a casas ============
df = implied_prob(df, "B365H","B365D","B365A","b365")
df = implied_prob(df, "BWH","BWD","BWA","bw")
df = implied_prob(df, "IWH","IWD","IWA","iw")
df = implied_prob(df, "WHH","WHD","WHA","wh")


# ============ 4. Promedio ============
df = df.withColumn(
    "p_home_avg",
    (col("b365_ph")+col("bw_ph")+col("iw_ph")+col("wh_ph"))/4
).withColumn(
    "p_draw_avg",
    (col("b365_pd")+col("bw_pd")+col("iw_pd")+col("wh_pd"))/4
).withColumn(
    "p_away_avg",
    (col("b365_pa")+col("bw_pa")+col("iw_pa")+col("wh_pa"))/4
)

# Feature extra
df = df.withColumn(
    "fav_strength",
    col("p_home_avg") - col("p_away_avg")
)


In [7]:
df.select(
    "p_home_avg",
    "p_draw_avg",
    "p_away_avg",
    "fav_strength",
    "label"
).show(5)


+-------------------+-------------------+-------------------+--------------------+-----+
|         p_home_avg|         p_draw_avg|         p_away_avg|        fav_strength|label|
+-------------------+-------------------+-------------------+--------------------+-----+
| 0.5134253755813658|0.27204496342190043|0.21452966099673376| 0.29889571458463204|    0|
| 0.4793486838444711|0.27564460780362987|0.24500670835189906| 0.23434197549257205|    0|
|0.36347576467707754| 0.2768749038016514|0.35964933152127115|0.003826433155806...|    0|
| 0.6302556885423144|0.23262589986618726|0.13711841159149835| 0.49313727695081605|    1|
|0.19870182784368112| 0.2617382683146468| 0.5395599038416721|-0.34085807599799095|    0|
+-------------------+-------------------+-------------------+--------------------+-----+
only showing top 5 rows


In [8]:
from pyspark.sql import functions as F

df2 = (
    df
    # año de temporada: "2008/2009" -> 2008
    .withColumn("season_year", F.substring("season", 1, 4).cast("int"))
    # aseguramos tipo numérico en ids
    .withColumn("home_team_api_id", F.col("home_team_api_id").cast("int"))
    .withColumn("away_team_api_id", F.col("away_team_api_id").cast("int"))
    .withColumn("league_id", F.col("league_id").cast("int"))
    .withColumn("country_id", F.col("country_id").cast("int"))
    .withColumn("stage", F.col("stage").cast("int"))
)

feature_cols = [
    "p_home_avg","p_draw_avg","p_away_avg","fav_strength",
    "season_year","stage","league_id","country_id",
    "home_team_api_id","away_team_api_id"
]

df2 = df2.dropna(subset=feature_cols + ["label"])
print("Filas para modelar:", df2.count())


26/02/20 12:20:05 ERROR Executor: Exception in task 1.0 in stage 7.0 (TID 21) 8]
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '<shoton>' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 11 in cell [8]

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:147)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.withException(UTF8StringUtils.scala:51)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.toIntExact(UTF8StringUtils.scala:34)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils.toIntExact(UTF8StringUtils.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache

NumberFormatException: [CAST_INVALID_INPUT] The value '<shoton>' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 11 in cell [8]
