# Extract and Clean Data from all_matches.csv

Primero creamos un Spark Context

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import  (StructType, StructField, DateType, BooleanType, DoubleType, IntegerType, StringType, TimestampType)
from pyspark.sql.functions import col, udf
import os

spark = SparkSession.builder.master("local[1]").appName("tenis-matches").getOrCreate()

In [71]:
raw_file_path = os.path.join(os.path.abspath(os.path.pardir), "dataset", "raw", "all_matches.csv")
matches = spark.read.csv("file:///" + raw_file_path, header = True)

In [72]:
print("There are {} matches.".format(matches.count()))

There are 4295827 matches.


### Eliminaremos algunas columnas del dataset que no nos sirven

In [4]:
matches.printSchema()

root
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- location: string (nullable = true)
 |-- court_surface: string (nullable = true)
 |-- prize_money: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- year: string (nullable = true)
 |-- player_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- opponent_id: string (nullable = true)
 |-- opponent_name: string (nullable = true)
 |-- tournament: string (nullable = true)
 |-- round: string (nullable = true)
 |-- num_sets: string (nullable = true)
 |-- sets_won: string (nullable = true)
 |-- games_won: string (nullable = true)
 |-- games_against: string (nullable = true)
 |-- tiebreaks_won: string (nullable = true)
 |-- tiebreaks_total: string (nullable = true)
 |-- serve_rating: string (nullable = true)
 |-- aces: string (nullable = true)
 |-- double_faults: string (nullable = true)
 |-- first_serve_made: string (nullable = true)
 |-- first_serve_attempted:

Eliminaremos las siguientes columnas que no serán útiles para nuestro análisis:
- end_date
- location
- prize_money
- currency
- round
- serve_rating
- aces
- double_faults
- first_serve_made
- first_serve_attempted
- first_serve_points_made
- first_serve_points_attempted
- second_serve_points_made
- second_serve_points_attempted
- break_points_saved
- break_points_against
- service_games_won
- return_rating
- first_serve_return_points_made
- first_serve_return_points_attempted
- second_serve_return_points_made
- second_serve_return_points_attempted
- break_points_made
- break_points_attempted
- return_games_played
- service_points_won
- service_points_attempted
- return_points_won
- return_points_attempted
- total_points_won
- total_points
- duration
- seed
- masters
- round_num
- nation

In [5]:
matches = matches.drop("end_date")\
                .drop("location")\
                .drop("prize_money")\
                .drop("currency")\
                .drop("round")\
                .drop("serve_rating")\
                .drop("aces")\
                .drop("double_faults")\
                .drop("first_serve_made")\
                .drop("first_serve_attempted")\
                .drop("first_serve_points_made")\
                .drop("first_serve_points_attempted")\
                .drop("second_serve_points_made")\
                .drop("second_serve_points_attempted")\
                .drop("break_points_saved")\
                .drop("break_points_against")\
                .drop("service_games_won")\
                .drop("return_rating")\
                .drop("first_serve_return_points_made")\
                .drop("first_serve_return_points_attempted")\
                .drop("second_serve_return_points_made")\
                .drop("second_serve_return_points_attempted")\
                .drop("break_points_made")\
                .drop("break_points_attempted")\
                .drop("return_games_played")\
                .drop("service_points_won")\
                .drop("service_points_attempted")\
                .drop("return_points_won")\
                .drop("return_points_attempted")\
                .drop("total_points_won")\
                .drop("total_points")\
                .drop("duration")\
                .drop("seed")\
                .drop("masters")\
                .drop("round_num")\
                .drop("nation")

In [6]:
matches.printSchema()

root
 |-- start_date: string (nullable = true)
 |-- court_surface: string (nullable = true)
 |-- year: string (nullable = true)
 |-- player_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- opponent_id: string (nullable = true)
 |-- opponent_name: string (nullable = true)
 |-- tournament: string (nullable = true)
 |-- num_sets: string (nullable = true)
 |-- sets_won: string (nullable = true)
 |-- games_won: string (nullable = true)
 |-- games_against: string (nullable = true)
 |-- tiebreaks_won: string (nullable = true)
 |-- tiebreaks_total: string (nullable = true)
 |-- player_victory: string (nullable = true)
 |-- retirement: string (nullable = true)
 |-- won_first_set: string (nullable = true)
 |-- doubles: string (nullable = true)



Eliminaremos registros con muchos valores nulos

In [7]:
# define functions to iterate in a generic way in dataframe

def print_null_value_count(df, column_name):
    null_values_count = df.where(df[column_name].isNull()).count()
    print("Cantidad de valores nulos para {0} : {1}.".format(column_name, null_values_count))
    
def get_null_values_in_dataframe(df):
    for column in df.columns:
        print_null_value_count(df, column)

In [8]:
get_null_values_in_dataframe(matches)

Cantidad de valores nulos para start_date : 0.
Cantidad de valores nulos para court_surface : 13628.
Cantidad de valores nulos para year : 0.
Cantidad de valores nulos para player_id : 0.
Cantidad de valores nulos para player_name : 3941422.
Cantidad de valores nulos para opponent_id : 0.
Cantidad de valores nulos para opponent_name : 3941422.
Cantidad de valores nulos para tournament : 0.
Cantidad de valores nulos para num_sets : 2388.
Cantidad de valores nulos para sets_won : 2388.
Cantidad de valores nulos para games_won : 2388.
Cantidad de valores nulos para games_against : 2388.
Cantidad de valores nulos para tiebreaks_won : 2388.
Cantidad de valores nulos para tiebreaks_total : 2388.
Cantidad de valores nulos para player_victory : 2388.
Cantidad de valores nulos para retirement : 7216.
Cantidad de valores nulos para won_first_set : 64485.
Cantidad de valores nulos para doubles : 0.


En base a los datos obtenidos, es que se decide eliminar las siguientes columnas debido a que muchos valores son nulos, y además esta será obtenida de otro dataset:
- player_name
- opponent_name

In [9]:
matches = matches.drop("player_name").drop("opponent_name")

Luego, eliminaremos los datos donde tengan alguna propiedad nula.

In [10]:
matches = matches.na.drop()

Comprobamos no existan datos con propiedades nulas

In [11]:
get_null_values_in_dataframe(matches)

Cantidad de valores nulos para start_date : 0.
Cantidad de valores nulos para court_surface : 0.
Cantidad de valores nulos para year : 0.
Cantidad de valores nulos para player_id : 0.
Cantidad de valores nulos para opponent_id : 0.
Cantidad de valores nulos para tournament : 0.
Cantidad de valores nulos para num_sets : 0.
Cantidad de valores nulos para sets_won : 0.
Cantidad de valores nulos para games_won : 0.
Cantidad de valores nulos para games_against : 0.
Cantidad de valores nulos para tiebreaks_won : 0.
Cantidad de valores nulos para tiebreaks_total : 0.
Cantidad de valores nulos para player_victory : 0.
Cantidad de valores nulos para retirement : 0.
Cantidad de valores nulos para won_first_set : 0.
Cantidad de valores nulos para doubles : 0.


In [12]:
print("Nueva cantidad de partidos : {0}".format(matches.select("*").count()))

Nueva cantidad de partidos : 4213346


### Modificaremos los tipos de algunos datos para que sean mas amigables luego

In [40]:
matches.printSchema()

root
 |-- start_date: string (nullable = true)
 |-- court_surface: string (nullable = true)
 |-- year: string (nullable = true)
 |-- player_id: string (nullable = true)
 |-- opponent_id: string (nullable = true)
 |-- tournament: string (nullable = true)
 |-- num_sets: string (nullable = true)
 |-- sets_won: string (nullable = true)
 |-- games_won: string (nullable = true)
 |-- games_against: string (nullable = true)
 |-- tiebreaks_won: string (nullable = true)
 |-- tiebreaks_total: string (nullable = true)
 |-- player_victory: string (nullable = true)
 |-- retirement: string (nullable = true)
 |-- won_first_set: string (nullable = true)
 |-- doubles: string (nullable = true)



Observaremos los tipos que deberían ser booleanos

In [41]:
boolean_columns = ["player_victory", "retirement", "won_first_set", "doubles"]
matches.select(boolean_columns).show(5)

+--------------+----------+-------------+-------+
|player_victory|retirement|won_first_set|doubles|
+--------------+----------+-------------+-------+
|             f|         f|            f|      f|
|             t|         f|            t|      f|
|             f|         f|            f|      f|
|             f|         f|            t|      t|
|             t|         f|            t|      f|
+--------------+----------+-------------+-------+
only showing top 5 rows



Pondremos valores de True y False, en lugar de f y t.

In [45]:
from pyspark.sql.functions import col,udf

boolParse = udf(lambda x: True if x == "t" else False, BooleanType())

def parse_boolean_columns(df, columns):
    for column in columns:
        df = df.withColumn(column, boolParse(col(column)))
    return df

In [46]:
matches = parse_boolean_columns(matches, boolean_columns)

In [47]:
matches.select(boolean_columns).show(5)

+--------------+----------+-------------+-------+
|player_victory|retirement|won_first_set|doubles|
+--------------+----------+-------------+-------+
|         false|     false|        false|  false|
|          true|     false|         true|  false|
|         false|     false|        false|  false|
|         false|     false|         true|   true|
|          true|     false|         true|  false|
+--------------+----------+-------------+-------+
only showing top 5 rows



In [48]:
processed_file_path = os.path.join(os.path.abspath(os.path.pardir), "dataset", "processed", "all_matches.csv")
matches.write.format("csv").option("header", True).mode('overwrite').save("file:///" + processed_file_path)

## Creacion de archivo para ejecutar extraccion de datos

In [62]:
get_processed_data_script_file = os.path.join(os.path.pardir, "process_all_matches.py")

In [73]:
%%writefile $get_processed_data_script_file
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import  (StructType, StructField, DateType, BooleanType, DoubleType, IntegerType, StringType, TimestampType)
from pyspark.sql.functions import col, udf
import os

def extract_data():
    spark = SparkSession.builder.master("local[1]").appName("tenis-matches").getOrCreate()
    
    raw_file_path = os.path.join(os.path.abspath(os.path.pardir), "dataset", "raw", "all_matches.csv")
    matches = spark.read.csv("file:///" + raw_file_path, header = True)
    
    matches = matches.drop("end_date")\
                .drop("location")\
                .drop("prize_money")\
                .drop("currency")\
                .drop("round")\
                .drop("serve_rating")\
                .drop("aces")\
                .drop("double_faults")\
                .drop("first_serve_made")\
                .drop("first_serve_attempted")\
                .drop("first_serve_points_made")\
                .drop("first_serve_points_attempted")\
                .drop("second_serve_points_made")\
                .drop("second_serve_points_attempted")\
                .drop("break_points_saved")\
                .drop("break_points_against")\
                .drop("service_games_won")\
                .drop("return_rating")\
                .drop("first_serve_return_points_made")\
                .drop("first_serve_return_points_attempted")\
                .drop("second_serve_return_points_made")\
                .drop("second_serve_return_points_attempted")\
                .drop("break_points_made")\
                .drop("break_points_attempted")\
                .drop("return_games_played")\
                .drop("service_points_won")\
                .drop("service_points_attempted")\
                .drop("return_points_won")\
                .drop("return_points_attempted")\
                .drop("total_points_won")\
                .drop("total_points")\
                .drop("duration")\
                .drop("seed")\
                .drop("masters")\
                .drop("round_num")\
                .drop("nation")\
                .drop("player_name")\
                .drop("opponent_name")
    
    matches = matches.na.drop()
        
    boolean_columns = ["player_victory", "retirement", "won_first_set", "doubles"]
    matches = parse_boolean_columns(matches, boolean_columns)
    
    return matches

def parse_boolean_columns(df, columns):
    boolParse = udf(lambda x: True if x == "t" else False, BooleanType())
    for column in columns:
        df = df.withColumn(column, boolParse(col(column)))
    return df

if __name__ == '__main__':
    df = extract_data()
    processed_file_path = os.path.join(os.path.abspath(os.path.pardir), "dataset", "processed", "all_matches.csv")
    df.write.format("csv").option("header", True).mode('overwrite').save("file:///" + processed_file_path)

Overwriting ..\process_all_matches.py


In [74]:
!python $get_processed_data_script_file

SUCCESS: The process with PID 11300 (child process of PID 19540) has been terminated.
SUCCESS: The process with PID 19540 (child process of PID 3984) has been terminated.
SUCCESS: The process with PID 3984 (child process of PID 1476) has been terminated.


The system cannot find the path specified.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2020-11-09 22:48:59,033 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2020-11-09 22:48:59,034 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
2020-11-09 22:48:59,034 WARN util.Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
Traceback (most recent call last):
  File "..\process_all_matches.py", line 67, in <module>
    df = extract_data()
  File "..\process_all_matches.py", line 11, in extract_data
    raw_file_path = os.path.join(os.path.abspath(os.path.pardir), "dataset", "raw", "all_matches.csv")
NameError: name 'os' is not defined
