In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the SparkSession for this notebook (app name 'practica'), or reuse an existing one.
spark = SparkSession.builder.appName('practica').getOrCreate()

In [4]:
# Echo the session object to confirm it was created.
spark

# 1. **Introducción**

In [5]:
# Read the CSV with default options: the header line is not recognised, so columns are auto-named _c0, _c1, ...
df_pyspark = spark.read.csv('Pokemon.csv')

In [6]:
 # Preview the rows; the real header line shows up as the first data row.
 df_pyspark.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|_c0|                 _c1|   _c2|   _c3|  _c4|_c5|   _c6|    _c7|    _c8|    _c9| _c10|      _c11|     _c12|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    False|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    False|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    False|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    False|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    False|
|  5|          Char

El método `option` recibe una clave y un valor (ambos como string) y permite configurar opciones de lectura

In [7]:
# Treat the first line of the CSV as the column headers instead of data.

df_pyspark = spark.read.option('header','true').csv('Pokemon.csv')

In [8]:
# The first CSV line now supplies the column names instead of appearing as a data row.
df_pyspark.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    False|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    False|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    False|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    False|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    False|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    False|
|  6|           Cha

In [9]:
# Confirm that the object is a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [10]:
# Some DataFrame methods resemble their pandas counterparts.

df_pyspark.head()     # Returns the first row as a Row object, not as a DataFrame

Row(#='1', Name='Bulbasaur', Type 1='Grass', Type 2='Poison', Total='318', HP='45', Attack='49', Defense='49', Sp. Atk='65', Sp. Def='65', Speed='45', Generation='1', Legendary='False')

In [11]:
# head(n) returns a list with the first n Row objects.
df_pyspark.head(4)

[Row(#='1', Name='Bulbasaur', Type 1='Grass', Type 2='Poison', Total='318', HP='45', Attack='49', Defense='49', Sp. Atk='65', Sp. Def='65', Speed='45', Generation='1', Legendary='False'),
 Row(#='2', Name='Ivysaur', Type 1='Grass', Type 2='Poison', Total='405', HP='60', Attack='62', Defense='63', Sp. Atk='80', Sp. Def='80', Speed='60', Generation='1', Legendary='False'),
 Row(#='3', Name='Venusaur', Type 1='Grass', Type 2='Poison', Total='525', HP='80', Attack='82', Defense='83', Sp. Atk='100', Sp. Def='100', Speed='80', Generation='1', Legendary='False'),
 Row(#='3', Name='VenusaurMega Venusaur', Type 1='Grass', Type 2='Poison', Total='625', HP='80', Attack='100', Defense='123', Sp. Atk='122', Sp. Def='120', Speed='80', Generation='1', Legendary='False')]

In [12]:
# Comparable to pandas' info(): prints each column's name, data type and nullability.

df_pyspark.printSchema()

root
 |-- #: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Type 1: string (nullable = true)
 |-- Type 2: string (nullable = true)
 |-- Total: string (nullable = true)
 |-- HP: string (nullable = true)
 |-- Attack: string (nullable = true)
 |-- Defense: string (nullable = true)
 |-- Sp. Atk: string (nullable = true)
 |-- Sp. Def: string (nullable = true)
 |-- Speed: string (nullable = true)
 |-- Generation: string (nullable = true)
 |-- Legendary: string (nullable = true)



# 2. **Manejo Dataframes**

Observamos que todas las columnas son de tipo string, aunque muchas contienen enteros. Esto se debe a que, al leer un CSV, PySpark asigna por defecto el tipo string a todas las columnas

In [13]:
# Every column is still a string here: this DataFrame was read without schema inference.
df_pyspark.printSchema()

root
 |-- #: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Type 1: string (nullable = true)
 |-- Type 2: string (nullable = true)
 |-- Total: string (nullable = true)
 |-- HP: string (nullable = true)
 |-- Attack: string (nullable = true)
 |-- Defense: string (nullable = true)
 |-- Sp. Atk: string (nullable = true)
 |-- Sp. Def: string (nullable = true)
 |-- Speed: string (nullable = true)
 |-- Generation: string (nullable = true)
 |-- Legendary: string (nullable = true)



Se soluciona con el parámetro *inferSchema*, del método *csv*

In [14]:
# inferSchema=True makes Spark detect each column's type from the data instead of defaulting to string.
df_pyspark = spark.read.option('header','true').csv('Pokemon.csv',inferSchema=True)
df_pyspark.printSchema()

root
 |-- #: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Type 1: string (nullable = true)
 |-- Type 2: string (nullable = true)
 |-- Total: integer (nullable = true)
 |-- HP: integer (nullable = true)
 |-- Attack: integer (nullable = true)
 |-- Defense: integer (nullable = true)
 |-- Sp. Atk: integer (nullable = true)
 |-- Sp. Def: integer (nullable = true)
 |-- Speed: integer (nullable = true)
 |-- Generation: integer (nullable = true)
 |-- Legendary: boolean (nullable = true)



La etiqueta nullable significa que la columna puede albergar nulos (que sea True no significa que tenga nulos, sino que puede tenerlos)

Otra forma de hacerlo

In [15]:
# Equivalent read that passes both options as keyword arguments of csv().
df_pyspark = spark.read.csv('Pokemon.csv',inferSchema=True,header=True)
df_pyspark.printSchema()

root
 |-- #: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Type 1: string (nullable = true)
 |-- Type 2: string (nullable = true)
 |-- Total: integer (nullable = true)
 |-- HP: integer (nullable = true)
 |-- Attack: integer (nullable = true)
 |-- Defense: integer (nullable = true)
 |-- Sp. Atk: integer (nullable = true)
 |-- Sp. Def: integer (nullable = true)
 |-- Speed: integer (nullable = true)
 |-- Generation: integer (nullable = true)
 |-- Legendary: boolean (nullable = true)



In [16]:
# With the inferred schema, Legendary is a boolean and prints as false instead of the string 'False'.
df_pyspark.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

In [17]:
# Schema inference changes the column types, not the kind of object: it is still a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

Obtener nombre de columnas, y seleccionar columnas

In [18]:
# columns is a plain Python list with the column names.
df_pyspark.columns

['#',
 'Name',
 'Type 1',
 'Type 2',
 'Total',
 'HP',
 'Attack',
 'Defense',
 'Sp. Atk',
 'Sp. Def',
 'Speed',
 'Generation',
 'Legendary']

In [19]:
# select() returns a new DataFrame; no rows are displayed until show() is called.

df_pyspark.select('Name')

DataFrame[Name: string]

In [20]:
# show() prints the first 20 rows of the selected column.
df_pyspark.select('Name').show()

+--------------------+
|                Name|
+--------------------+
|           Bulbasaur|
|             Ivysaur|
|            Venusaur|
|VenusaurMega Venu...|
|          Charmander|
|          Charmeleon|
|           Charizard|
|CharizardMega Cha...|
|CharizardMega Cha...|
|            Squirtle|
|           Wartortle|
|           Blastoise|
|BlastoiseMega Bla...|
|            Caterpie|
|             Metapod|
|          Butterfree|
|              Weedle|
|              Kakuna|
|            Beedrill|
|BeedrillMega Beed...|
+--------------------+
only showing top 20 rows



In [21]:
# Select several columns at once by passing a list of column names.

df_pyspark.select(['Name','Type 1','Total']).show()

+--------------------+------+-----+
|                Name|Type 1|Total|
+--------------------+------+-----+
|           Bulbasaur| Grass|  318|
|             Ivysaur| Grass|  405|
|            Venusaur| Grass|  525|
|VenusaurMega Venu...| Grass|  625|
|          Charmander|  Fire|  309|
|          Charmeleon|  Fire|  405|
|           Charizard|  Fire|  534|
|CharizardMega Cha...|  Fire|  634|
|CharizardMega Cha...|  Fire|  634|
|            Squirtle| Water|  314|
|           Wartortle| Water|  405|
|           Blastoise| Water|  530|
|BlastoiseMega Bla...| Water|  630|
|            Caterpie|   Bug|  195|
|             Metapod|   Bug|  205|
|          Butterfree|   Bug|  395|
|              Weedle|   Bug|  195|
|              Kakuna|   Bug|  205|
|            Beedrill|   Bug|  395|
|BeedrillMega Beed...|   Bug|  495|
+--------------------+------+-----+
only showing top 20 rows



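**IMPORTANTE:** En PySpark no funciona la indexación al estilo pandas: `df_pyspark['Name']` devuelve un objeto `Column` (una expresión), no los datos de la columna. Para ver los datos hay que usar `select(...).show()`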

In [22]:
# Indexing by name yields a Column expression, not the column's data.
df_pyspark['Name']

Column<'Name'>

Chequear los tipos de dato

In [23]:
# dtypes is a list of (column name, type name) tuples.

df_pyspark.dtypes

[('#', 'int'),
 ('Name', 'string'),
 ('Type 1', 'string'),
 ('Type 2', 'string'),
 ('Total', 'int'),
 ('HP', 'int'),
 ('Attack', 'int'),
 ('Defense', 'int'),
 ('Sp. Atk', 'int'),
 ('Sp. Def', 'int'),
 ('Speed', 'int'),
 ('Generation', 'int'),
 ('Legendary', 'boolean')]

Describe

In [24]:
# describe() summarises string and numeric columns alike; string columns show null for the mean.

df_pyspark.describe().show()

+-------+------------------+----------------+------+------+------------------+------------------+-----------------+------------------+----------------+-----------------+------------------+------------------+
|summary|                 #|            Name|Type 1|Type 2|             Total|                HP|           Attack|           Defense|         Sp. Atk|          Sp. Def|             Speed|        Generation|
+-------+------------------+----------------+------+------+------------------+------------------+-----------------+------------------+----------------+-----------------+------------------+------------------+
|  count|               800|             800|   800|   414|               800|               800|              800|               800|             800|              800|               800|               800|
|   mean|         362.81375|            null|  null|  null|          435.1025|          69.25875|         79.00125|           73.8425|           72.82|          71.9025

Añadir y eliminar columnas de un dataframe

Se añaden con el método del dataframe *withColumn*, que permite añadir o reemplazar columnas. Recibe los parámetros:
- **col_name** : Nombre de la nueva columna
- **col** : Expresión para el valor de la nueva columna

**NOTA:** No modifica el dataframe original: devuelve un nuevo dataframe con la columna añadida.

In [25]:
# Append a derived column (Total + 100); the result is only displayed, not stored.
df_pyspark.withColumn('Total + 100', df_pyspark['Total'] + 100 ).show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+-----------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|Total + 100|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+-----------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|        418|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|        505|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|        625|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|        725|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|        409|
|  5|          Charmeleon|  Fire

In [26]:
# The new column is absent: withColumn() returned a new DataFrame and left df_pyspark unchanged.

df_pyspark.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

In [27]:
# Reassign the variable so that the added column is kept.
df_pyspark = df_pyspark.withColumn('Total + 100', df_pyspark['Total'] + 100 )
df_pyspark.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+-----------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|Total + 100|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+-----------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|        418|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|        505|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|        625|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|        725|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|        409|
|  5|          Charmeleon|  Fire

Eliminar columnas

In [28]:
# drop() returns a DataFrame without the column; df_pyspark itself is not modified.
df_pyspark.drop('Total + 100').show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

In [29]:
# drop() returns a new DataFrame, so the variable must be reassigned to keep the change.
df_pyspark = df_pyspark.drop('Total + 100')

In [30]:
# The 'Total + 100' column is no longer present.
df_pyspark.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

Renombrar columnas

Se usa el método withColumnRenamed(), al que pasamos el nombre actual de la columna y el nuevo nombre

In [31]:
# Like withColumn() and drop(), this is not an in-place operation: it returns a new DataFrame.
df_pyspark.withColumnRenamed('Total','Total_Renamed').show()

+---+--------------------+------+------+-------------+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total_Renamed| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-------------+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|          318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|          405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|          525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|          625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  null|          309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|          405| 58|    64

# 3. **Manejo de nulos** 

In [32]:
# Load a second dataset that contains null values.
df_null = spark.read.csv('Pokemon_null.csv',header=True,inferSchema=True)
df_null.show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|  60.0|  50.0|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|  80.0|  65.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flyi

Tenemos nulos en las columnas Type_2, Sp_Atk y Sp_Def

In [33]:
# Drop every row that contains at least one null value.
df_null.na.drop().show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flying|  534| 78|    84|     78| 109.0|  85.0|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Dragon|  634| 78|   130|    111| 130.0|  85.0|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Flyi

In [34]:
# na.drop() returns a new DataFrame; df_null keeps its null rows.
df_null.na.drop()

DataFrame[#: int, Name: string, Type_1: string, Type_2: string, Total: int, HP: int, Attack: int, Defense: int, Sp_Atk: double, Sp_Def: double, Speed: int, Generation: int, Legendary: boolean]

El método na.drop() tiene un parámetro *how* que tiene valor predeterminado 'any', y puede tener valor 'all':
- **any:** Elimina todas las filas donde haya al menos un nulo
- **all:** Elimina todas las filas donde todos los valores sean nulos

In [35]:
# No rows are removed here because no row has all of its values null.
df_null.na.drop(how='all').show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|  60.0|  50.0|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|  80.0|  65.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flyi

Otro parámetro que acepta na.drop() es *thresh*. Es un valor int que especifica el número mínimo de valores no nulos que debe tener una fila para conservarse: se eliminan las filas con menos de *thresh* valores no nulos. Cuando se indica *thresh*, el parámetro *how* se ignora

In [36]:
# Keep only the rows that have at least 11 non-null values.
# When thresh is given it overrides how, so how is not passed here.
df_null.na.drop(thresh=11).show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|  60.0|  50.0|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|  80.0|  65.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flyi

In [37]:
# Require at least 12 non-null values; the '#' column also counts towards that total.
# how is omitted because thresh overrides it.
df_null.na.drop(thresh=12).show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|  60.0|  50.0|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|  80.0|  65.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flyi

El último parámetro que acepta na.drop() es *subset*. Permite especificar una o varias columnas del dataset; solo se buscarán nulos en esas columnas, de modo que se eliminarán las filas que tengan un nulo en alguna de ellas

In [38]:
# Drop only the rows whose Type_2 value is null.
df_null.na.drop(subset=['Type_2']).show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flying|  534| 78|    84|     78| 109.0|  85.0|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Dragon|  634| 78|   130|    111| 130.0|  85.0|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Flyi

In [39]:
# Drop the rows that have a null in Type_2 or in Sp_Atk.
df_null.na.drop(subset=['Type_2','Sp_Atk']).show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flying|  534| 78|    84|     78| 109.0|  85.0|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Dragon|  634| 78|   130|    111| 130.0|  85.0|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Flyi

#### Relleno de nulos

In [40]:
# Fill nulls with 0 in the columns whose type matches the fill value: numeric columns here, since the value is a number.
# Type_2 is left untouched because it is a string column.
df_null.na.fill(0).show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|  60.0|  50.0|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|  80.0|  65.0|   80|         1|    false|
|  6|           Charizard|  Fire|Flyi

In [41]:
# With a string fill value, the nulls in the string column Type_2 are replaced this time.
df_null.na.fill('Missing_Value').show()

+---+--------------------+------+-------------+-----+---+------+-------+------+------+-----+----------+---------+
|  #|                Name|Type_1|       Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|
+---+--------------------+------+-------------+-----+---+------+-------+------+------+-----+----------+---------+
|  1|           Bulbasaur| Grass|       Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|
|  2|             Ivysaur| Grass|       Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|
|  3|            Venusaur| Grass|       Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|       Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|
|  4|          Charmander|  Fire|Missing_Value|  309| 39|    52|     43|  60.0|  50.0|   65|         1|    false|
|  5|          Charmeleon|  Fire|Missing_Value|  405| 58|    64|     58|  80.0|  65.0|  

#### Rellenado con Imputer

Imputer es una clase de PySpark que nos permite rellenar nulos de distintas maneras. A través de un objeto Imputer podemos especificar las columnas de entrada y la estrategia con la que se rellenan sus valores nulos

In [42]:
from pyspark.ml.feature import Imputer

# Mean-imputation stage: reads Sp_Atk / Sp_Def and writes the filled values
# to new columns named "<input>_imputed".
imputer = Imputer(
    inputCols=['Sp_Atk', 'Sp_Def'],
    outputCols=[f"{c}_imputed" for c in ['Sp_Atk', 'Sp_Def']],
    strategy='mean',    # same effect as chaining .setStrategy('mean')
)

Usamos la instancia Imputer que hemos creado para completar los valores nulos

A través del método fit() y transform(), podemos crear un dataframe con los valores nulos rellenados

In [43]:
# fit() learns the fill value for each input column; transform() appends the *_imputed columns to the frame
imputer.fit(df_null).transform(df_null).show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+-----------------+-----------------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|   Sp_Atk_imputed|   Sp_Def_imputed|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+-----------------+-----------------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|             65.0|             65.0|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|             80.0|             80.0|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|            100.0|            100.0|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|            122.0|            120.0|

In [44]:
# Same imputation stage as before, but filling with the median instead of the mean.
imputer_median = Imputer(
    inputCols=['Sp_Atk', 'Sp_Def'],
    outputCols=[f"{c}_imputed" for c in ['Sp_Atk', 'Sp_Def']],
    strategy='median',    # same effect as chaining .setStrategy('median')
)

In [45]:
# Fit the median imputer on df_null and show the frame with the *_imputed columns appended
imputer_median.fit(df_null).transform(df_null).show()

+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
|  #|                Name|Type_1|Type_2|Total| HP|Attack|Defense|Sp_Atk|Sp_Def|Speed|Generation|Legendary|Sp_Atk_imputed|Sp_Def_imputed|
+---+--------------------+------+------+-----+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|  65.0|  65.0|   45|         1|    false|          65.0|          65.0|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|  80.0|  80.0|   60|         1|    false|          80.0|          80.0|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83| 100.0| 100.0|   80|         1|    false|         100.0|         100.0|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123| 122.0| 120.0|   80|         1|    false|         122.0|         120.0|
|  4|          Charmander|  Fire|  null| 

# 4. **Operaciones de filtrado** 

In [46]:
# Reload the raw CSV: the first row is the header and column types are inferred from the data
df_pyspark = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('Pokemon.csv')
)
df_pyspark.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

#### Los filtrados se hacen con el método filter, incluyendo la condición en los parámetros

Pokemons con Attack < 78

In [47]:
# filter() accepts the condition as a SQL expression string
df_pyspark.filter('Attack<78').show()

+---+----------+--------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|      Name|  Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+----------+--------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1| Bulbasaur|   Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|   Ivysaur|   Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  4|Charmander|    Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|Charmeleon|    Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  7|  Squirtle|   Water|  null|  314| 44|    48|     65|     50|     64|   43|         1|    false|
|  8| Wartortle|   Water|  null|  405| 59|    63|     80|     65|     80|   58|         1|    false|
| 10|  Caterpie|     Bug|  null|  195| 45|    30|     35|     20|     20|   45|         1| 

Podemos completarlo con el método select

In [51]:
# Chain select() after the filter to keep only the Name and HP columns
df_pyspark.filter('Attack<78').select(['Name','HP']).show()

+----------+---+
|      Name| HP|
+----------+---+
| Bulbasaur| 45|
|   Ivysaur| 60|
|Charmander| 39|
|Charmeleon| 58|
|  Squirtle| 44|
| Wartortle| 59|
|  Caterpie| 45|
|   Metapod| 50|
|Butterfree| 60|
|    Weedle| 40|
|    Kakuna| 45|
|    Pidgey| 40|
| Pidgeotto| 63|
|   Rattata| 30|
|   Spearow| 40|
|     Ekans| 35|
|   Pikachu| 35|
| Sandshrew| 50|
|  Nidoran♀| 55|
|  Nidorina| 70|
+----------+---+
only showing top 20 rows



También podemos expresar la condición sobre la columna del dataframe, con una sintaxis similar a la de pandas

In [52]:
# Same filter written as a Column expression (pandas-like) instead of a SQL string
df_pyspark.filter(df_pyspark['Attack']<78).show()

+---+----------+--------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|      Name|  Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+----------+--------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1| Bulbasaur|   Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|   Ivysaur|   Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  4|Charmander|    Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|Charmeleon|    Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  7|  Squirtle|   Water|  null|  314| 44|    48|     65|     50|     64|   43|         1|    false|
|  8| Wartortle|   Water|  null|  405| 59|    63|     80|     65|     80|   58|         1|    false|
| 10|  Caterpie|     Bug|  null|  195| 45|    30|     35|     20|     20|   45|         1| 

#### Condiciones múltiples

In [55]:
# Combine conditions with & (and) or | (or); each condition must be wrapped in parentheses
df_pyspark.filter((df_pyspark['Attack'] < 78) &
                  (df_pyspark['Defense'] > 50)).show()

+---+--------------------+-------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name| Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+-------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  2|             Ivysaur|  Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  5|          Charmeleon|   Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  7|            Squirtle|  Water|  null|  314| 44|    48|     65|     50|     64|   43|         1|    false|
|  8|           Wartortle|  Water|  null|  405| 59|    63|     80|     65|     80|   58|         1|    false|
| 11|             Metapod|    Bug|  null|  205| 50|    20|     55|     25|     25|   30|         1|    false|
| 17|           Pidgeotto| Normal|Flying|  349| 63|    60|     55|     50|     50|   71|         1|    false|
| 27|     

# 5. **GroupBy y funciones agregadas** 

Para hacer un groupby se usa el método groupBy() que tienen los dataframes de Spark. Primero definimos la columna por la que se quiere agrupar, y luego indicamos la función de agregación

In [76]:
# Narrow the frame to the name, the primary type and four stat columns used from here on.
# NOTE: groupBy() itself does not require a null-free dataset; this selection simply leaves
# out the remaining columns so the grouped aggregates only cover these stats.
df_pyspark = df_pyspark.select('Name', 'Type 1', 'Total', 'HP', 'Attack', 'Defense')

In [80]:
# Average of each numeric column within every 'Type 1' group
df_pyspark.groupBy('Type 1').mean().show()

+--------+------------------+-----------------+------------------+------------------+
|  Type 1|        avg(Total)|          avg(HP)|       avg(Attack)|      avg(Defense)|
+--------+------------------+-----------------+------------------+------------------+
|   Water|430.45535714285717|          72.0625| 74.15178571428571| 72.94642857142857|
|  Poison|399.14285714285717|            67.25| 74.67857142857143| 68.82142857142857|
|   Steel| 487.7037037037037|65.22222222222223| 92.70370370370371|126.37037037037037|
|    Rock|            453.75|65.36363636363636| 92.86363636363636|100.79545454545455|
|     Ice| 433.4583333333333|             72.0|             72.75| 71.41666666666667|
|   Ghost|          439.5625|          64.4375|          73.78125|           81.1875|
|   Fairy| 413.1764705882353|74.11764705882354|61.529411764705884| 65.70588235294117|
| Psychic|475.94736842105266|70.63157894736842| 71.45614035087719|  67.6842105263158|
|  Dragon|         550.53125|          83.3125|       

In [81]:
# Maximum of each numeric column, grouped by Pokemon type
df_pyspark.groupBy('Type 1').max().show()

+--------+----------+-------+-----------+------------+
|  Type 1|max(Total)|max(HP)|max(Attack)|max(Defense)|
+--------+----------+-------+-----------+------------+
|   Water|       770|    170|        155|         180|
|  Poison|       535|    105|        106|         120|
|   Steel|       700|    100|        150|         230|
|    Rock|       700|    123|        165|         200|
|     Ice|       580|    110|        130|         184|
|   Ghost|       680|    150|        165|         145|
|   Fairy|       680|    126|        131|          95|
| Psychic|       780|    190|        190|         160|
|  Dragon|       780|    125|        180|         130|
|  Flying|       580|     85|        115|          80|
|     Bug|       600|     86|        185|         230|
|Electric|       610|     90|        123|         115|
|    Fire|       680|    115|        160|         140|
|  Ground|       770|    115|        180|         160|
|    Dark|       680|    126|        150|         125|
|Fighting|

Con el método agg, puedo definir la función de agregación que se aplica a cada columna

In [88]:
# Per-column aggregation via a {column: function} dict:
# max of Total, mean of HP and sum of Attack for each 'Type 1' group
df_pyspark.groupBy('Type 1').agg({'Total':'max','HP':'mean','Attack':'sum'}).show()

+--------+----------+-----------------+-----------+
|  Type 1|max(Total)|          avg(HP)|sum(Attack)|
+--------+----------+-----------------+-----------+
|   Water|       770|          72.0625|       8305|
|  Poison|       535|            67.25|       2091|
|   Steel|       700|65.22222222222223|       2503|
|    Rock|       700|65.36363636363636|       4086|
|     Ice|       580|             72.0|       1746|
|   Ghost|       680|          64.4375|       2361|
|   Fairy|       680|74.11764705882354|       1046|
| Psychic|       780|70.63157894736842|       4073|
|  Dragon|       780|          83.3125|       3588|
|  Flying|       580|            70.75|        315|
|     Bug|       600|56.88405797101449|       4897|
|Electric|       610|59.79545454545455|       3040|
|    Fire|       680|69.90384615384616|       4408|
|  Ground|       770|         73.78125|       3064|
|    Dark|       680|66.80645161290323|       2740|
|Fighting|       625|69.85185185185185|       2613|
|   Grass|  

# 6. **Operaciones de Machine Learning** 

### Regresión Lineal

La metodología para implementar regresión lineal en PySpark requiere que definamos las variables independientes. \
En este ejemplo vamos a tratar de predecir la variable 'Total' a partir de 'HP', 'Attack' y 'Defense'

In [91]:
# Preview the frame that feeds the regression example
df_pyspark.show()

+--------------------+------+-----+---+------+-------+
|                Name|Type 1|Total| HP|Attack|Defense|
+--------------------+------+-----+---+------+-------+
|           Bulbasaur| Grass|  318| 45|    49|     49|
|             Ivysaur| Grass|  405| 60|    62|     63|
|            Venusaur| Grass|  525| 80|    82|     83|
|VenusaurMega Venu...| Grass|  625| 80|   100|    123|
|          Charmander|  Fire|  309| 39|    52|     43|
|          Charmeleon|  Fire|  405| 58|    64|     58|
|           Charizard|  Fire|  534| 78|    84|     78|
|CharizardMega Cha...|  Fire|  634| 78|   130|    111|
|CharizardMega Cha...|  Fire|  634| 78|   104|     78|
|            Squirtle| Water|  314| 44|    48|     65|
|           Wartortle| Water|  405| 59|    63|     80|
|           Blastoise| Water|  530| 79|    83|    100|
|BlastoiseMega Bla...| Water|  630| 79|   103|    120|
|            Caterpie|   Bug|  195| 45|    30|     35|
|             Metapod|   Bug|  205| 50|    20|     55|
|         

In [93]:
# agrupamos todas las variables independientes
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the three predictor columns into a single vector column named 'Indep_feat'
feat_asem = VectorAssembler(inputCols=['HP', 'Attack', 'Defense'],
                            outputCol='Indep_feat')

In [94]:
# transform() returns the frame with the assembled 'Indep_feat' vector column appended
output = feat_asem.transform(df_pyspark)
output.show()

+--------------------+------+-----+---+------+-------+------------------+
|                Name|Type 1|Total| HP|Attack|Defense|        Indep_feat|
+--------------------+------+-----+---+------+-------+------------------+
|           Bulbasaur| Grass|  318| 45|    49|     49|  [45.0,49.0,49.0]|
|             Ivysaur| Grass|  405| 60|    62|     63|  [60.0,62.0,63.0]|
|            Venusaur| Grass|  525| 80|    82|     83|  [80.0,82.0,83.0]|
|VenusaurMega Venu...| Grass|  625| 80|   100|    123|[80.0,100.0,123.0]|
|          Charmander|  Fire|  309| 39|    52|     43|  [39.0,52.0,43.0]|
|          Charmeleon|  Fire|  405| 58|    64|     58|  [58.0,64.0,58.0]|
|           Charizard|  Fire|  534| 78|    84|     78|  [78.0,84.0,78.0]|
|CharizardMega Cha...|  Fire|  634| 78|   130|    111|[78.0,130.0,111.0]|
|CharizardMega Cha...|  Fire|  634| 78|   104|     78| [78.0,104.0,78.0]|
|            Squirtle| Water|  314| 44|    48|     65|  [44.0,48.0,65.0]|
|           Wartortle| Water|  405| 59

Se observa que crea una nueva columna de vectores, con las variables independientes que hemos definido

Nos quedamos únicamente con las columnas necesarias para entrenar al modelo

In [95]:
# Keep only the target ('Total') and the feature vector for training
data = output.select('Total','Indep_feat')
data.show()

+-----+------------------+
|Total|        Indep_feat|
+-----+------------------+
|  318|  [45.0,49.0,49.0]|
|  405|  [60.0,62.0,63.0]|
|  525|  [80.0,82.0,83.0]|
|  625|[80.0,100.0,123.0]|
|  309|  [39.0,52.0,43.0]|
|  405|  [58.0,64.0,58.0]|
|  534|  [78.0,84.0,78.0]|
|  634|[78.0,130.0,111.0]|
|  634| [78.0,104.0,78.0]|
|  314|  [44.0,48.0,65.0]|
|  405|  [59.0,63.0,80.0]|
|  530| [79.0,83.0,100.0]|
|  630|[79.0,103.0,120.0]|
|  195|  [45.0,30.0,35.0]|
|  205|  [50.0,20.0,55.0]|
|  395|  [60.0,45.0,50.0]|
|  195|  [40.0,35.0,30.0]|
|  205|  [45.0,25.0,50.0]|
|  395|  [65.0,90.0,40.0]|
|  495| [65.0,150.0,40.0]|
+-----+------------------+
only showing top 20 rows



Importamos el método de regresión lineal de PySpark

In [96]:
from pyspark.ml.regression import LinearRegression

In [97]:
# Split into training and test sets: roughly 80% of the rows for training and 20% for testing
# (randomSplit weights are approximate, not exact row counts).
# A fixed seed keeps the split, and therefore the fitted coefficients and the error
# metrics, stable between runs on the same input data.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

Se crea una instancia de LinearRegression

In [99]:
# Configure the estimator: featuresCol is the vector of independent variables, labelCol is the target column
reg = LinearRegression(featuresCol='Indep_feat', labelCol='Total')

Entrenamos el modelo

In [100]:
# fit() returns the trained model, which is rebound to `reg` (replacing the unfitted estimator).
# NOTE(review): because of this rebinding, re-running this cell on its own presumably fails
# (the fitted model is not an estimator); re-run the cell that creates `reg` first.
reg = reg.fit(train_data)

Podemos examinar los coeficientes del modelo

In [101]:
# Fitted weights, one per entry of the 'Indep_feat' vector (assembled from HP, Attack, Defense)
reg.coefficients

DenseVector([1.5895, 1.627, 1.2738])

In [105]:
# Fitted intercept term of the linear model
reg.intercept

101.53502717221144

Predicción

In [107]:
# Evaluate the fitted model on the held-out test split; the returned summary exposes
# the per-row predictions and the error metrics.
# NOTE(review): despite the name `y_hat_train`, this holds the evaluation on test_data.
y_hat_train = reg.evaluate(test_data)
y_hat_train.predictions.show()

+-----+----------------+------------------+
|Total|      Indep_feat|        prediction|
+-----+----------------+------------------+
|  195|[45.0,45.0,35.0]| 290.8615076606625|
|  200|[20.0,15.0,20.0]| 183.2066927861551|
|  205|[45.0,25.0,50.0]| 277.4293251117482|
|  215|[35.0,46.0,34.0]|275.31959153157436|
|  245|[40.0,55.0,30.0]|  292.814657615686|
|  250|[40.0,40.0,40.0]| 281.1482248426511|
|  250|[70.0,20.0,50.0]| 309.0320571011322|
|  260|[50.0,45.0,45.0]| 311.5474075935881|
|  263|[45.0,65.0,34.0]| 322.1274038566878|
|  266|[31.0,45.0,90.0]|338.66943304205734|
|  269|[40.0,30.0,32.0]|254.68766488746394|
|  273|[46.0,57.0,40.0]|318.34403744554845|
|  280|[55.0,40.0,40.0]| 304.9908239690472|
|  285|[45.0,30.0,50.0]| 285.5642583330243|
|  288|[48.0,48.0,48.0]|  317.070864376739|
|  295|[30.0,40.0,70.0]| 303.4682594307676|
|  300|[40.0,40.0,55.0]| 300.2557751788414|
|  300|[40.0,55.0,70.0]|343.76812517886015|
|  300|[40.0,55.0,80.0]|356.50649206965375|
|  300|[45.0,75.0,60.0]| 371.517

Podemos consultar más métricas de la evaluación sobre los datos de test

In [108]:
# Mean absolute error of the predictions on the test split
y_hat_train.meanAbsoluteError

43.11486038903257

In [110]:
# Mean squared error of the predictions on the test split
y_hat_train.meanSquaredError

1476.9755980985456

# 7. **Variables categóricas** 

In [111]:
from pyspark.ml.feature import StringIndexer

In [112]:
# Preview the frame before encoding the categorical column
df_pyspark.show()

+--------------------+------+-----+---+------+-------+
|                Name|Type 1|Total| HP|Attack|Defense|
+--------------------+------+-----+---+------+-------+
|           Bulbasaur| Grass|  318| 45|    49|     49|
|             Ivysaur| Grass|  405| 60|    62|     63|
|            Venusaur| Grass|  525| 80|    82|     83|
|VenusaurMega Venu...| Grass|  625| 80|   100|    123|
|          Charmander|  Fire|  309| 39|    52|     43|
|          Charmeleon|  Fire|  405| 58|    64|     58|
|           Charizard|  Fire|  534| 78|    84|     78|
|CharizardMega Cha...|  Fire|  634| 78|   130|    111|
|CharizardMega Cha...|  Fire|  634| 78|   104|     78|
|            Squirtle| Water|  314| 44|    48|     65|
|           Wartortle| Water|  405| 59|    63|     80|
|           Blastoise| Water|  530| 79|    83|    100|
|BlastoiseMega Bla...| Water|  630| 79|   103|    120|
|            Caterpie|   Bug|  195| 45|    30|     35|
|             Metapod|   Bug|  205| 50|    20|     55|
|         

Vamos a trasformar la variable categórica Type 1

In [113]:
# Indexer that encodes the string column 'Type 1' as a numeric index written to 'Type_index'
indexer = StringIndexer(inputCol='Type 1', outputCol='Type_index')

In [116]:
# Learn the label -> index mapping from the data, then append the encoded column
indexer_model = indexer.fit(df_pyspark)
df_index = indexer_model.transform(df_pyspark)
df_index.show()

+--------------------+------+-----+---+------+-------+----------+
|                Name|Type 1|Total| HP|Attack|Defense|Type_index|
+--------------------+------+-----+---+------+-------+----------+
|           Bulbasaur| Grass|  318| 45|    49|     49|       2.0|
|             Ivysaur| Grass|  405| 60|    62|     63|       2.0|
|            Venusaur| Grass|  525| 80|    82|     83|       2.0|
|VenusaurMega Venu...| Grass|  625| 80|   100|    123|       2.0|
|          Charmander|  Fire|  309| 39|    52|     43|       5.0|
|          Charmeleon|  Fire|  405| 58|    64|     58|       5.0|
|           Charizard|  Fire|  534| 78|    84|     78|       5.0|
|CharizardMega Cha...|  Fire|  634| 78|   130|    111|       5.0|
|CharizardMega Cha...|  Fire|  634| 78|   104|     78|       5.0|
|            Squirtle| Water|  314| 44|    48|     65|       0.0|
|           Wartortle| Water|  405| 59|    63|     80|       0.0|
|           Blastoise| Water|  530| 79|    83|    100|       0.0|
|Blastoise