# Repaso de comandos básicos

In [1]:
import findspark # findspark makes the local Spark installation importable as pyspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession

# Crear SparkSession

In [5]:
spark = SparkSession.builder.getOrCreate() # create the SparkSession with no extra builder options

In [6]:
spark

# Creación de la aplicación

In [5]:
# Creamos app
spark = SparkSession.builder.appName('firstSession')\
    .config('spark.master','local[4]')\
    .config('spark.shuffle.sql.partitions',1)\
    .getOrCreate()

# master porque estamos en el portatil, es decir, crear sparksession en modo cliente

In [6]:
spark

In [11]:
# para saber la configuracion de spark
spark.conf.get('spark.shuffle.sql.partitions')

'2'

In [10]:
spark.conf.set('spark.shuffle.sql.partitions',2)

# Crear tabla

In [13]:
# Sample rows as (id, nombre, x) tuples, followed by the matching column names.
lista = [(1, 'bui', 2), (2, 'paco', 3)]
cols = [
    'id',
    'nombre',
    'x',
]

In [14]:
lista

[(1, 'bui', 2), (2, 'paco', 3)]

In [15]:
cols

['id', 'nombre', 'x']

In [17]:
# Build a DataFrame from the in-memory rows; only column names are supplied,
# so the column types are inferred from the data.
df1 = spark.createDataFrame(data=lista, schema=cols)

In [20]:
df1.count()

2

In [21]:
df1.show()

+---+------+---+
| id|nombre|  x|
+---+------+---+
|  1|   bui|  2|
|  2|  paco|  3|
+---+------+---+



In [22]:
df1.columns

['id', 'nombre', 'x']

In [24]:
df1.printSchema() # types were inferred from the data (the integer columns come out as long)

root
 |-- id: long (nullable = true)
 |-- nombre: string (nullable = true)
 |-- x: long (nullable = true)



In [25]:
df1.describe().show()

+-------+------------------+------+------------------+
|summary|                id|nombre|                 x|
+-------+------------------+------+------------------+
|  count|                 2|     2|                 2|
|   mean|               1.5|  null|               2.5|
| stddev|0.7071067811865476|  null|0.7071067811865476|
|    min|                 1|   bui|                 2|
|    max|                 2|  paco|                 3|
+-------+------------------+------+------------------+



In [26]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [30]:
# Explicit schema: integer id, string name and integer value, all nullable.
# Note that the third column is named 'y' here.
schema1 = StructType([
    StructField(name='id', dataType=IntegerType(), nullable=True),
    StructField(name='nombre', dataType=StringType(), nullable=True),
    StructField(name='y', dataType=IntegerType(), nullable=True),
])

In [31]:
# Same rows as df1 but with the explicit schema, so id and y are integer instead of the inferred long.
df2 = spark.createDataFrame(lista,schema=schema1)

In [32]:
df2.show()

+---+------+---+
| id|nombre|  y|
+---+------+---+
|  1|   bui|  2|
|  2|  paco|  3|
+---+------+---+



In [33]:
df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- y: integer (nullable = true)



# A partir de un csv

In [34]:
# Read the CSV using its first line as the column names.
# No schema is given and none is inferred, so every column is loaded as string.
df = spark.read.csv('formula_one.csv',header=True) # sep=',' is the default separator

In [35]:
df.count()

25140

In [36]:
df.show()

+--------+----------+----+---------+----------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+
|driverId| driverRef|code| forename|   surname|       dob|nationality|               url_x|resultId|raceId|points|position| wins|year|round|circuitId|                name|      date|     time_x|               url_y|
+--------+----------+----+---------+----------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+
|       1|  hamilton| HAM|    Lewis|  Hamilton|1985-01-07|    British|http://en.wikiped...|       1|    18|  10.0|       1| True|2008|    1|        1|Australian Grand ...|2008-03-16|1:34:50.616|http://en.wikiped...|
|       2|  heidfeld| HEI|     Nick|  Heidfeld|1977-05-10|     German|http://en.wikiped...|       2|    18|   8.0|       2|False|2008|  

In [37]:
df.printSchema()

root
 |-- driverId: string (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- code: string (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url_x: string (nullable = true)
 |-- resultId: string (nullable = true)
 |-- raceId: string (nullable = true)
 |-- points: string (nullable = true)
 |-- position: string (nullable = true)
 |-- wins: string (nullable = true)
 |-- year: string (nullable = true)
 |-- round: string (nullable = true)
 |-- circuitId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time_x: string (nullable = true)
 |-- url_y: string (nullable = true)



# Creación de Parquet

In [39]:
# save the DataFrame in Parquet format
df.write.parquet('parquet_example',mode='overwrite') # 'parquet_example' is the output folder; existing contents are overwritten

In [40]:
# read the Parquet folder back into a DataFrame
dfp = spark.read.parquet('parquet_example')

In [41]:
dfp.show(5)

+--------+----------+----+--------+----------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+
|driverId| driverRef|code|forename|   surname|       dob|nationality|               url_x|resultId|raceId|points|position| wins|year|round|circuitId|                name|      date|     time_x|               url_y|
+--------+----------+----+--------+----------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+
|       1|  hamilton| HAM|   Lewis|  Hamilton|1985-01-07|    British|http://en.wikiped...|       1|    18|  10.0|       1| True|2008|    1|        1|Australian Grand ...|2008-03-16|1:34:50.616|http://en.wikiped...|
|       2|  heidfeld| HEI|    Nick|  Heidfeld|1977-05-10|     German|http://en.wikiped...|       2|    18|   8.0|       2|False|2008|    1| 

In [42]:
dfp.printSchema()

root
 |-- driverId: string (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- code: string (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url_x: string (nullable = true)
 |-- resultId: string (nullable = true)
 |-- raceId: string (nullable = true)
 |-- points: string (nullable = true)
 |-- position: string (nullable = true)
 |-- wins: string (nullable = true)
 |-- year: string (nullable = true)
 |-- round: string (nullable = true)
 |-- circuitId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time_x: string (nullable = true)
 |-- url_y: string (nullable = true)



In [43]:
dfp.describe().show()

+-------+------------------+---------+-----+--------+---------+----------+-----------+--------------------+------------------+-----------------+------------------+-----------------+-----+------------------+-----------------+------------------+--------------------+----------+-----------------+--------------------+
|summary|          driverId|driverRef| code|forename|  surname|       dob|nationality|               url_x|          resultId|           raceId|            points|         position| wins|              year|            round|         circuitId|                name|      date|           time_x|               url_y|
+-------+------------------+---------+-----+--------+---------+----------+-----------+--------------------+------------------+-----------------+------------------+-----------------+-----+------------------+-----------------+------------------+--------------------+----------+-----------------+--------------------+
|  count|             25140|    25140|25140|   25140|  

In [44]:
df_pandas = dfp.toPandas() # loads every row into the driver's memory: not meant for large data volumes

In [45]:
df_pandas.head()

Unnamed: 0,driverId,driverRef,code,forename,surname,dob,nationality,url_x,resultId,raceId,points,position,wins,year,round,circuitId,name,date,time_x,url_y
0,1,hamilton,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,1,18,10.0,1,True,2008,1,1,Australian Grand Prix,2008-03-16,1:34:50.616,http://en.wikipedia.org/wiki/2008_Australian_G...
1,2,heidfeld,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld,2,18,8.0,2,False,2008,1,1,Australian Grand Prix,2008-03-16,+5.478,http://en.wikipedia.org/wiki/2008_Australian_G...
2,3,rosberg,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg,3,18,6.0,3,False,2008,1,1,Australian Grand Prix,2008-03-16,+8.163,http://en.wikipedia.org/wiki/2008_Australian_G...
3,4,alonso,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,4,18,5.0,4,False,2008,1,1,Australian Grand Prix,2008-03-16,+17.181,http://en.wikipedia.org/wiki/2008_Australian_G...
4,5,kovalainen,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen,5,18,4.0,5,False,2008,1,1,Australian Grand Prix,2008-03-16,+18.014,http://en.wikipedia.org/wiki/2008_Australian_G...


In [46]:
type(df_pandas)

pandas.core.frame.DataFrame

In [47]:
spark.stop()

# Más cosas

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a session for the column-transformation examples (the previous session was stopped).
spark = SparkSession.builder.appName('columnTransform').getOrCreate()

In [7]:
# Reload the 'parquet_example' folder; it must already exist on disk.
df = spark.read.parquet('parquet_example')

In [8]:
df.show(2)

+--------+---------+----+--------+--------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+
|driverId|driverRef|code|forename| surname|       dob|nationality|               url_x|resultId|raceId|points|position| wins|year|round|circuitId|                name|      date|     time_x|               url_y|
+--------+---------+----+--------+--------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+
|       1| hamilton| HAM|   Lewis|Hamilton|1985-01-07|    British|http://en.wikiped...|       1|    18|  10.0|       1| True|2008|    1|        1|Australian Grand ...|2008-03-16|1:34:50.616|http://en.wikiped...|
|       2| heidfeld| HEI|    Nick|Heidfeld|1977-05-10|     German|http://en.wikiped...|       2|    18|   8.0|       2|False|2008|    1|        1|Austra

In [9]:
df.printSchema()

root
 |-- driverId: string (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- code: string (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url_x: string (nullable = true)
 |-- resultId: string (nullable = true)
 |-- raceId: string (nullable = true)
 |-- points: string (nullable = true)
 |-- position: string (nullable = true)
 |-- wins: string (nullable = true)
 |-- year: string (nullable = true)
 |-- round: string (nullable = true)
 |-- circuitId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time_x: string (nullable = true)
 |-- url_y: string (nullable = true)



In [16]:
# Columns to keep; the bare name on the last line echoes the list as the cell output.
sel_col = [
    'driverId',
    'forename',
    'surname',
]
sel_col

['driverId', 'forename', 'surname']

In [17]:
# Project the DataFrame down to the chosen columns.
df_sel = df.select(*sel_col)

In [18]:
df_sel.show(5)

+--------+--------+----------+
|driverId|forename|   surname|
+--------+--------+----------+
|       1|   Lewis|  Hamilton|
|       2|    Nick|  Heidfeld|
|       3|    Nico|   Rosberg|
|       4|Fernando|    Alonso|
|       5|  Heikki|Kovalainen|
+--------+--------+----------+
only showing top 5 rows



In [26]:
# rename a column: driverId -> id
df_sel = df_sel.withColumnRenamed("driverId","id")

In [27]:
df_sel.show()

+---+---------+----------+
| id| forename|   surname|
+---+---------+----------+
|  1|    Lewis|  Hamilton|
|  2|     Nick|  Heidfeld|
|  3|     Nico|   Rosberg|
|  4| Fernando|    Alonso|
|  5|   Heikki|Kovalainen|
|  6|   Kazuki|  Nakajima|
|  7|Sébastien|  Bourdais|
|  8|     Kimi| Räikkönen|
|  9|   Robert|    Kubica|
| 10|     Timo|     Glock|
| 11|   Takuma|      Sato|
| 12|   Nelson|Piquet Jr.|
| 13|   Felipe|     Massa|
| 14|    David| Coulthard|
| 15|    Jarno|    Trulli|
| 16|   Adrian|     Sutil|
| 17|     Mark|    Webber|
| 18|   Jenson|    Button|
| 19|  Anthony|  Davidson|
| 20|Sebastian|    Vettel|
+---+---------+----------+
only showing top 20 rows



In [29]:
# Sort the rows by surname (ascending), keep two columns and show the first 5.
(
    df_sel
    .sort('surname')
    .select("surname", "forename")
    .show(5)
)

+---------+--------+
|  surname|forename|
+---------+--------+
|    Abate|   Carlo|
|    Abate|   Carlo|
|Abecassis|  George|
|Abecassis|  George|
|  Acheson|   Kenny|
+---------+--------+
only showing top 5 rows



In [30]:
from pyspark.sql import functions as F

In [31]:
# Same listing, with the rows sorted by surname in descending order.
(
    df_sel
    .sort(F.col('surname').desc())
    .select("surname", "forename")
    .show(5)
)

+---------+--------+
|  surname|forename|
+---------+--------+
|Étancelin|Philippe|
|Étancelin|Philippe|
|Étancelin|Philippe|
|Étancelin|Philippe|
|Étancelin|Philippe|
+---------+--------+
only showing top 5 rows



In [32]:
# cambiar tipo de columnas
from pyspark.sql.types import DoubleType, IntegerType, StringType

In [33]:
# Cast id from string to double; withColumn with an existing name replaces that column.
df_sel = df_sel.withColumn("id",F.col("id").cast(DoubleType()))

In [35]:
df_sel.printSchema()

root
 |-- id: double (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)



In [36]:
# Filter: keep only the rows whose id is greater than 100.
df_filter = df_sel.where(F.col("id") > 100)

In [39]:
df_filter.count()

16169

In [40]:
df_sel.count()

25140

In [45]:
# Rows for a single driver: the forename and the surname must both match.
df_filter2 = df_sel.filter(
    (F.col("forename") == "Fernando")
    & (F.col("surname") == "Alonso")
)

In [46]:
df_filter2.show()

+---+--------+-------+
| id|forename|surname|
+---+--------+-------+
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
|4.0|Fernando| Alonso|
+---+--------+-------+
only showing top 20 rows



In [51]:
# Derive an integer 'year2' column from the 'date' string (e.g. "2008-03-16"):
# split on the hyphen and cast the first piece to integer.
# The split pattern is a regex; a plain "-" matches the hyphen literally, which
# avoids the invalid Python escape sequence '\-' (deprecated; warns on recent Pythons).
df_year = df.withColumn('year2', F.split("date", "-")[0].cast(IntegerType()))

In [52]:
df_year.show(2)

+--------+---------+----+--------+--------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+-----+
|driverId|driverRef|code|forename| surname|       dob|nationality|               url_x|resultId|raceId|points|position| wins|year|round|circuitId|                name|      date|     time_x|               url_y|year2|
+--------+---------+----+--------+--------+----------+-----------+--------------------+--------+------+------+--------+-----+----+-----+---------+--------------------+----------+-----------+--------------------+-----+
|       1| hamilton| HAM|   Lewis|Hamilton|1985-01-07|    British|http://en.wikiped...|       1|    18|  10.0|       1| True|2008|    1|        1|Australian Grand ...|2008-03-16|1:34:50.616|http://en.wikiped...| 2008|
|       2| heidfeld| HEI|    Nick|Heidfeld|1977-05-10|     German|http://en.wikiped...|       2|    18|   8.0|       2|False|200

In [53]:
df_year.printSchema()

root
 |-- driverId: string (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- code: string (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url_x: string (nullable = true)
 |-- resultId: string (nullable = true)
 |-- raceId: string (nullable = true)
 |-- points: string (nullable = true)
 |-- position: string (nullable = true)
 |-- wins: string (nullable = true)
 |-- year: string (nullable = true)
 |-- round: string (nullable = true)
 |-- circuitId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time_x: string (nullable = true)
 |-- url_y: string (nullable = true)
 |-- year2: integer (nullable = true)



In [54]:
spark.stop()