<a href="https://colab.research.google.com/github/gabrielfernandorey/EDVAI/blob/main/PySpark/PySpark_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark

### Instalación y carga de Pyspark

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=07a978b557063632b0b17d5c5bebb85a4cc29d003a3f2c0eac784803796da279
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('test_pyspark').getOrCreate()

### Librerías necesarias

In [4]:
from pyspark.sql.types import StringType, BooleanType, FloatType, IntegerType, DoubleType, DateType
import pyspark.sql.functions as F
from pyspark.sql.functions import sum, col, desc, asc, count, countDistinct, round, max, min, avg
from pyspark.sql.functions import to_timestamp,date_format
from pyspark.sql.window import Window

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, Param, Params, TypeConverters
from pyspark import keyword_only
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml import Model
from pyspark.ml import Estimator

from datetime import datetime
import numpy as np

### Importamos datos

In [5]:
!wget https://data-engineer-edvai.s3.amazonaws.com/f1/results.csv

--2023-05-03 21:38:02--  https://data-engineer-edvai.s3.amazonaws.com/f1/results.csv
Resolving data-engineer-edvai.s3.amazonaws.com (data-engineer-edvai.s3.amazonaws.com)... 52.216.1.240, 52.216.177.147, 3.5.29.167, ...
Connecting to data-engineer-edvai.s3.amazonaws.com (data-engineer-edvai.s3.amazonaws.com)|52.216.1.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1633624 (1.6M) [text/csv]
Saving to: ‘results.csv’


2023-05-03 21:38:03 (103 MB/s) - ‘results.csv’ saved [1633624/1633624]



In [6]:
!wget https://data-engineer-edvai.s3.amazonaws.com/f1/drivers.csv

--2023-05-03 21:38:15--  https://data-engineer-edvai.s3.amazonaws.com/f1/drivers.csv
Resolving data-engineer-edvai.s3.amazonaws.com (data-engineer-edvai.s3.amazonaws.com)... 52.216.217.201, 52.216.51.193, 52.217.123.17, ...
Connecting to data-engineer-edvai.s3.amazonaws.com (data-engineer-edvai.s3.amazonaws.com)|52.216.217.201|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93568 (91K) [text/csv]
Saving to: ‘drivers.csv’


2023-05-03 21:38:15 (17.6 MB/s) - ‘drivers.csv’ saved [93568/93568]



In [7]:
!wget https://data-engineer-edvai.s3.amazonaws.com/f1/constructors.csv

--2023-05-03 21:38:31--  https://data-engineer-edvai.s3.amazonaws.com/f1/constructors.csv
Resolving data-engineer-edvai.s3.amazonaws.com (data-engineer-edvai.s3.amazonaws.com)... 52.217.232.81, 52.217.16.68, 3.5.8.173, ...
Connecting to data-engineer-edvai.s3.amazonaws.com (data-engineer-edvai.s3.amazonaws.com)|52.217.232.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17384 (17K) [text/csv]
Saving to: ‘constructors.csv’


2023-05-03 21:38:31 (35.2 MB/s) - ‘constructors.csv’ saved [17384/17384]



In [None]:
# Con esta alternativa algunos tipos de datos no se importan de forma adecuada
# df = spark.read.option("header","true").csv("*.csv")

In [8]:
# Con esta alternativa los tipos de datos se importan de forma adecuada (siempre verificar!)
df_results = spark.read.option("header","true").option("inferSchema", "true") .csv("results.csv")

In [9]:
df_drivers = spark.read.option("header","true").option("inferSchema", "true") .csv("drivers.csv")

In [10]:
df_constructors = spark.read.option("header","true").option("inferSchema", "true") .csv("constructors.csv")

### Visualizamos schema

In [11]:
df_results.printSchema()

root
 |-- resultId: integer (nullable = true)
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- constructorId: integer (nullable = true)
 |-- number: string (nullable = true)
 |-- grid: integer (nullable = true)
 |-- position: string (nullable = true)
 |-- positionText: string (nullable = true)
 |-- positionOrder: integer (nullable = true)
 |-- points: double (nullable = true)
 |-- laps: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- milliseconds: string (nullable = true)
 |-- fastestLap: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- fastestLapTime: string (nullable = true)
 |-- fastestLapSpeed: string (nullable = true)
 |-- statusId: integer (nullable = true)



In [14]:
df_results.show(5)

+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|resultId|raceId|driverId|constructorId|number|grid|position|positionText|positionOrder|points|laps|       time|milliseconds|fastestLap|rank|fastestLapTime|fastestLapSpeed|statusId|
+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|       1|    18|       1|            1|    22|   1|       1|           1|            1|  10.0|  58|1:34:50.616|     5690616|        39|   2|      1:27.452|        218.300|       1|
|       2|    18|       2|            2|     3|   5|       2|           2|            2|   8.0|  58|     +5.478|     5696094|        41|   3|      1:27.739|        217.586|       1|
|       3|    18|       3|            3|     7|   7|       3|           3|            3|  

In [12]:
df_drivers.printSchema()

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: string (nullable = true)
 |-- code: string (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [15]:
df_drivers.show(5)

+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
|driverId| driverRef|number|code|forename|   surname|       dob|nationality|                 url|
+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   Lewis|  Hamilton|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld|    \N| HEI|    Nick|  Heidfeld|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|    Nico|   Rosberg|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|Fernando|    Alonso|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen|    \N| KOV|  Heikki|Kovalainen|1981-10-19|    Finnish|http://en.wikiped...|
+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
only showing top 5 rows



In [13]:
df_constructors.printSchema()

root
 |-- constructorId: integer (nullable = true)
 |-- constructorRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [17]:
df_constructors.show(5)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|                 url|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|  Williams|    British|http://en.wikiped...|
|            4|       renault|   Renault|     French|http://en.wikiped...|
|            5|    toro_rosso|Toro Rosso|    Italian|http://en.wikiped...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 5 rows



### insertar en la tabla driver_results los corredores con mayor cantidad de puntos en la historia.

In [18]:
# Creamos vista
df_results.createOrReplaceTempView("vista_results")

In [None]:
df_a = spark.sql("select VendorId, cast(tpep_pickup_datetime as date), cast(payment_type as int), total_amount from yellow_tripdata where payment_type = 1")

In [None]:
df_5.show(10)

+--------+--------------------+------------+------------+
|VendorId|tpep_pickup_datetime|payment_type|total_amount|
+--------+--------------------+------------+------------+
|       1|          2021-01-01|           1|       51.95|
|       1|          2021-01-01|           1|       36.35|
|       2|          2021-01-01|           1|       24.36|
|       1|          2021-01-01|           1|       14.15|
|       1|          2021-01-01|           1|       18.95|
|       2|          2021-01-01|           1|        24.3|
|       2|          2021-01-01|           1|       10.79|
|       2|          2021-01-01|           1|       14.16|
|       2|          2021-01-01|           1|        10.3|
|       2|          2021-01-01|           1|       12.09|
+--------+--------------------+------------+------------+
only showing top 10 rows

