In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.3/spark-3.2.3-bin-hadoop3.2.tgz
!tar xf spark-3.2.3-bin-hadoop3.2.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.3-bin-hadoop3.2"
!pip install -q findspark
!pip install -q pyspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

La diferencia con el dataframe de Pandas es que el de PySpark puede manejar gran volumen de datos y distribuir su procesamiento en muchas máquinas

### Crear dataframes

#### A partir de un rdd

In [None]:
# Crear dataframe desde RDD. Df son datos estructurados,
# entonces se puede proponer un esquema explícita o implícitamente
rdd = sc.parallelize([item for item in range(10)]).map(lambda x: (x, x**2))
rdd.collect()

[(0, 0),
 (1, 1),
 (2, 4),
 (3, 9),
 (4, 16),
 (5, 25),
 (6, 36),
 (7, 49),
 (8, 64),
 (9, 81)]

In [None]:
# Se le entrega el nombre de las columnas
df = rdd.toDF(['número', 'cuadrado'])

In [None]:
# Para ver el esquema
df.printSchema()

root
 |-- número: long (nullable = true)
 |-- cuadrado: long (nullable = true)



In [None]:
# Para ver los primeros registros (por edefecto son los 20 primeros)
df.show()

+------+--------+
|número|cuadrado|
+------+--------+
|     0|       0|
|     1|       1|
|     2|       4|
|     3|       9|
|     4|      16|
|     5|      25|
|     6|      36|
|     7|      49|
|     8|      64|
|     9|      81|
+------+--------+



In [None]:
# Crear un dataframe a partir de RDD con schema
rdd1 = sc.parallelize([(1,'José', 35.5), (2, 'Teresa', 54.3), (3, 'Katia',12.7)])

In [None]:
# Opción 1
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
schema1 = StructType(
    [
        # StructField(nombre_columna, tipo_dato, admite_null)
        StructField('id', IntegerType(), True),
        StructField('nombre', StringType(), True),
        StructField('saldo', DoubleType(), True)
    ]
)

In [None]:
# Opción 2
schema2 = "`id` INT, `nombre` STRING, `saldo` DOUBLE"

In [None]:
df1 = spark.createDataFrame(rdd1, schema = schema1)
df1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- saldo: double (nullable = true)



In [None]:
df2 = spark.createDataFrame(rdd1, schema = schema2)
df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- saldo: double (nullable = true)



In [None]:
# Crear un DataFrame a partir de un rango de números
spark.range(1,5).toDF('id').show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
+---+



#### A partir de lectura de archivos


*   spark.read.json(path)
*   spark.read.parquet(path)
*   spark.read.jdbc(path)
*   spark.read.orc(path)
*   spark.read.csv(path)
*   spark.read.text(path)



In [None]:
# Crear un DataFrame mediante la lectura de un archivo de texto
df = spark.read.text('./data/dataTXT.txt')
df.show()
df.show(truncate=False)

+--------------------+
|               value|
+--------------------+
|Estamos en el cur...|
|En este capítulo ...|
|En esta sección e...|
|y en este ejemplo...|
+--------------------+

+-----------------------------------------------------------------------+
|value                                                                  |
+-----------------------------------------------------------------------+
|Estamos en el curso de pyspark                                         |
|En este capítulo estamos estudiando el API SQL de Saprk                |
|En esta sección estamos creado dataframes a partir de fuentes de datos,|
|y en este ejemplo creamos un dataframe a partir de un texto plano      |
+-----------------------------------------------------------------------+



In [None]:
# Crear un DataFrame mediante la lectura de un archivo csv
df1 = spark.read.csv('./data/dataCSV.csv')
df1.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+--------------------+
|        _c0|          _c1|                 _c2|                 _c3|        _c4|                 _c5|                 _c6|    _c7|   _c8|     _c9|         _c10|                _c11|             _c12|            _c13|                _c14|                _c15|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|vid

In [None]:
df1 = spark.read.option('header', 'true').csv('./data/dataCSV.csv')
df1.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

In [None]:
# Leer un archivo de texto con un delimitador diferente
df2 = spark.read.option('header', 'true').option('delimiter', '|').csv('./data/dataTab.txt')
df2.show()

+----+----+----------+-----+
|pais|edad|     fecha|color|
+----+----+----------+-----+
|  MX|  23|2021-02-21| rojo|
|  CA|  56|2021-06-10| azul|
|  US|  32|2020-06-02|verde|
+----+----+----------+-----+



In [None]:
# Crear un DataFrame a partir de un json proporcionando un schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
json_schema =  StructType(
    [
     StructField('color', StringType(), True),
     StructField('edad', IntegerType(), True),
     StructField('fecha', DateType(), True),
     StructField('pais', StringType(), True)
    ]
)
df4 = spark.read.schema(json_schema).json('./data/dataJSON.json')
df4.show()

+-----+----+----------+----+
|color|edad|     fecha|pais|
+-----+----+----------+----+
| rojo|null|2021-02-21|  MX|
| azul|null|2021-06-10|  CA|
|verde|null|2020-06-02|  US|
+-----+----+----------+----+



In [None]:
df4.printSchema()

root
 |-- color: string (nullable = true)
 |-- edad: integer (nullable = true)
 |-- fecha: date (nullable = true)
 |-- pais: string (nullable = true)



In [None]:
# Crear un DataFrame a partir de un archivo parquet
df5 = spark.read.parquet('./data/dataPARQUET.parquet')
df5.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

In [None]:
# Otra alternativa para leer desde una fuente de datos parquet en este caso
df6 = spark.read.format('parquet').load('./data/dataPARQUET.parquet')
df6.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



### Trabajo columnas

##### SELECT y SELECT EXPR

In [None]:
df = spark.read.parquet('./data/dataPARQUET.parquet')
df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
# Primera alternativa para referirnos a las columnas
df.select('title').show()

+--------------------+
|               title|
+--------------------+
|WE WANT TO TALK A...|
|The Trump Preside...|
|Racist Superman |...|
|Nickelback Lyrics...|
|I Dare You: GOING...|
|2 Weeks with iPho...|
|Roy Moore & Jeff ...|
|5 Ice Cream Gadge...|
|The Greatest Show...|
|Why the rise of t...|
|Dion Lewis' 103-Y...|
|(SPOILERS) 'Shiva...|
|Marshmello - Bloc...|
|Which Countries A...|
|SHOPPING FOR NEW ...|
|    The New SpotMini|
|One Change That W...|
|How does your bod...|
|HomeMade Electric...|
|Founding An Inbre...|
+--------------------+
only showing top 20 rows



In [None]:
# Segunda alternativa
from pyspark.sql.functions import col
df.select(col('title')).show()

+--------------------+
|               title|
+--------------------+
|WE WANT TO TALK A...|
|The Trump Preside...|
|Racist Superman |...|
|Nickelback Lyrics...|
|I Dare You: GOING...|
|2 Weeks with iPho...|
|Roy Moore & Jeff ...|
|5 Ice Cream Gadge...|
|The Greatest Show...|
|Why the rise of t...|
|Dion Lewis' 103-Y...|
|(SPOILERS) 'Shiva...|
|Marshmello - Bloc...|
|Which Countries A...|
|SHOPPING FOR NEW ...|
|    The New SpotMini|
|One Change That W...|
|How does your bod...|
|HomeMade Electric...|
|Founding An Inbre...|
+--------------------+
only showing top 20 rows



In [None]:
df.select(col('video_id')).show()
df.select('video_id', 'trending_date').show()

+-----------+
|   video_id|
+-----------+
|2kyS6SvSYSE|
|1ZAPwfrtAFY|
|5qpjK5DgCt4|
|puqaWrEC7tY|
|d380meD0W0M|
|gHZ1Qz0KiKM|
|39idVpFF7NQ|
|nc99ccSXST0|
|jr9QtXwC9vc|
|TUmyygCMMGA|
|9wRQljFNDW8|
|VifQlJit6A0|
|5E4ZBSInqUU|
|GgVmn66oK_A|
|TaTleo4cOs8|
|kgaO45SyaO4|
|ZAQs-ctOqXQ|
|YVfyYrEmzgM|
|eNSN6qet1kE|
|B5HORANmzHw|
+-----------+
only showing top 20 rows

+-----------+-------------+
|   video_id|trending_date|
+-----------+-------------+
|2kyS6SvSYSE|     17.14.11|
|1ZAPwfrtAFY|     17.14.11|
|5qpjK5DgCt4|     17.14.11|
|puqaWrEC7tY|     17.14.11|
|d380meD0W0M|     17.14.11|
|gHZ1Qz0KiKM|     17.14.11|
|39idVpFF7NQ|     17.14.11|
|nc99ccSXST0|     17.14.11|
|jr9QtXwC9vc|     17.14.11|
|TUmyygCMMGA|     17.14.11|
|9wRQljFNDW8|     17.14.11|
|VifQlJit6A0|     17.14.11|
|5E4ZBSInqUU|     17.14.11|
|GgVmn66oK_A|     17.14.11|
|TaTleo4cOs8|     17.14.11|
|kgaO45SyaO4|     17.14.11|
|ZAQs-ctOqXQ|     17.14.11|
|YVfyYrEmzgM|     17.14.11|
|eNSN6qet1kE|     17.14.11|
|B5HORANmzHw|     17.1

In [None]:
# Esta vía nos dará error
df.select(
    'likes',
    'dislikes',
    ('likes' - 'dislikes')
).show()

In [None]:
# Forma correcta
df.select(
    col('likes'),
    col('dislikes'),
    (col('likes') - col('dislikes')).alias('aceptacion')
).show()

+------+--------+----------+
| likes|dislikes|aceptacion|
+------+--------+----------+
| 57527|    2966|   54561.0|
| 97185|    6146|   91039.0|
|146033|    5339|  140694.0|
| 10172|     666|    9506.0|
|132235|    1989|  130246.0|
|  9763|     511|    9252.0|
| 15993|    2445|   13548.0|
| 23663|     778|   22885.0|
|  3543|     119|    3424.0|
| 12654|    1363|   11291.0|
|   655|      25|     630.0|
|  1576|     303|    1273.0|
|114188|    1333|  112855.0|
|  7848|    1171|    6677.0|
|  7473|     246|    7227.0|
|  9419|      52|    9367.0|
|  8011|     638|    7373.0|
|  5398|      53|    5345.0|
| 11963|      36|   11927.0|
|  8421|     191|    8230.0|
+------+--------+----------+
only showing top 20 rows



In [None]:
# selectExpr
df.selectExpr('likes', 'dislikes', '(likes - dislikes) as aceptacion').show()
df.selectExpr("count(distinct(video_id)) as videos").show()


+------+--------+----------+
| likes|dislikes|aceptacion|
+------+--------+----------+
| 57527|    2966|   54561.0|
| 97185|    6146|   91039.0|
|146033|    5339|  140694.0|
| 10172|     666|    9506.0|
|132235|    1989|  130246.0|
|  9763|     511|    9252.0|
| 15993|    2445|   13548.0|
| 23663|     778|   22885.0|
|  3543|     119|    3424.0|
| 12654|    1363|   11291.0|
|   655|      25|     630.0|
|  1576|     303|    1273.0|
|114188|    1333|  112855.0|
|  7848|    1171|    6677.0|
|  7473|     246|    7227.0|
|  9419|      52|    9367.0|
|  8011|     638|    7373.0|
|  5398|      53|    5345.0|
| 11963|      36|   11927.0|
|  8421|     191|    8230.0|
+------+--------+----------+
only showing top 20 rows

+------+
|videos|
+------+
|  6837|
+------+



##### FILTER y WHERE

In [None]:
df.filter(col('video_id') == '2kyS6SvSYSE').show()

+-----------+-------------+--------------------+-------------+-----------+--------------------+---------------+-------+-----+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|channel_title|category_id|        publish_time|           tags|  views|likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+-------------+-----------+--------------------+---------------+-------+-----+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...| CaseyNeistat|         22|2017-11-13T17:13:...|SHANtell martin| 748374|57527|    2966|        15954|https://i.ytimg.c...|            False|           False|                 False|SHANTELL'S CHANNE...|
|2ky

In [None]:
df1 = spark.read.parquet('./data/datos.parquet').where(col('trending_date') != '17.14.11')
df1.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|            video_id|       trending_date|               title|       channel_title|         category_id|       publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|\nCook with confi...|             recipes|              videos| and restaurant g...| dining destinations|               null|                

In [None]:
df2 = spark.read.parquet('./data/datos.parquet').where(col('likes') > 5000)
df2.filter((col('trending_date') != '17.14.11') & (col('likes') > 7000)).show()

+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|       publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|YvfYK0EEhK4|     17.15.11|Brent Pella - Why...|         Brent Pella|         23|2017-11-14 15:32:51|"spirit airlines"...| 462490| 14132|     795|          666|https://i.ytimg.c...|            False|           False| 

In [None]:
df2.filter(col('trending_date') != '17.14.11').filter(col('likes') > 7000).show()

+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|       publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|YvfYK0EEhK4|     17.15.11|Brent Pella - Why...|         Brent Pella|         23|2017-11-14 15:32:51|"spirit airlines"...| 462490| 14132|     795|          666|https://i.ytimg.c...|            False|           False| 

##### DISTINCT y DROPDUPLICATES

In [None]:
df = spark.read.format('parquet').load('./data/dataPARQUET.parquet')
df.count()

48137

In [None]:
# distinct
df_sin_duplicados = df.distinct()

In [None]:
print('El conteo del dataframe original es {}'.format(df.count()))

El conteo del dataframe original es 48137


In [None]:
print('El conteo del dataframe sin duplicados es {}'.format(df_sin_duplicados.count()))

El conteo del dataframe sin duplicados es 41497


In [None]:
# función dropDuplicates
dataframe = spark.createDataFrame([(1, 'azul', 567), (2, 'rojo', 487), (1, 'azul', 345), (2, 'verde', 783)]).toDF('id', 'color', 'importe')
dataframe.show()

+---+-----+-------+
| id|color|importe|
+---+-----+-------+
|  1| azul|    567|
|  2| rojo|    487|
|  1| azul|    345|
|  2|verde|    783|
+---+-----+-------+



In [None]:
dataframe.dropDuplicates(['id', 'color']).show()
# dropDuplicates sin parametros = distinct

+---+-----+-------+
| id|color|importe|
+---+-----+-------+
|  1| azul|    567|
|  2| rojo|    487|
|  2|verde|    783|
+---+-----+-------+



##### SORT y ORDERBY

In [None]:
from pyspark.sql.functions import col
df = df.select(col('likes'), col('views'), col('video_id'), col('dislikes')).dropDuplicates(['video_id'])

In [None]:
# sort
df.sort('likes').show() # Por defecto menor a mayor
from pyspark.sql.functions import desc
df.sort(desc('likes')).show()

+-----+-----+--------------------+--------+
|likes|views|            video_id|dislikes|
+-----+-----+--------------------+--------+
| null| null|Awesome Games Pla...|    null|
| null| null|Filmed by Lucas F...|    null|
| null| null|    Beautiful Thing:|    null|
| null| null|Bon Appétit Test ...|    null|
| null| null|Filmed at the Bee...|    null|
| null| null|Britton Lane: htt...|    null|
| null| null|Allie Sherlock: h...|    null|
| null| null|Browse thousands ...|    null|
| null| null|   ABOUT BON APPÉTIT|    null|
| null| null|Catch Terry Crews...|    null|
| null| null|    ABOUT EPICURIOUS|    null|
| null| null|Check Out My WEBS...|    null|
| null| null|    ABOUT TEEN VOGUE|    null|
| null| null|Check out the Dam...|    null|
| null| null|         ABOUT VOGUE|    null|
| null| null|          City Song:|    null|
| null| null|Filmed at the Wal...|    null|
| null| null|            Clearly:|    null|
| null| null|Black Panther is ...|    null|
| null| null|Cook with confide..

El resultado de orderBy y sort es lo mismo, solo que orderBy es más relacional.

In [None]:
# función orderBy
df.orderBy(col('views')).show()
df.orderBy(col('views').desc()).show()

+-----+-----+--------------------+--------+
|likes|views|            video_id|dislikes|
+-----+-----+--------------------+--------+
| null| null|Catch Terry Crews...|    null|
| null| null|ABOUT ARCHITECTUR...|    null|
| null| null|Check Out My WEBS...|    null|
| null| null|    ABOUT EPICURIOUS|    null|
| null| null|Check out the Dam...|    null|
| null| null|         ABOUT WIRED|    null|
| null| null|          City Song:|    null|
| null| null|   ABOUT BON APPÉTIT|    null|
| null| null|            Clearly:|    null|
| null| null|Browse thousands ...|    null|
| null| null|Cook with confide...|    null|
| null| null|   ABOUT VANITY FAIR|    null|
| null| null|Directed by Lucas...|    null|
| null| null|Britton Lane: htt...|    null|
| null| null|Arts and entertai...|    null|
| null| null|Awesome Games Pla...|    null|
| null| null|    Beautiful Thing:|    null|
| null| null|  Expert travel tips|    null|
| null| null|Black Panther is ...|    null|
| null| null|             Fashio

In [None]:
# funcion limit 
top_10 = df.orderBy(col('views').desc()).limit(10)
top_10.show()

+------+-------+-----------+--------+
| likes|  views|   video_id|dislikes|
+------+-------+-----------+--------+
|126363| 999910|gw82GrEt370|    1034|
| 78088| 999867|cyhU06cXfeU|     690|
| 58552| 998908|QIN5_tJRiyY|    1080|
|151348|9988608|fAIX12F6958|   10274|
| 70972| 998362|LC3fWTXZXxE|    1608|
|  4727|  99796|kOsl3cmK3zg|     152|
|   120|   9977|1L_fPteZOYQ|      11|
|   299|  99674|Yzx_tSlifIw|      95|
|119634| 996318|__1SjDrSMik|    1143|
|  3959|  99619|9ymjcSvEyhk|     158|
+------+-------+-----------+--------+



##### WITHCOLUMN y WITHCOLUMNRENAMED

In [None]:
from pyspark.sql.functions import col
df = spark.read.parquet('./dataPARQUET.parquet')

In [None]:
# withColumn: Agrega columna al df
df_valoracion = df.withColumn('valoracion', col('likes') - col('dislikes'))
df_valoracion.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)
 |-- valoracion: double (nullable = true)



In [None]:
# Agregar varias columnas a la vez
df_valoracion1 = (df.withColumn('valoracion', col('likes') - col('dislikes'))
                    .withColumn('res_div', col('valoracion') % 10)
)
df_valoracion1.printSchema()
df_valoracion1.select(col('likes'), col('dislikes'), col('valoracion'), col('res_div')).show()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)
 |-- valoracion: double (nullable = true)
 |-- res_div: double (nullable = true)

+------+--------+----------+-------+
| likes|dislikes|valoracion|res_div|
+------+--------+----------+-------+
| 57527|    2966|   54561.0|    1.0|
| 97185|    6146|   91039.0|    9.0|
|146033|    5339|  140694.0|    4.0|
| 

In [None]:
# withColumnRenamed: cambia nombre de una columna existente
df_renombrado = df.withColumnRenamed('video_id', 'id')
df_renombrado.printSchema()

root
 |-- id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
df_error = df.withColumnRenamed('nombre_que_no_existe', 'otro_nombre')
df_error.printSchema()


##### DROP, SAMPLE y RANDOMSPLIT

In [None]:
# drop
df.printSchema()
df_util = df.drop('comments_disabled')
df_util.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views

In [None]:
# Borrar varias
df_util = df.drop('comments_disabled', 'ratings_disabled', 'thumbnail_link')
df_util.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
# No arroja error si el nombre no existe
df_util = df.drop('comments_disabled', 'ratings_disabled', 'thumbnail_link', 'cafe')
df_util.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
# sample: selecciona aleatoriamente la fracción de filas indicada
df_muestra = df.sample(0.8)
num_filas = df.count()
num_filas_muestra = df_muestra.count()
print('El 80% de filas del dataframe original es {}'.format(num_filas - (num_filas*0.2)))
print('El numero de filas del dataframe muestra es {}'.format(num_filas_muestra))

El 80% de filas del dataframe original es 38509.6
El numero de filas del dataframe muestra es 38340


In [None]:
df_muestra = df.sample(fraction=0.8, seed=1234)
df_muestra = df.sample(withReplacement=True, fraction=0.8, seed=1234)

In [None]:
# randomSplit: Se usa para entrenar modelos de ML. Devuelve uno o más df, según la cantidad de pesos que se especifiquen. 
# Si los pesos no suman 1, estos se normalizarán para sumar 1.

train, test = df.randomSplit([0.8, 0.2], seed=1234)
train, validation, test = df.randomSplit([0.6, 0.2, 0.2], seed=1234)
train.count()

28808

In [None]:
validation.count()

9698

In [None]:
test.count()

9631

### Trabajo con datos incorrectos o faltantes

In [None]:
# Eliminar filas con al menos un NA
df.count()
df.na.drop().count()
df.na.drop('any').count()
df.dropna().count()

40390

In [None]:
# Eliminar filas con NA en las columnas indicadas
df.na.drop(subset=['views']).count()
df.na.drop(subset=['views', 'dislikes']).count()

41035

In [None]:
from pyspark.sql.functions import col
df.orderBy(col('views')).select(col('views'), col('likes'), col('dislikes')).show()

+-----+-----+--------+
|views|likes|dislikes|
+-----+-----+--------+
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
+-----+-----+--------+
only showing top 20 rows



In [None]:
df.fillna(0).orderBy(col('views')).select(col('views'), col('likes'), col('dislikes')).show()

+-----+-----+--------+
|views|likes|dislikes|
+-----+-----+--------+
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
+-----+-----+--------+
only showing top 20 rows



In [None]:
df.fillna(0, subset=['likes', 'dislikes']).orderBy(col('views')).select(col('views'), col('likes'), col('dislikes')).show()

+-----+-----+--------+
|views|likes|dislikes|
+-----+-----+--------+
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
+-----+-----+--------+
only showing top 20 rows



### Escribir df

In [None]:
df1 = df.repartition(2) # 2 pariticiones
df1.write.format('csv').option('sep', '|').save('./output/csv')

In [None]:
df1.coalesce(1).write.format('csv').option('sep', '|').save('./output/csv2')

In [None]:
df.printSchema()
df.select('comments_disabled').distinct().show()
from pyspark.sql.functions import col
df_limpio = df.filter(col('comments_disabled').isin('True', 'False')) # Dejo solo los valores true o false
df_limpio.write.partitionBy('comments_disabled').parquet('./output/parquet')


root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)

+-----------------+
|comments_disabled|
+-----------------+
|            False|
|             null|
| sports and more.|
|          Wiz Kid|
|             True|
|         farfalle|
+-----------------+

