## CRIANDO CONEXÃO

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("AulaSpark").enableHiveSupport().getOrCreate()

In [6]:
type(spark)

pyspark.sql.session.SparkSession

In [7]:
spark

## CRIAR UM DF

In [8]:
df_aluno = spark.createDataFrame([
    {"id" : 1, "nome": "Fabio"},
    {"id" : 2, "nome": "Yuri"},
    {"id" : 3, "nome": "Fernando"}
])

In [9]:
type(df_aluno)

pyspark.sql.dataframe.DataFrame

In [11]:
help(df_aluno.filter)

Help on method filter in module pyspark.sql.dataframe:

filter(condition: 'ColumnOrName') -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Filters rows using the given condition.
    
    :func:`where` is an alias for :func:`filter`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    condition : :class:`Column` or str
        a :class:`Column` of :class:`types.BooleanType`
        or a string of SQL expression.
    
    Examples
    --------
    >>> df.filter(df.age > 3).collect()
    [Row(age=5, name='Bob')]
    >>> df.where(df.age == 2).collect()
    [Row(age=2, name='Alice')]
    
    >>> df.filter("age > 3").collect()
    [Row(age=5, name='Bob')]
    >>> df.where("age = 2").collect()
    [Row(age=2, name='Alice')]



## CRIAR DF COM ARQUIVOS

In [16]:
## HDFS
df_csv = spark.read.csv(path='hdfs:/aula/raw/pessoa/pessoas.csv',header=True,sep=',')

In [17]:
df_csv.show()

+----+---------+---+
|  id|     name|age|
+----+---------+---+
|1201|  Marcelo| 25|
|1202|    Luana| 23|
|1203|   Daniel| 39|
|1204|  Eduardo| 23|
|1205|   Carlos| 23|
|1206|     Beto| 27|
|1207|     Joao| 43|
|1208|Francisco| 43|
|1209|   Amanda| 33|
|1210|  Vitoria| 23|
+----+---------+---+



In [20]:
df_json = spark.read.json(path='hdfs:/aula/raw/pessoa/pessoas.json')

In [24]:
df_json.show()

+---+---+-------------+
|age| id|         name|
+---+---+-------------+
| 28|  1|       Helena|
| 44|  2|       Miguel|
| 97|  3|        Alice|
| 28|  4|        Laura|
| 94|  5|      Manuela|
| 71|  6|    Valentina|
| 48|  7|       Sophia|
| 98|  8|     Isabella|
| 60|  9|      Heloísa|
| 46| 10|        Luiza|
| 47| 11|        Júlia|
| 35| 12|       Lorena|
| 32| 13|        Lívia|
| 90| 14|  Maria Luiza|
| 88| 15|      Cecília|
| 36| 16|         Eloá|
| 57| 17|     Giovanna|
| 75| 18|  Maria Clara|
| 97| 19|Maria Eduarda|
| 65| 20|      Mariana|
+---+---+-------------+
only showing top 20 rows



In [26]:
#S3
df_jsonS3 = spark.read.json('s3a://camada-bronze/pessoa')

In [28]:
df_jsonS3.show(5)

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 28|  1| Helena|
| 44|  2| Miguel|
| 97|  3|  Alice|
| 28|  4|  Laura|
| 94|  5|Manuela|
+---+---+-------+
only showing top 5 rows



In [29]:
df_teste = spark.read.load('s3a://camada-bronze/pessoa',format='json')

In [30]:
df_teste.show(5)

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 28|  1| Helena|
| 44|  2| Miguel|
| 97|  3|  Alice|
| 28|  4|  Laura|
| 94|  5|Manuela|
+---+---+-------+
only showing top 5 rows



In [42]:
#local
df_local = spark.read.csv('file:///home/user/pessoa.txt',header=True,sep='@')

In [43]:
df_local.show()

+---+---------+
| id|     nome|
+---+---------+
|  1|    lucas|
|  2|aureliano|
|  3|   sandra|
|  4|  beatriz|
|  5|  eduardo|
|  6|  ernesto|
|  7|   victor|
|  8|alexandre|
+---+---------+



## CRIAR DF COM BD

In [44]:
import os

# JDBC connection settings for the PostgreSQL `dvdrental` database.
# The variable keeps its original (misspelled) name `ulr` because the cells
# below pass it to spark.read.jdbc / DataFrame.write.jdbc.
ulr = 'jdbc:postgresql://postgres:5432/dvdrental'
properties = {
    # Credentials are read from the environment when available so they do not
    # have to live in the notebook; the defaults preserve the previous values.
    "user": os.environ.get("POSTGRES_USER", "admin"),
    "password": os.environ.get("POSTGRES_PASSWORD", "admin"),
    "driver": "org.postgresql.Driver"
}

In [45]:
df_city = spark.read.jdbc(url=ulr,table='public.city',properties=properties)

In [47]:
df_city.show(10)

+-------+-----------+----------+-------------------+
|city_id|       city|country_id|        last_update|
+-------+-----------+----------+-------------------+
|      2|       Abha|        82|2006-02-15 09:45:25|
|      3|  Abu Dhabi|       101|2006-02-15 09:45:25|
|      4|       Acua|        60|2006-02-15 09:45:25|
|      5|      Adana|        97|2006-02-15 09:45:25|
|      6|Addis Abeba|        31|2006-02-15 09:45:25|
|      7|       Aden|       107|2006-02-15 09:45:25|
|      8|      Adoni|        44|2006-02-15 09:45:25|
|      9| Ahmadnagar|        44|2006-02-15 09:45:25|
|     10|   Akishima|        50|2006-02-15 09:45:25|
|     11|      Akron|       103|2006-02-15 09:45:25|
+-------+-----------+----------+-------------------+
only showing top 10 rows



In [48]:
df_country = spark.read.jdbc(url=ulr,table='public.country',properties=properties)

In [50]:
df_country.show(5)

+----------+--------------+-------------------+
|country_id|       country|        last_update|
+----------+--------------+-------------------+
|         1|   Afghanistan|2006-02-15 09:44:00|
|         2|       Algeria|2006-02-15 09:44:00|
|         3|American Samoa|2006-02-15 09:44:00|
|         4|        Angola|2006-02-15 09:44:00|
|         5|      Anguilla|2006-02-15 09:44:00|
+----------+--------------+-------------------+
only showing top 5 rows



In [51]:
query = '(select c.city_id, c.city,c2.country from public.city c \
        inner join public.country c2 on \
        c.country_id = c2.country_id) as tab'

In [52]:
df_query = spark.read.jdbc(url=ulr,table=query,properties=properties)

In [53]:
df_query.show(5)

+-------+-----------+--------------------+
|city_id|       city|             country|
+-------+-----------+--------------------+
|      2|       Abha|        Saudi Arabia|
|      3|  Abu Dhabi|United Arab Emirates|
|      4|       Acua|              Mexico|
|      5|      Adana|              Turkey|
|      6|Addis Abeba|            Ethiopia|
+-------+-----------+--------------------+
only showing top 5 rows



## AÇÃO E TRANSFORMAÇÃO

In [54]:
## T1
df_city = spark.read.jdbc(url=ulr,table='public.city',properties=properties)

In [55]:
## T2
df_city2 = df_city.filter(df_city.country_id == 101)

In [61]:
## A
df_city2.count()
## T1 + T2 + A

df_city2.explain(extended=True)

== Parsed Logical Plan ==
'Filter (country_id#519 = 101)
+- Relation [city_id#517,city#518,country_id#519,last_update#520] JDBCRelation(public.city) [numPartitions=1]

== Analyzed Logical Plan ==
city_id: int, city: string, country_id: smallint, last_update: timestamp
Filter (cast(country_id#519 as int) = 101)
+- Relation [city_id#517,city#518,country_id#519,last_update#520] JDBCRelation(public.city) [numPartitions=1]

== Optimized Logical Plan ==
Filter (isnotnull(country_id#519) AND (country_id#519 = 101))
+- Relation [city_id#517,city#518,country_id#519,last_update#520] JDBCRelation(public.city) [numPartitions=1]

== Physical Plan ==
*(1) Scan JDBCRelation(public.city) [numPartitions=1] [city_id#517,city#518,country_id#519,last_update#520] PushedFilters: [*IsNotNull(country_id), *EqualTo(country_id,101)], ReadSchema: struct<city_id:int,city:string,country_id:smallint,last_update:timestamp>



## VALIDAR DF

In [66]:
#print
df_city.show(5,truncate=False)

+-------+-----------+----------+-------------------+
|city_id|city       |country_id|last_update        |
+-------+-----------+----------+-------------------+
|2      |Abha       |82        |2006-02-15 09:45:25|
|3      |Abu Dhabi  |101       |2006-02-15 09:45:25|
|4      |Acua       |60        |2006-02-15 09:45:25|
|5      |Adana      |97        |2006-02-15 09:45:25|
|6      |Addis Abeba|31        |2006-02-15 09:45:25|
+-------+-----------+----------+-------------------+
only showing top 5 rows



In [67]:
#describe
df_city.describe().show()

+-------+------------------+--------------------+------------------+
|summary|           city_id|                city|        country_id|
+-------+------------------+--------------------+------------------+
|  count|               652|                 652|               652|
|   mean|326.66257668711654|                null|55.219325153374236|
| stddev|188.62656549041938|                null|31.681053482709448|
|    min|                 1|A Corua (La Corua...|                 1|
|    max|               656|               teste|               111|
+-------+------------------+--------------------+------------------+



In [68]:
#printSchema
df_city.printSchema()

root
 |-- city_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- country_id: short (nullable = true)
 |-- last_update: timestamp (nullable = true)



In [69]:
#count
df_city.count()

652

In [90]:
#sample
df_city.sample(0.3).show(10)

+-------+-----------+----------+-------------------+
|city_id|       city|country_id|        last_update|
+-------+-----------+----------+-------------------+
|      2|       Abha|        82|2006-02-15 09:45:25|
|      4|       Acua|        60|2006-02-15 09:45:25|
|      6|Addis Abeba|        31|2006-02-15 09:45:25|
|     11|      Akron|       103|2006-02-15 09:45:25|
|     13|  al-Hawiya|        82|2006-02-15 09:45:25|
|     15| al-Qadarif|        89|2006-02-15 09:45:25|
|     22|   Ambattur|        44|2006-02-15 09:45:25|
|     24|     Amroha|        44|2006-02-15 09:45:25|
|     38|    Athenai|        39|2006-02-15 09:45:25|
|     43| Avellaneda|         6|2006-02-15 09:45:25|
+-------+-----------+----------+-------------------+
only showing top 10 rows



In [91]:
#empty
df_city.isEmpty()

False

## MANIPULAR DF

In [93]:
#select
df_city.select('city').show(5,truncate=False)

+-----------+
|city       |
+-----------+
|Abha       |
|Abu Dhabi  |
|Acua       |
|Adana      |
|Addis Abeba|
+-----------+
only showing top 5 rows



In [103]:
df_t = df_city.select('city')

In [104]:
type(df_t)

pyspark.sql.dataframe.DataFrame

In [112]:
#order
df_city.orderBy('city',ascending=False).show(5,truncate=False)

+-------+---------------+----------+--------------------------+
|city_id|city           |country_id|last_update               |
+-------+---------------+----------+--------------------------+
|640    |teste          |15        |2023-02-20 10:42:12.310664|
|639    |teste          |15        |2023-02-18 12:21:40.483955|
|641    |teste          |15        |2023-02-20 11:09:32.443622|
|438    |s-Hertogenbosch|67        |2006-02-15 09:45:25       |
|385    |ostka          |100       |2006-02-15 09:45:25       |
+-------+---------------+----------+--------------------------+
only showing top 5 rows



In [113]:
from pyspark.sql.functions import desc,asc
df_city.orderBy(df_city.city.asc()).show(5,truncate=False)

+-------+------------------------+----------+--------------------------+
|city_id|city                    |country_id|last_update               |
+-------+------------------------+----------+--------------------------+
|1      |A Corua (La Corua) TESTE|87        |2023-03-08 13:55:56.748697|
|656    |ARACAJU                 |11        |2023-06-07 21:15:45.006767|
|2      |Abha                    |82        |2006-02-15 09:45:25       |
|3      |Abu Dhabi               |101       |2006-02-15 09:45:25       |
|4      |Acua                    |60        |2006-02-15 09:45:25       |
+-------+------------------------+----------+--------------------------+
only showing top 5 rows



In [114]:
df_city.orderBy(df_city.city.desc()).show(5,truncate=False)

+-------+---------------+----------+--------------------------+
|city_id|city           |country_id|last_update               |
+-------+---------------+----------+--------------------------+
|640    |teste          |15        |2023-02-20 10:42:12.310664|
|639    |teste          |15        |2023-02-18 12:21:40.483955|
|641    |teste          |15        |2023-02-20 11:09:32.443622|
|438    |s-Hertogenbosch|67        |2006-02-15 09:45:25       |
|385    |ostka          |100       |2006-02-15 09:45:25       |
+-------+---------------+----------+--------------------------+
only showing top 5 rows



In [118]:
#filter where
df_city.filter(df_city.city_id == 2).show()

+-------+----+----------+-------------------+
|city_id|city|country_id|        last_update|
+-------+----+----------+-------------------+
|      2|Abha|        82|2006-02-15 09:45:25|
+-------+----+----------+-------------------+



In [119]:
df_city.where(df_city.city_id == 2).show()

+-------+----+----------+-------------------+
|city_id|city|country_id|        last_update|
+-------+----+----------+-------------------+
|      2|Abha|        82|2006-02-15 09:45:25|
+-------+----+----------+-------------------+



In [123]:
## na drop fill replace
df_city.na.fill('').show()

+-------+--------------------+----------+-------------------+
|city_id|                city|country_id|        last_update|
+-------+--------------------+----------+-------------------+
|      2|                Abha|        82|2006-02-15 09:45:25|
|      3|           Abu Dhabi|       101|2006-02-15 09:45:25|
|      4|                Acua|        60|2006-02-15 09:45:25|
|      5|               Adana|        97|2006-02-15 09:45:25|
|      6|         Addis Abeba|        31|2006-02-15 09:45:25|
|      7|                Aden|       107|2006-02-15 09:45:25|
|      8|               Adoni|        44|2006-02-15 09:45:25|
|      9|          Ahmadnagar|        44|2006-02-15 09:45:25|
|     10|            Akishima|        50|2006-02-15 09:45:25|
|     11|               Akron|       103|2006-02-15 09:45:25|
|     12|              al-Ayn|       101|2006-02-15 09:45:25|
|     13|           al-Hawiya|        82|2006-02-15 09:45:25|
|     14|           al-Manama|        11|2006-02-15 09:45:25|
|     15

In [132]:
#distinct
df_city.select('country_id').distinct().count()

111

In [133]:
#first
df_city.first()

Row(city_id=2, city='Abha', country_id=82, last_update=datetime.datetime(2006, 2, 15, 9, 45, 25))

In [138]:
#head
df_city.head(3)

[Row(city_id=2, city='Abha', country_id=82, last_update=datetime.datetime(2006, 2, 15, 9, 45, 25)),
 Row(city_id=3, city='Abu Dhabi', country_id=101, last_update=datetime.datetime(2006, 2, 15, 9, 45, 25)),
 Row(city_id=4, city='Acua', country_id=60, last_update=datetime.datetime(2006, 2, 15, 9, 45, 25))]

In [136]:
#limit
df_city.limit(2).show()

+-------+---------+----------+-------------------+
|city_id|     city|country_id|        last_update|
+-------+---------+----------+-------------------+
|      2|     Abha|        82|2006-02-15 09:45:25|
|      3|Abu Dhabi|       101|2006-02-15 09:45:25|
+-------+---------+----------+-------------------+



In [141]:
#tail
df_city.tail(3)

[Row(city_id=653, city='New York', country_id=11, last_update=datetime.datetime(2023, 3, 8, 20, 28, 33, 380279)),
 Row(city_id=655, city='CAMPINAS', country_id=11, last_update=datetime.datetime(2023, 5, 16, 21, 45, 31, 996979)),
 Row(city_id=656, city='ARACAJU', country_id=11, last_update=datetime.datetime(2023, 6, 7, 21, 15, 45, 6767))]

In [142]:
#take
df_city.take(3)

[Row(city_id=2, city='Abha', country_id=82, last_update=datetime.datetime(2006, 2, 15, 9, 45, 25)),
 Row(city_id=3, city='Abu Dhabi', country_id=101, last_update=datetime.datetime(2006, 2, 15, 9, 45, 25)),
 Row(city_id=4, city='Acua', country_id=60, last_update=datetime.datetime(2006, 2, 15, 9, 45, 25))]

In [None]:
## CAREFUL: collect() returns every row of the DataFrame to the driver as a
## Python list, so avoid calling it on large DataFrames.
df_city.collect()

In [144]:
## groupby
df_city.groupby('country_id').count().show()

+----------+-----+
|country_id|count|
+----------+-----+
|        31|    1|
|        85|   11|
|        65|    1|
|        53|    1|
|        78|    2|
|       108|    2|
|        34|    4|
|       101|    3|
|        81|    1|
|        28|    3|
|        76|    8|
|        27|    3|
|        26|    1|
|        44|   60|
|       103|   35|
|        12|    3|
|        91|    3|
|        22|    3|
|        93|    3|
|       111|   14|
+----------+-----+
only showing top 20 rows



In [154]:
##agg
df_city \
    .groupby('country_id') \
    .agg({"city":"count"}) \
    .show()

+----------+-----------+
|country_id|count(city)|
+----------+-----------+
|        31|          1|
|        85|         11|
|        65|          1|
|        53|          1|
|        78|          2|
|       108|          2|
|        34|          4|
|       101|          3|
|        81|          1|
|        28|          3|
|        76|          8|
|        27|          3|
|        26|          1|
|        44|         60|
|       103|         35|
|        12|          3|
|        91|          3|
|        22|          3|
|        93|          3|
|       111|         14|
+----------+-----------+
only showing top 20 rows



In [162]:
#join 
df_city \
    .join(df_country,df_city.country_id == df_country.country_id,'inner') \
    .show()

+-------+----------------+----------+-------------------+----------+------------+-------------------+
|city_id|            city|country_id|        last_update|country_id|     country|        last_update|
+-------+----------------+----------+-------------------+----------+------------+-------------------+
|      6|     Addis Abeba|        31|2006-02-15 09:45:25|        31|    Ethiopia|2006-02-15 09:44:00|
|    497|         Springs|        85|2006-02-15 09:45:25|        85|South Africa|2006-02-15 09:44:00|
|    491|      Soshanguve|        85|2006-02-15 09:45:25|        85|South Africa|2006-02-15 09:44:00|
|    437|      Rustenburg|        85|2006-02-15 09:45:25|        85|South Africa|2006-02-15 09:44:00|
|    392|           Paarl|        85|2006-02-15 09:45:25|        85|South Africa|2006-02-15 09:44:00|
|    364|       Newcastle|        85|2006-02-15 09:45:25|        85|South Africa|2006-02-15 09:44:00|
|    273|      Klerksdorp|        85|2006-02-15 09:45:25|        85|South Africa|2

In [167]:
#union
df_city \
    .select('country_id') \
    .union(df_country.select('country_id')) \
    .show(5)

+----------+
|country_id|
+----------+
|        82|
|       101|
|        60|
|        97|
|        31|
+----------+
only showing top 5 rows



In [169]:
#to
j = df_city.toJSON()
j.take(2)

In [177]:
# CUIDADO SE O DF TIVER MUITOS DADOS
import pandas
p = df_city.toPandas()
p

  series = series.astype(t, copy=False)


Unnamed: 0,city_id,city,country_id,last_update
0,2,Abha,82,2006-02-15 09:45:25.000000
1,3,Abu Dhabi,101,2006-02-15 09:45:25.000000
2,4,Acua,60,2006-02-15 09:45:25.000000
3,5,Adana,97,2006-02-15 09:45:25.000000
4,6,Addis Abeba,31,2006-02-15 09:45:25.000000
...,...,...,...,...
647,651,Araraquara,11,2023-03-08 20:22:42.794627
648,652,Divinolandia,11,2023-03-08 20:28:30.059665
649,653,New York,11,2023-03-08 20:28:33.380279
650,655,CAMPINAS,11,2023-05-16 21:45:31.996979


In [180]:
#get partitions
df_city.rdd.getNumPartitions()

1

In [181]:
df_city5 = df_city.repartition(5)

In [182]:
df_city5.rdd.getNumPartitions()

5

In [184]:
#drop column 
df_city.drop('last_update').show(5)

+-------+-----------+----------+
|city_id|       city|country_id|
+-------+-----------+----------+
|      2|       Abha|        82|
|      3|  Abu Dhabi|       101|
|      4|       Acua|        60|
|      5|      Adana|        97|
|      6|Addis Abeba|        31|
+-------+-----------+----------+
only showing top 5 rows



In [185]:
df_city.select('country_id').count()

652

In [187]:
#drop dup
df_city.select('city','country_id').dropDuplicates().count()

635

In [193]:
## criar colunas
from pyspark.sql.functions import monotonically_increasing_id
df_sum = df_city.withColumn('new_id',monotonically_increasing_id())
df_sum.withColumn('soma',df_sum.city_id + df_sum.new_id).show()

+-------+--------------------+----------+-------------------+------+----+
|city_id|                city|country_id|        last_update|new_id|soma|
+-------+--------------------+----------+-------------------+------+----+
|      2|                Abha|        82|2006-02-15 09:45:25|     0|   2|
|      3|           Abu Dhabi|       101|2006-02-15 09:45:25|     1|   4|
|      4|                Acua|        60|2006-02-15 09:45:25|     2|   6|
|      5|               Adana|        97|2006-02-15 09:45:25|     3|   8|
|      6|         Addis Abeba|        31|2006-02-15 09:45:25|     4|  10|
|      7|                Aden|       107|2006-02-15 09:45:25|     5|  12|
|      8|               Adoni|        44|2006-02-15 09:45:25|     6|  14|
|      9|          Ahmadnagar|        44|2006-02-15 09:45:25|     7|  16|
|     10|            Akishima|        50|2006-02-15 09:45:25|     8|  18|
|     11|               Akron|       103|2006-02-15 09:45:25|     9|  20|
|     12|              al-Ayn|       1

In [195]:
#rename column
df_city.withColumnRenamed('last_update','ultima_alteracao').show()

+-------+--------------------+----------+-------------------+
|city_id|                city|country_id|   ultima_alteracao|
+-------+--------------------+----------+-------------------+
|      2|                Abha|        82|2006-02-15 09:45:25|
|      3|           Abu Dhabi|       101|2006-02-15 09:45:25|
|      4|                Acua|        60|2006-02-15 09:45:25|
|      5|               Adana|        97|2006-02-15 09:45:25|
|      6|         Addis Abeba|        31|2006-02-15 09:45:25|
|      7|                Aden|       107|2006-02-15 09:45:25|
|      8|               Adoni|        44|2006-02-15 09:45:25|
|      9|          Ahmadnagar|        44|2006-02-15 09:45:25|
|     10|            Akishima|        50|2006-02-15 09:45:25|
|     11|               Akron|       103|2006-02-15 09:45:25|
|     12|              al-Ayn|       101|2006-02-15 09:45:25|
|     13|           al-Hawiya|        82|2006-02-15 09:45:25|
|     14|           al-Manama|        11|2006-02-15 09:45:25|
|     15

In [196]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)



In [203]:
#altera tipos
from pyspark.sql.functions import col
df_csv.withColumn("id",col("id").cast('integer')).printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)



In [204]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col
df_csv.withColumn("age",col("age").cast(IntegerType())).printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [207]:
#funcoes
def teste(df):
    print(df.city)

df_city.foreach(teste)

## SQL

In [208]:
#cria view
df_city.createOrReplaceTempView('city')

In [213]:
spark.sql('select * from city limit 5').show()

+-------+-----------+----------+-------------------+
|city_id|       city|country_id|        last_update|
+-------+-----------+----------+-------------------+
|      2|       Abha|        82|2006-02-15 09:45:25|
|      3|  Abu Dhabi|       101|2006-02-15 09:45:25|
|      4|       Acua|        60|2006-02-15 09:45:25|
|      5|      Adana|        97|2006-02-15 09:45:25|
|      6|Addis Abeba|        31|2006-02-15 09:45:25|
+-------+-----------+----------+-------------------+



In [217]:
spark.sql('show tables from default').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|    teste|      false|
|  default|   teste2|      false|
|  default|     city|      false|
|  default|   teste3|      false|
|         |     city|       true|
+---------+---------+-----------+



## TUNING

In [218]:
# 3.2 +
spark.conf.set('spark.sql.adaptive.enabled','true')

In [219]:
## CACHE
df_city.cache()

DataFrame[city_id: int, city: string, country_id: smallint, last_update: timestamp]

In [222]:
df_city.show()

+-------+--------------------+----------+-------------------+
|city_id|                city|country_id|        last_update|
+-------+--------------------+----------+-------------------+
|      2|                Abha|        82|2006-02-15 09:45:25|
|      3|           Abu Dhabi|       101|2006-02-15 09:45:25|
|      4|                Acua|        60|2006-02-15 09:45:25|
|      5|               Adana|        97|2006-02-15 09:45:25|
|      6|         Addis Abeba|        31|2006-02-15 09:45:25|
|      7|                Aden|       107|2006-02-15 09:45:25|
|      8|               Adoni|        44|2006-02-15 09:45:25|
|      9|          Ahmadnagar|        44|2006-02-15 09:45:25|
|     10|            Akishima|        50|2006-02-15 09:45:25|
|     11|               Akron|       103|2006-02-15 09:45:25|
|     12|              al-Ayn|       101|2006-02-15 09:45:25|
|     13|           al-Hawiya|        82|2006-02-15 09:45:25|
|     14|           al-Manama|        11|2006-02-15 09:45:25|
|     15

In [223]:
#persist
df_city.persist()

DataFrame[city_id: int, city: string, country_id: smallint, last_update: timestamp]

In [224]:
df_city.unpersist()

DataFrame[city_id: int, city: string, country_id: smallint, last_update: timestamp]

In [231]:
#HINT
spark.sql('select /*+ REPARTITION(5) */ * from city').explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=3539]
   +- Scan JDBCRelation(public.city) [numPartitions=1] [city_id#517,city#518,country_id#519,last_update#520] PushedFilters: [], ReadSchema: struct<city_id:int,city:string,country_id:smallint,last_update:timestamp>




In [232]:
# VARIAVEL BROADCAST
var = 'teste'
broadcastvar = spark.sparkContext.broadcast(var)

## SALVAR DADOS

In [235]:
#HADOOP
df_city.write.csv('hdfs:///aula/process/csv',header=True,sep=';',mode='overwrite')

In [240]:
df_city.repartition(1).write.json('hdfs:///aula/process/json',mode='overwrite')

In [238]:
# mode='overwrite' keeps this cell re-runnable (the default mode raises when the
# target path already exists), matching the CSV/JSON writes above.
df_city.write.parquet('hdfs:///aula/process/parquet',mode='overwrite')

In [239]:
# mode='overwrite' keeps this cell re-runnable (the default mode raises when the
# target path already exists), matching the CSV/JSON writes above.
df_city.write.orc('hdfs:///aula/process/orc',mode='overwrite')

In [241]:
#S3
df_city.write.parquet('s3a://camada-prata/parquet')

In [244]:
df_city.write.parquet('s3a://camada-prata/parquet',mode='append')

## LER ARQUIVO COM SQL

In [243]:
spark.sql('select * from parquet.`s3a://camada-prata/parquet`').show()

+-------+--------------------+----------+-------------------+
|city_id|                city|country_id|        last_update|
+-------+--------------------+----------+-------------------+
|      2|                Abha|        82|2006-02-15 09:45:25|
|      3|           Abu Dhabi|       101|2006-02-15 09:45:25|
|      4|                Acua|        60|2006-02-15 09:45:25|
|      5|               Adana|        97|2006-02-15 09:45:25|
|      6|         Addis Abeba|        31|2006-02-15 09:45:25|
|      7|                Aden|       107|2006-02-15 09:45:25|
|      8|               Adoni|        44|2006-02-15 09:45:25|
|      9|          Ahmadnagar|        44|2006-02-15 09:45:25|
|     10|            Akishima|        50|2006-02-15 09:45:25|
|     11|               Akron|       103|2006-02-15 09:45:25|
|     12|              al-Ayn|       101|2006-02-15 09:45:25|
|     13|           al-Hawiya|        82|2006-02-15 09:45:25|
|     14|           al-Manama|        11|2006-02-15 09:45:25|
|     15

## SALVAR EM DB

In [247]:
df_city.write \
    .jdbc(url=ulr,table='public.aula',properties=properties,mode='append')

## SALVAR NO HIVE

In [249]:
df_city.write.format('hive').saveAsTable('default.city_aula')

In [251]:
df_city.write.format('hive').insertInto('default.city_aula2')

## PARTICIONADO

In [254]:
df_city.write.partitionBy('country_id').format('parquet').mode('append').save('/aula/process/partition')

## DELTA LAKE - LAKEHOUSE

In [1]:
from delta import *
import pyspark
import json
import requests

In [2]:
# Build (or reuse) a Hive-enabled Spark session named 'Delta' with the Delta
# Lake SQL extension and the Delta catalog configured.
spark = (
    pyspark.sql.SparkSession.builder
    .appName('Delta')
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
def loadData (qtde):
    """Fetch ``qtde`` random users from the public API and return them as a DataFrame.

    Parameters
    ----------
    qtde : int
        Number of users to request (one HTTP GET per user). Must be >= 1.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per fetched user with the columns ``email``, ``first_name``,
        ``last_name``, ``gender``, ``id`` and ``username``.

    Raises
    ------
    ValueError
        If ``qtde`` is smaller than 1 (there would be nothing to build the
        DataFrame from).
    requests.HTTPError
        If the API answers with an error status code.
    """
    if qtde < 1:
        raise ValueError('qtde must be at least 1')

    records = []
    for x in range(qtde):
        print(x)  # progress indicator, one line per request
        r = requests.get('https://random-data-api.com/api/v2/users', timeout=30)
        r.raise_for_status()
        # Hand Spark real JSON text: a raw dict would be stringified with
        # Python's repr (single quotes, None/True/False), which is not JSON.
        records.append(json.dumps(r.json()))

    # Build the DataFrame once, after every payload has been collected.
    req = spark.read.json(spark.sparkContext.parallelize(records))
    return req.select(
        'email',
        'first_name',
        'last_name',
        'gender',
        'id',
        'username',
    )

In [4]:
df = loadData(5)

0
1
2
3
4


In [6]:
df.show(truncate=False)

+-----------------------+----------+---------+-----------+----+-------------+
|email                  |first_name|last_name|gender     |id  |username     |
+-----------------------+----------+---------+-----------+----+-------------+
|josef.beier@email.com  |Josef     |Beier    |Non-binary |2359|josef.beier  |
|donte.zulauf@email.com |Donte     |Zulauf   |Male       |6916|donte.zulauf |
|roy.murray@email.com   |Roy       |Murray   |Agender    |8822|roy.murray   |
|booker.lehner@email.com|Booker    |Lehner   |Agender    |2954|booker.lehner|
|ed.daugherty@email.com |Ed        |Daugherty|Genderqueer|2350|ed.daugherty |
+-----------------------+----------+---------+-----------+----+-------------+



In [7]:
path = '/aula/process/user'

In [8]:
df.write.format('delta').mode('overwrite').save(path)

In [9]:
deltaTable = DeltaTable.forPath(spark,path)

In [10]:
type(deltaTable)

delta.tables.DeltaTable

In [11]:
d = spark.read.format('delta').load(path)

In [12]:
type(d)

pyspark.sql.dataframe.DataFrame

In [14]:
deltaTable.toDF().show()

+--------------------+----------+---------+-----------+----+-------------+
|               email|first_name|last_name|     gender|  id|     username|
+--------------------+----------+---------+-----------+----+-------------+
|ed.daugherty@emai...|        Ed|Daugherty|Genderqueer|2350| ed.daugherty|
|booker.lehner@ema...|    Booker|   Lehner|    Agender|2954|booker.lehner|
|josef.beier@email...|     Josef|    Beier| Non-binary|2359|  josef.beier|
|donte.zulauf@emai...|     Donte|   Zulauf|       Male|6916| donte.zulauf|
|roy.murray@email.com|       Roy|   Murray|    Agender|8822|   roy.murray|
+--------------------+----------+---------+-----------+----+-------------+



In [22]:
## MERGE
#NOVOS DADOS
raw = loadData(20)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [23]:
raw.show()

+--------------------+----------+----------+-----------+----+-------------------+
|               email|first_name| last_name|     gender|  id|           username|
+--------------------+----------+----------+-----------+----+-------------------+
|dierdre.murazik@e...|   Dierdre|   Murazik|   Bigender|7767|    dierdre.murazik|
|roger.mitchell@em...|     Roger|  Mitchell|       Male|1186|     roger.mitchell|
|jesusa.kling@emai...|    Jesusa|     Kling|Genderqueer|1092|       jesusa.kling|
|marilynn.littel@e...|  Marilynn|    Littel| Non-binary| 138|    marilynn.littel|
|cruz.pfeffer@emai...|      Cruz|   Pfeffer|    Agender|4136|       cruz.pfeffer|
|lashandra.connell...| Lashandra|  Connelly| Non-binary|2459| lashandra.connelly|
|gia.buckridge@ema...|       Gia| Buckridge|Genderfluid|5419|      gia.buckridge|
|kieth.daugherty@e...|     Kieth| Daugherty| Non-binary|6062|    kieth.daugherty|
|pete.gleason@emai...|      Pete|   Gleason| Non-binary|6940|       pete.gleason|
|patricia.kihn@e

In [24]:
# Wildcard imports; the merge cell below relies on `col` from
# pyspark.sql.functions.
# NOTE(review): `from pyspark.sql.functions import *` also rebinds Python
# builtins such as `sum`, `min` and `max` in this notebook - prefer importing
# only the names that are actually used.
from delta.tables import *
from pyspark.sql.functions import *

In [38]:
# Upsert the `raw` DataFrame into the Delta table, matching rows on `id`.
# Every value expression is qualified with the `raw` alias so it always refers
# to the incoming data rather than to the target table's column of the same name.
merge_columns = ['email', 'first_name', 'last_name', 'gender', 'username']
update_set = {name: col('raw.' + name) for name in merge_columns}
insert_values = dict(update_set, id=col('raw.id'))

(
    deltaTable.alias('process')
    .merge(raw.alias('raw'), 'process.id = raw.id')
    # Ids not present in the table yet: insert the full row, key included.
    .whenNotMatchedInsert(values=insert_values)
    # Ids already present: refresh every non-key column from the new data.
    .whenMatchedUpdate(set=update_set)
    .execute()
)

In [39]:
d2 = spark.read.format('delta').load(path)

In [40]:
d2.filter('id = 544').show()

+-----+----------+---------+---------+---+--------------+
|email|first_name|last_name|   gender| id|      username|
+-----+----------+---------+---------+---+--------------+
|     |     teste|  Stanton|Genderflu|544|nannie.stanton|
+-----+----------+---------+---------+---+--------------+



In [33]:
df_fake = spark.createDataFrame([
{"email" : "", "first_name": 'teste', "last_name" :    'Stanton' ,   "gender":"Genderflu", "id":544,"username": "nannie.stanton"}])

In [34]:
df_fake.show()

+-----+----------+---------+---+---------+--------------+
|email|first_name|   gender| id|last_name|      username|
+-----+----------+---------+---+---------+--------------+
|     |     teste|Genderflu|544|  Stanton|nannie.stanton|
+-----+----------+---------+---+---------+--------------+



In [37]:
raw= df_fake