In [1]:
import os

import pyspark
from delta import *
from delta.tables import *
from pyspark.sql.functions import *

# Build the Spark session for the "pos" application on the standalone master,
# with Delta Lake enabled and the S3A connector pointed at the MinIO endpoint.
#
# MinIO credentials are read from the environment when set; the fallback values
# keep the notebook working unchanged against the local development stack.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "datalake")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "datalake")

builder = (
    pyspark.sql.SparkSession.builder
    .appName("pos")
    .master("spark://spark-master:7077")
    # Delta Lake SQL extension and catalog (each key set exactly once).
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A access to MinIO: path-style addressing against a custom endpoint.
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# configure_spark_with_delta_pip receives the builder and hands back one we can
# keep chaining on; Hive support is enabled before the session is created.
spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

In [2]:
# Bare expression: echoes the session object created above as the cell result.
spark

In [3]:
# Confirm the object's class; the recorded run shows pyspark.sql.session.SparkSession.
type(spark)

pyspark.sql.session.SparkSession

In [8]:
# Load the students CSV from the bronze layer; the first line is the header.
df_alunos = (
    spark.read
    .option('header', True)
    .csv('s3a://camada-bronze/pos/ALUNOS.csv')
)

In [9]:
# Load the grades CSV from the bronze layer; the first line is the header.
df_notas = (
    spark.read
    .option('header', True)
    .csv('s3a://camada-bronze/pos/NOTAS.csv')
)

In [17]:
# Inner-join students to their grades on matching id.
# Joining on an expression keeps the id column from both sides in the result.
df_alunos_notas = df_alunos.join(
    df_notas,
    on=df_alunos['id'] == df_notas['id'],
    how='inner',
)

In [19]:
# Keep only students whose grade is above 7.
# The grades CSV is read without a schema, so `nota` arrives as a string column;
# cast it explicitly inside the predicate so the comparison is numeric by
# construction instead of relying on Spark's implicit string coercion.
# Only the filter is affected: the `nota` column itself is left as read.
df_aprovados = df_alunos_notas.where(
    df_alunos_notas['nota'].cast('double') > 7
)

In [21]:
# Preview the approved students (show()'s defaults, spelled out).
df_aprovados.show(n=20, truncate=True)

+---+------+---+----+
| id|  nome| id|nota|
+---+------+---+----+
|  3|  DARA|  3|   8|
|  4|DEIVIS|  4|  10|
+---+------+---+----+



In [24]:
# Write name and grade of the approved students to the gold layer as CSV,
# replacing whatever is already stored at the target path.
(
    df_aprovados
    .select('nome', 'nota')
    .write
    .mode('overwrite')
    .csv('s3a://camada-ouro/pos/aprovados')
)

In [29]:
# Read every CSV under the football folder of the bronze layer.
# No header row is expected, so Spark names the columns _c0, _c1, ...
df_results = (
    spark.read
    .option('header', False)
    .csv('s3a://camada-bronze/futebol/')
)

In [30]:
# Preview the raw match results (show()'s defaults, spelled out).
df_results.show(n=20, truncate=True)

+----------+----------------+--------+---+---+--------+---------+-------------------+-----+
|       _c0|             _c1|     _c2|_c3|_c4|     _c5|      _c6|                _c7|  _c8|
+----------+----------------+--------+---+---+--------+---------+-------------------+-----+
|1872-11-30|        Scotland| England|  0|  0|Friendly|  Glasgow|           Scotland|FALSE|
|1873-03-08|         England|Scotland|  4|  2|Friendly|   London|            England|FALSE|
|1874-03-07|        Scotland| England|  2|  1|Friendly|  Glasgow|           Scotland|FALSE|
|1875-03-06|         England|Scotland|  2|  2|Friendly|   London|            England|FALSE|
|1876-03-04|        Scotland| England|  3|  0|Friendly|  Glasgow|           Scotland|FALSE|
|1876-03-25|        Scotland|   Wales|  4|  0|Friendly|  Glasgow|           Scotland|FALSE|
|1877-03-03|         England|Scotland|  1|  3|Friendly|   London|            England|FALSE|
|1877-03-05|           Wales|Scotland|  0|  2|Friendly|  Wrexham|              W

In [31]:
# Number of rows in the results frame; the recorded run returned 40839.
df_results.count()

40839

In [38]:
# Drop the last positional column (_c8) and give the remaining positional
# columns descriptive names: date, home team, away team, goals for each side,
# tournament, city and country.
df_results = (
    df_results
    .drop('_c8')
    .withColumnRenamed('_c0', 'dt')
    .withColumnRenamed('_c1', 'mandante')
    .withColumnRenamed('_c2', 'visitante')
    .withColumnRenamed('_c3', 'gol_mandante')
    .withColumnRenamed('_c4', 'gol_visitante')
    .withColumnRenamed('_c5', 'torneio')
    .withColumnRenamed('_c6', 'cidade')
    .withColumnRenamed('_c7', 'pais')
)

In [39]:
# Preview the results with the renamed columns (show()'s defaults, spelled out).
df_results.show(n=20, truncate=True)

+----------+----------------+---------+------------+-------------+--------+---------+-------------------+
|        dt|        mandante|visitante|gol_mandante|gol_visitante| torneio|   cidade|               pais|
+----------+----------------+---------+------------+-------------+--------+---------+-------------------+
|1872-11-30|        Scotland|  England|           0|            0|Friendly|  Glasgow|           Scotland|
|1873-03-08|         England| Scotland|           4|            2|Friendly|   London|            England|
|1874-03-07|        Scotland|  England|           2|            1|Friendly|  Glasgow|           Scotland|
|1875-03-06|         England| Scotland|           2|            2|Friendly|   London|            England|
|1876-03-04|        Scotland|  England|           3|            0|Friendly|  Glasgow|           Scotland|
|1876-03-25|        Scotland|    Wales|           4|            0|Friendly|  Glasgow|           Scotland|
|1877-03-03|         England| Scotland|       

In [None]:
# List the distinct tournament names alphabetically, up to 100 rows,
# without truncating long values.
(
    df_results
    .select('torneio')
    .distinct()
    .orderBy('torneio')
    .show(n=100, truncate=False)
)

+------------------------------------+
|torneio                             |
+------------------------------------+
|ABCS Tournament                     |
|AFC Asian Cup                       |
|AFC Asian Cup qualification         |
|AFC Challenge Cup                   |
|AFC Challenge Cup qualification     |
|AFF Championship                    |
|AFF Championship qualification      |
|African Cup of Nations              |
|African Cup of Nations qualification|
|African Nations Championship        |
|Amílcar Cabral Cup                  |
|Atlantic Cup                        |
|Atlantic Heritage Cup               |
|Balkan Cup                          |
|Baltic Cup                          |
|Brazil Independence Cup             |
|British Championship                |
|CCCF Championship                   |
|CECAFA Cup                          |
|CFU Caribbean Cup                   |
|CFU Caribbean Cup qualification     |
|CONCACAF Championship               |
|CONCACAF Championship qu

In [46]:
# Expose the results frame to Spark SQL as the temporary view "resultados".
df_results.createOrReplaceTempView(name='resultados')

In [51]:
# Same distinct-tournament listing, expressed in SQL against the temp view.
(
    spark
    .sql('select distinct torneio From resultados order by torneio')
    .show(n=100, truncate=False)
)

+------------------------------------+
|torneio                             |
+------------------------------------+
|ABCS Tournament                     |
|AFC Asian Cup                       |
|AFC Asian Cup qualification         |
|AFC Challenge Cup                   |
|AFC Challenge Cup qualification     |
|AFF Championship                    |
|AFF Championship qualification      |
|African Cup of Nations              |
|African Cup of Nations qualification|
|African Nations Championship        |
|Amílcar Cabral Cup                  |
|Atlantic Cup                        |
|Atlantic Heritage Cup               |
|Balkan Cup                          |
|Baltic Cup                          |
|Brazil Independence Cup             |
|British Championship                |
|CCCF Championship                   |
|CECAFA Cup                          |
|CFU Caribbean Cup                   |
|CFU Caribbean Cup qualification     |
|CONCACAF Championship               |
|CONCACAF Championship qu

In [52]:
# DataFrame-API listing of the distinct tournament names, sorted by name;
# shows up to 100 rows with full-width values.
(
    df_results
    .select('torneio')
    .distinct()
    .orderBy('torneio')
    .show(n=100, truncate=False)
)

+------------------------------------+
|torneio                             |
+------------------------------------+
|ABCS Tournament                     |
|AFC Asian Cup                       |
|AFC Asian Cup qualification         |
|AFC Challenge Cup                   |
|AFC Challenge Cup qualification     |
|AFF Championship                    |
|AFF Championship qualification      |
|African Cup of Nations              |
|African Cup of Nations qualification|
|African Nations Championship        |
|Amílcar Cabral Cup                  |
|Atlantic Cup                        |
|Atlantic Heritage Cup               |
|Balkan Cup                          |
|Baltic Cup                          |
|Brazil Independence Cup             |
|British Championship                |
|CCCF Championship                   |
|CECAFA Cup                          |
|CFU Caribbean Cup                   |
|CFU Caribbean Cup qualification     |
|CONCACAF Championship               |
|CONCACAF Championship qu

In [66]:
# Matches in which Brazil played, either as home (mandante) or away (visitante).
df_jogosBrasil = df_results.filter(
    (df_results['mandante'] == 'Brazil') | (df_results['visitante'] == 'Brazil')
)

In [67]:
# Preview Brazil's matches (show()'s defaults, spelled out).
df_jogosBrasil.show(n=20, truncate=True)

+----------+---------+---------+------------+-------------+------------+--------------+---------+
|        dt| mandante|visitante|gol_mandante|gol_visitante|     torneio|        cidade|     pais|
+----------+---------+---------+------------+-------------+------------+--------------+---------+
|1914-09-20|Argentina|   Brazil|           3|            0|    Friendly|  Buenos Aires|Argentina|
|1914-09-27|Argentina|   Brazil|           0|            1|   Copa Roca|  Buenos Aires|Argentina|
|1916-07-08|   Brazil|    Chile|           1|            1|Copa América|  Buenos Aires|Argentina|
|1916-07-10|Argentina|   Brazil|           1|            1|Copa América|  Buenos Aires|Argentina|
|1916-07-12|   Brazil|  Uruguay|           1|            2|Copa América|  Buenos Aires|Argentina|
|1916-07-18|  Uruguay|   Brazil|           0|            1|    Friendly|    Montevideo|  Uruguay|
|1917-10-03|Argentina|   Brazil|           4|            2|Copa América|    Montevideo|  Uruguay|
|1917-10-07|  Urugua

In [70]:
# Number of Brazil matches per tournament, ordered by ascending count;
# shows up to 100 rows without truncating tournament names.
(
    df_jogosBrasil
    .groupBy('torneio')
    .agg({'torneio': 'count'})
    .orderBy('count(torneio)')
    .show(n=100, truncate=False)
)

+----------------------------+--------------+
|torneio                     |count(torneio)|
+----------------------------+--------------+
|King's Cup                  |1             |
|Lunar New Year Cup          |1             |
|Rous Cup                    |2             |
|Mundialito                  |3             |
|USA Cup                     |3             |
|Tournoi de France           |3             |
|Brazil Independence Cup     |4             |
|Atlantic Cup                |5             |
|Copa Bernardo O'Higgins     |10            |
|Gold Cup                    |14            |
|Pan American Championship   |16            |
|Copa Oswaldo Cruz           |16            |
|Copa Rio Branco             |18            |
|Copa Roca                   |23            |
|Confederations Cup          |33            |
|FIFA World Cup qualification|104           |
|FIFA World Cup              |109           |
|Copa América                |184           |
|Friendly                    |428 

In [71]:
import requests

In [72]:
# Fetch a user record from the random-data-api.com demo endpoint.
# requests applies no timeout by default, so one is set here to keep the cell
# from blocking indefinitely if the service does not answer; raise_for_status()
# surfaces HTTP error responses here rather than when the body is parsed.
r = requests.get('https://random-data-api.com/api/v2/users', timeout=10)
r.raise_for_status()

In [74]:
# Decode the response body as JSON. The recorded run returned a single dict
# describing one user, with nested employment, address, credit_card and
# subscription objects.
r.json()

{'id': 2792,
 'uid': 'c97e8ed7-6f1e-4d44-aac2-204264eddf47',
 'password': 'gxEJP21cku',
 'first_name': 'Hortensia',
 'last_name': 'Kerluke',
 'username': 'hortensia.kerluke',
 'email': 'hortensia.kerluke@email.com',
 'avatar': 'https://robohash.org/optioinciduntratione.png?size=300x300&set=set1',
 'gender': 'Genderfluid',
 'phone_number': '+1 663.059.0860 x2760',
 'social_insurance_number': '239406408',
 'date_of_birth': '1977-02-16',
 'employment': {'title': 'Accounting Coordinator',
  'key_skill': 'Organisation'},
 'address': {'city': 'West Domingamouth',
  'street_name': 'Malena Freeway',
  'street_address': '6607 Loura Ridges',
  'zip_code': '24994-2098',
  'state': 'Montana',
  'country': 'United States',
  'coordinates': {'lat': -62.16084211469928, 'lng': -83.47745684568388}},
 'credit_card': {'cc_number': '6771-8938-9514-3503'},
 'subscription': {'plan': 'Starter',
  'status': 'Blocked',
  'payment_method': 'Credit card',
  'term': 'Full subscription'}}