In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round, upper, desc, asc, asc_nulls_last, asc_nulls_first, desc_nulls_first, desc_nulls_last, lit, when
from pyspark.sql.types import *

In [2]:
import findspark

findspark.init()

In [3]:
spark = SparkSession.builder.getOrCreate()

## Criando Data Fram usando csv

In [4]:
schema = 'registro_ancine INTEGER,\
          razao_social STRING,\
          cnpj STRING,\
          atividade_economica STRING,\
          classificacao_atividade STRING'

In [5]:
df = spark.read.format('csv')\
               .option('header', True)\
               .option('sep', ';')\
               .option('encoding', 'ISO-8859-1')\
               .schema(schema)\
               .load('./data/cnae.csv')

In [6]:
print(df.count())

96704


In [8]:
options_dict = {
    'encoding': 'ISO-8859-1',
    'sep': ';',
    'escape': '\'',
    'format': 'csv'
}

df = spark.read.options(**options_dict)\
               .schema(schema)\
               .load('./data/cnae.csv')

In [9]:
df.printSchema()

root
 |-- registro_ancine: integer (nullable = true)
 |-- razao_social: string (nullable = true)
 |-- cnpj: string (nullable = true)
 |-- atividade_economica: string (nullable = true)
 |-- classificacao_atividade: string (nullable = true)



In [7]:
df.show(5)

+---------------+--------------------+------------------+--------------------+-----------------------+
|registro_ancine|        razao_social|              cnpj| atividade_economica|classificacao_atividade|
+---------------+--------------------+------------------+--------------------+-----------------------+
|             10|CONSPIRAÇÃO FILME...|02.020.661/0001-04|Atividades de pro...|              PRINCIPAL|
|             10|CONSPIRAÇÃO FILME...|02.020.661/0001-04|Estúdios cinemato...|             SECUNDARIA|
|            100|RADAR CINEMA E TE...|02.947.857/0001-49|Atividades de pro...|              PRINCIPAL|
|            100|RADAR CINEMA E TE...|02.947.857/0001-49|Atividades de int...|             SECUNDARIA|
|            100|RADAR CINEMA E TE...|02.947.857/0001-49|Atividades de pós...|             SECUNDARIA|
+---------------+--------------------+------------------+--------------------+-----------------------+
only showing top 5 rows



## Salvando o DF em JSON

In [8]:
df.write.format('json').save('./data/cnae.json')

AnalysisException: path file:/home/guilherme/Cursos/Bootcamp_IGTI_Engenheiro_de_Dados_CLOUD/modulo-03/pratica/data/cnae.json already exists.

In [None]:
df_json = spark.read.format('json')\
                    .load('./data/cnae.json')

In [None]:
df_json.printSchema()

root
 |-- atividade_economica: string (nullable = true)
 |-- classificacao_atividade: string (nullable = true)
 |-- cnpj: string (nullable = true)
 |-- razao_social: string (nullable = true)
 |-- registro_ancine: long (nullable = true)



In [None]:
df_json.show(5)

+--------------------+-----------------------+------------------+--------------------+---------------+
| atividade_economica|classificacao_atividade|              cnpj|        razao_social|registro_ancine|
+--------------------+-----------------------+------------------+--------------------+---------------+
|Impressão de mate...|             SECUNDARIA|22.799.456/0001-04|APC SERVIÇOS DE P...|          43658|
|Impressão de mate...|             SECUNDARIA|22.799.456/0001-04|APC SERVIÇOS DE P...|          43658|
|Serviços de acaba...|             SECUNDARIA|22.799.456/0001-04|APC SERVIÇOS DE P...|          43658|
|Atividades de pós...|              PRINCIPAL|33.508.812/0001-00|RICARDO ARISTEU A...|          43659|
|Atividades de pro...|             SECUNDARIA|33.508.812/0001-00|RICARDO ARISTEU A...|          43659|
+--------------------+-----------------------+------------------+--------------------+---------------+
only showing top 5 rows



In [None]:
df_json.count()

96704

## Salvando o DF em ORC

In [None]:
df.write.format('orc').save('./data/cnae.orc')

In [None]:
df_orc = spark.read.format('orc')\
                   .load('./data/cnae.orc')

In [None]:
df_orc.printSchema()

root
 |-- registro_ancine: integer (nullable = true)
 |-- razao_social: string (nullable = true)
 |-- cnpj: string (nullable = true)
 |-- atividade_economica: string (nullable = true)
 |-- classificacao_atividade: string (nullable = true)



## Salvando o DF em parquet

In [None]:
df.write.format('parquet').save('./data/cnae.parquet')

In [None]:
df_parquet = spark.read.format('parquet')\
                       .load('./data/cnae.parquet')

In [None]:
df_parquet.printSchema()

root
 |-- registro_ancine: integer (nullable = true)
 |-- razao_social: string (nullable = true)
 |-- cnpj: string (nullable = true)
 |-- atividade_economica: string (nullable = true)
 |-- classificacao_atividade: string (nullable = true)



In [None]:
df_parquet.show(5)

+---------------+--------------------+------------------+--------------------+-----------------------+
|registro_ancine|        razao_social|              cnpj| atividade_economica|classificacao_atividade|
+---------------+--------------------+------------------+--------------------+-----------------------+
|          34777|TELEOBJETIVA ESTU...|18.260.028/0001-23|Atividades de pro...|              PRINCIPAL|
|          34777|TELEOBJETIVA ESTU...|18.260.028/0001-23|Atividades de pro...|             SECUNDARIA|
|          34777|TELEOBJETIVA ESTU...|18.260.028/0001-23|Outras atividades...|             SECUNDARIA|
|          34777|TELEOBJETIVA ESTU...|18.260.028/0001-23|Produção de filme...|             SECUNDARIA|
|          34779|AGENCIA DE DESENV...|21.542.573/0001-17|Atividades de ass...|              PRINCIPAL|
+---------------+--------------------+------------------+--------------------+-----------------------+
only showing top 5 rows



## Manipulação de Dados

In [None]:
df_titles = spark.read.format('csv')\
                      .option("header", "true")\
                      .option('sep', '\t')\
                      .option('inferSchema', 'true')\
                      .load('./data/title_basics')

In [None]:
df_titles.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
df_titles.show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [None]:
df_titles.count()

8364115

In [None]:
df_titles.select('tconst', 'primaryTitle', 'runtimeMinutes')\
         .withColumn('runtimeHour', round(col('runtimeMinutes').cast('Int') / 60, 3))\
         .show(5)

+---------+--------------------+--------------+-----------+
|   tconst|        primaryTitle|runtimeMinutes|runtimeHour|
+---------+--------------------+--------------+-----------+
|tt0000001|          Carmencita|             1|      0.017|
|tt0000002|Le clown et ses c...|             5|      0.083|
|tt0000003|      Pauvre Pierrot|             4|      0.067|
|tt0000004|         Un bon bock|            12|        0.2|
|tt0000005|    Blacksmith Scene|             1|      0.017|
+---------+--------------------+--------------+-----------+
only showing top 5 rows



In [None]:
df_titles.select('tconst', 'primaryTitle', 'runtimeMinutes')\
         .withColumn('runtimeHour', round(col('runtimeMinutes').cast('Int') / 60, 3))\
         .withColumn('runtimeHoursPlus', col('runtimeHour') + 2)\
         .show(5)

+---------+--------------------+--------------+-----------+----------------+
|   tconst|        primaryTitle|runtimeMinutes|runtimeHour|runtimeHoursPlus|
+---------+--------------------+--------------+-----------+----------------+
|tt0000001|          Carmencita|             1|      0.017|           2.017|
|tt0000002|Le clown et ses c...|             5|      0.083|           2.083|
|tt0000003|      Pauvre Pierrot|             4|      0.067|           2.067|
|tt0000004|         Un bon bock|            12|        0.2|             2.2|
|tt0000005|    Blacksmith Scene|             1|      0.017|           2.017|
+---------+--------------------+--------------+-----------+----------------+
only showing top 5 rows



## Seleção de Colunas

In [None]:
df_titles.columns

['tconst',
 'titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [None]:
df_titles.select('tconst', 'primaryTitle', 'genres').show(5)

+---------+--------------------+--------------------+
|   tconst|        primaryTitle|              genres|
+---------+--------------------+--------------------+
|tt0000001|          Carmencita|   Documentary,Short|
|tt0000002|Le clown et ses c...|     Animation,Short|
|tt0000003|      Pauvre Pierrot|Animation,Comedy,...|
|tt0000004|         Un bon bock|     Animation,Short|
|tt0000005|    Blacksmith Scene|        Comedy,Short|
+---------+--------------------+--------------------+
only showing top 5 rows



In [None]:
cols = ['tconst', 'primaryTitle', 'genres']
df_titles.select(cols).show(5)

+---------+--------------------+--------------------+
|   tconst|        primaryTitle|              genres|
+---------+--------------------+--------------------+
|tt0000001|          Carmencita|   Documentary,Short|
|tt0000002|Le clown et ses c...|     Animation,Short|
|tt0000003|      Pauvre Pierrot|Animation,Comedy,...|
|tt0000004|         Un bon bock|     Animation,Short|
|tt0000005|    Blacksmith Scene|        Comedy,Short|
+---------+--------------------+--------------------+
only showing top 5 rows



In [None]:
df_titles.select('tconst', 'primaryTitle', upper('genres').alias('genres_upper')).show(5)

+---------+--------------------+--------------------+
|   tconst|        primaryTitle|        genres_upper|
+---------+--------------------+--------------------+
|tt0000001|          Carmencita|   DOCUMENTARY,SHORT|
|tt0000002|Le clown et ses c...|     ANIMATION,SHORT|
|tt0000003|      Pauvre Pierrot|ANIMATION,COMEDY,...|
|tt0000004|         Un bon bock|     ANIMATION,SHORT|
|tt0000005|    Blacksmith Scene|        COMEDY,SHORT|
+---------+--------------------+--------------------+
only showing top 5 rows



In [None]:
df_titles.select('startYear').distinct().show()

+---------+
|startYear|
+---------+
|     1903|
|     1953|
|     1897|
|     1957|
|     1987|
|     1956|
|     1936|
|     2016|
|     2020|
|     2012|
|     1958|
|     1910|
|     1943|
|     1915|
|     1972|
|     1931|
|     2026|
|     1911|
|     1926|
|     1938|
+---------+
only showing top 20 rows



In [None]:
df_titles.dropDuplicates().show(10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000033|    short|  Horse Trick Riders|          La voltige|      0|     1895|     \N|             1|Comedy,Documentar...|
|tt0000132|    short|          Card Party|Une partie de cartes|      0|     1896|     \N|             1|     Biography,Short|
|tt0214902|    short|        The Magician|         Le magicien|      0|     1898|     \N|             1|Fantasy,Horror,Short|
|tt0225248|    short|Canada Vignettes:...|Canada Vignettes:...|      0|     1979|     \N|             1|     Animation,Short|
|tt0245776|    short|       The Biter Bit|       The Biter Bit|      0|     1899|     \N|             1|        Comedy

In [None]:
df_titles.filter(col('titleType') == 'movie').show(10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000502|    movie|            Bohemios|            Bohemios|      0|     1905|     \N|           100|                  \N|
|tt0000574|    movie|The Story of the ...|The Story of the ...|      0|     1906|     \N|            70|Action,Adventure,...|
|tt0000591|    movie|    The Prodigal Son|   L'enfant prodigue|      0|     1907|     \N|            90|               Drama|
|tt0000615|    movie|  Robbery Under Arms|  Robbery Under Arms|      0|     1907|     \N|            \N|               Drama|
|tt0000630|    movie|              Hamlet|              Amleto|      0|     1908|     \N|            \N|              

In [None]:
df_titles.filter((col('titleType') == 'movie') & (col('runtimeMinutes') < 90)).show(10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000574|    movie|The Story of the ...|The Story of the ...|      0|     1906|     \N|            70|Action,Adventure,...|
|tt0001184|    movie|Don Juan de Serra...|Don Juan de Serra...|      0|     1910|     \N|            58|     Adventure,Drama|
|tt0001258|    movie|The White Slave T...|Den hvide slaveha...|      0|     1910|     \N|            45|               Drama|
|tt0001285|    movie|   The Life of Moses|   The Life of Moses|      0|     1909|     \N|            50|Biography,Drama,F...|
|tt0001498|    movie|The Battle of Tra...|The Battle of Tra...|      0|     1911|     \N|            51|              

In [None]:
df_titles.filter((col('titleType') == 'movie') & (col('primaryTitle').like('%Avengers%'))).show(10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0034639|    movie|        The Avengers|   The Day Will Dawn|      0|     1942|     \N|            98|           Drama,War|
|tt0036194|    movie|The People's Aven...|   Narodnye mstiteli|      0|     1943|     \N|            55|     Documentary,War|
|tt0058651|    movie|  The Three Avengers| Gli invincibili tre|      0|     1964|     \N|           101|Action,Adventure,...|
|tt0069746|    movie|Avengers of the Reef|Avengers of the Reef|      0|     1973|     \N|            84|    Adventure,Family|
|tt0074513|    movie|The Shaolin Avengers|Fang Shi Yu yu Hu...|      0|     1976|     \N|            97|        Action

## Ordenação de Data Frame

In [None]:
df_titles.orderBy('startYear').show(10)

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
| tt3155794|    short|    Passage de Venus|    Passage de Venus|      0|     1874|     \N|             1|Documentary,Short|
|tt14495706|    short|   La Rosace Magique|   La Rosace Magique|      0|     1877|     \N|             1|  Animation,Short|
| tt2221420|    short|Sallie Gardner at...|Sallie Gardner at...|      0|     1878|     \N|             1|Documentary,Short|
|tt12592084|    short|   Le singe musicien|   Le singe musicien|      0|     1878|     \N|             1|  Animation,Short|
|tt15320514|    short|   Skeleton of Horse|   Skeleton of Horse|      0|     1881|     \N|             1|  Animation,Short|
| tt7816

In [None]:
df_titles.orderBy(desc(col('startYear').cast('Int'))).show(10)

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt13907072|    movie|Skyscraper on an ...|Skyscraper on an ...|      0|     2028|     \N|            \N|    Adventure,Sci-Fi|
|tt12072406|tvEpisode|              Hunter|              Hunter|      0|     2028|     \N|            \N| Action,Comedy,Drama|
| tt5637536|    movie|            Avatar 5|            Avatar 5|      0|     2028|     \N|            \N|Action,Adventure,...|
| tt9317988|    short| Space: The Traveler| Space: The Traveler|      0|     2027|     \N|            30|        Sci-Fi,Short|
|tt15144960|    movie|    Time Wars: WWIII|    Time Wars: WWIII|      0|     2027|     \N|            \N|      

In [None]:
df_titles\
    .withColumn('startYear', col('startYear').cast('Int'))\
    .orderBy(asc_nulls_last('startYear'))\
    .show(10)

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
| tt3155794|    short|    Passage de Venus|    Passage de Venus|      0|     1874|     \N|             1|Documentary,Short|
|tt14495706|    short|   La Rosace Magique|   La Rosace Magique|      0|     1877|     \N|             1|  Animation,Short|
|tt12592084|    short|   Le singe musicien|   Le singe musicien|      0|     1878|     \N|             1|  Animation,Short|
| tt2221420|    short|Sallie Gardner at...|Sallie Gardner at...|      0|     1878|     \N|             1|Documentary,Short|
|tt15320514|    short|   Skeleton of Horse|   Skeleton of Horse|      0|     1881|     \N|             1|  Animation,Short|
| tt7816

## Renomeando Colunas

In [None]:
df_titles.withColumnRenamed('primaryTitle', 'nome_filme').show(10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|          nome_filme|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [None]:
df_renamed = df_titles

for c in df_titles.columns:
    df_renamed = df_renamed.withColumnRenamed(c, c + '_suffix')

df_renamed.limit(10).toPandas()

Unnamed: 0,tconst_suffix,titleType_suffix,primaryTitle_suffix,originalTitle_suffix,isAdult_suffix,startYear_suffix,endYear_suffix,runtimeMinutes_suffix,genres_suffix
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,short,Miss Jerry,Miss Jerry,0,1894,\N,40,"Romance,Short"
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


## Criando e Alterando Colunas

In [None]:
df_titles.select('tconst', 'primaryTitle', 'runtimeMinutes')\
         .withColumn('runtimeHour', round(col('runtimeMinutes').cast('Int') / 60, 3))\
         .withColumn('runtimeHoursPlus', col('runtimeHour') + 2)\
         .show(5)

+---------+--------------------+--------------+-----------+----------------+
|   tconst|        primaryTitle|runtimeMinutes|runtimeHour|runtimeHoursPlus|
+---------+--------------------+--------------+-----------+----------------+
|tt0000001|          Carmencita|             1|      0.017|           2.017|
|tt0000002|Le clown et ses c...|             5|      0.083|           2.083|
|tt0000003|      Pauvre Pierrot|             4|      0.067|           2.067|
|tt0000004|         Un bon bock|            12|        0.2|             2.2|
|tt0000005|    Blacksmith Scene|             1|      0.017|           2.017|
+---------+--------------------+--------------+-----------+----------------+
only showing top 5 rows



In [None]:
# Criando coluna constante (valor fixo)
df_titles.withColumn('Pais', lit('Brasil')).show(10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|  Pais|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|Brasil|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|Brasil|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|Brasil|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|Brasil|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      

In [None]:
# Criando coluna condicional (CASE WHEN)
df_titles\
    .select('tconst', 'primaryTitle', 'runtimeMinutes')\
    .withColumn('runtimeMinutes', col('runtimeMinutes').cast('Int'))\
    .withColumn('categoria_runtime', when(col('runtimeMinutes') <= 60, 'curto')
                                    .when((col('runtimeMinutes') > 60) & (col('runtimeMinutes') < 90), 'normal')
                                    .when(col('runtimeMinutes') > 90, 'longo')
                                    .when(col('runtimeMinutes').isNull(), 'vazio')
                                    .otherwise('outro'))\
    .filter(col('runtimeMinutes') > 60)\
    .show(10)

+---------+--------------------+--------------+-----------------+
|   tconst|        primaryTitle|runtimeMinutes|categoria_runtime|
+---------+--------------------+--------------+-----------------+
|tt0000502|            Bohemios|           100|            longo|
|tt0000574|The Story of the ...|            70|           normal|
|tt0000591|    The Prodigal Son|            90|            outro|
|tt0000679|The Fairylogue an...|           120|            longo|
|tt0001756|Lucha por la here...|            92|            longo|
|tt0002026|Anny - Story of a...|            68|           normal|
|tt0002101|           Cleopatra|           100|            longo|
|tt0002130|     Dante's Inferno|            71|           normal|
|tt0002315|El lobo de la sierra|            76|           normal|
|tt0002423|             Passion|            85|           normal|
+---------+--------------------+--------------+-----------------+
only showing top 10 rows



In [None]:
import pyspark.sql.functions as f

## Agregação de Data Frame

In [None]:
df_title_subset = df_titles.filter('cast(startYear as Int) >= 2000')\
                           .sample(fraction = 0.5)\
                           .withColumn('genre', f.split('genres', ',').getItem(0))

In [None]:
df_title_subset.count()

2749574

In [None]:
df_title_subset\
    .agg(f.countDistinct('genres').alias('Genre_Distintos'))\
    .show()

+---------------+
|Genre_Distintos|
+---------------+
|           1989|
+---------------+



In [None]:
df_title_subset\
    .agg(f.sum('runtimeMinutes').alias('Total_RuntimeMinutes'))\
    .show()

+--------------------+
|Total_RuntimeMinutes|
+--------------------+
|         3.4333306E7|
+--------------------+



In [None]:
df_title_subset\
    .withColumn('runtimeMinutes', f.col('runtimeMinutes').cast('Int'))\
    .agg(f.sum('runtimeMinutes').alias('Sum'),
         f.mean('runtimeMinutes').alias('Mean'),
         f.max('runtimeMinutes').alias('Max'),
         f.min('runtimeMinutes').alias('Min')
    )\
    .show(10)

+--------+-----------------+-----+---+
|     Sum|             Mean|  Max|Min|
+--------+-----------------+-----+---+
|34333306|40.67141772382822|43200|  0|
+--------+-----------------+-----+---+



In [None]:
df_title_subset\
    .groupBy('genre')\
    .count()\
    .show()

+-----------+------+
|      genre| count|
+-----------+------+
|      Crime| 75145|
|    Romance| 28457|
|   Thriller|  8467|
|  Adventure| 60987|
|         \N|183217|
|      Drama|463087|
|        War|   557|
|Documentary|243415|
| Reality-TV|122041|
|     Family| 59319|
|    Fantasy| 14143|
|  Game-Show| 59926|
|      Adult|111317|
|    History|  9634|
|    Mystery|  8776|
|    Musical|  5170|
|  Animation| 87001|
|      Music| 77019|
|      Short| 72540|
|     Horror| 30222|
+-----------+------+
only showing top 20 rows



In [None]:
df_title_subset\
    .groupBy('genre')\
    .agg(f.mean('runtimeMinutes').alias('Mean'))\
    .orderBy(f.col('Mean').desc())\
    .show(10)

+---------+------------------+
|    genre|              Mean|
+---------+------------------+
|    Adult| 96.40659249418798|
|      War| 83.13422818791946|
|    Sport| 80.51137884872824|
| Thriller| 77.62358276643991|
|  Western| 68.44444444444444|
|     News|55.453895323080815|
|       \N| 55.04858299595141|
|    Crime| 49.87259919130655|
|Biography|49.202232311766934|
|Talk-Show| 48.83925518925519|
+---------+------------------+
only showing top 10 rows



In [None]:
df_title_subset\
    .groupBy('genre', 'startYear')\
    .agg(f.mean('runtimeMinutes').alias('Mean'))\
    .orderBy(f.col('Mean').desc())\
    .filter(f.col('startYear') == '2021')\
    .show(10)

+---------+---------+------------------+
|    genre|startYear|              Mean|
+---------+---------+------------------+
|    Sport|     2021|112.19928825622776|
|      War|     2021| 88.77777777777777|
| Thriller|     2021| 84.85204081632654|
|  Western|     2021|            78.625|
|Biography|     2021| 69.30563798219585|
|    Adult|     2021| 67.32492581602374|
|     News|     2021|  62.6272952853598|
|       \N|     2021| 59.68868703550785|
|Game-Show|     2021|54.536502546689306|
|Adventure|     2021| 53.26246334310851|
+---------+---------+------------------+
only showing top 10 rows

