<a href="https://colab.research.google.com/github/gteless/Aulas_FIAP/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Configurações da biblioteca

In [1]:
!pip install pyspark



### Criando a sessão do SparkContext e SparkSession

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkContext: reuse the one already running, or create it if none exists.
sc = SparkContext.getOrCreate()

In [4]:
# Build (or reuse) the SparkSession used for every DataFrame operation below.
spark = (
    SparkSession.builder
    .appName('PySpark DataFrame From RDD')
    .getOrCreate()
)

# Create PySpark DataFrame from an Existing RDD

In [7]:
# Distribute four (id, value, value, value, value) tuples as an RDD split into 4 partitions.
rdd = sc.parallelize(
    [
        ('C', 85, 76, 87, 91),
        ('B', 85, 76, 87, 91),
        ("A", 85, 78, 96, 92),
        ("A", 92, 76, 89, 96),
    ],
    numSlices=4,
)

In [6]:
# Print the Python class of the object produced by sc.parallelize.
print(type(rdd))

<class 'pyspark.rdd.RDD'>


In [8]:
# Column names for the DataFrame: the person identifier followed by value_1 .. value_4.
sub = ['id_person'] + [f'value_{position}' for position in range(1, 5)]

In [9]:
# Turn the RDD of tuples into a DataFrame, naming the columns with the list in `sub`.
# Only names are supplied, so Spark infers each column's type from the data.
marks_df = spark.createDataFrame(
    data=rdd,
    schema=sub,
)

In [11]:
# Print the Python class of the object returned by spark.createDataFrame.
print(type(marks_df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [10]:
# Print the rows of marks_df as a formatted text table.
marks_df.show()

+---------+-------+-------+-------+-------+
|id_person|value_1|value_2|value_3|value_4|
+---------+-------+-------+-------+-------+
|        C|     85|     76|     87|     91|
|        B|     85|     76|     87|     91|
|        A|     85|     78|     96|     92|
|        A|     92|     76|     89|     96|
+---------+-------+-------+-------+-------+



In [12]:
# Print each column's name, inferred type and nullability.
marks_df.printSchema()

root
 |-- id_person: string (nullable = true)
 |-- value_1: long (nullable = true)
 |-- value_2: long (nullable = true)
 |-- value_3: long (nullable = true)
 |-- value_4: long (nullable = true)



### Operações básicas com PySpark

In [13]:
# Caso não tenha instanciado antes, instanciar o pyspark
!pip install pyspark
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("pysparkdf").getOrCreate()



# Importing Data

In [14]:
# Load the cereal dataset from a comma-separated file in the working directory.
# The first line supplies the column names and Spark infers each column's type.
df = spark.read.csv(
    'cereal.csv',
    sep=',',
    inferSchema=True,
    header=True,
)

In [15]:
# Preview the loaded data; show() prints the first 20 rows by default and truncates long strings.
df.show()

+--------------------+--------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|         Cereal Name|  Manufacturer|Type|Calories|Protein (g)|Fat|Sodium|Dietary Fiber|Carbs|Sugars|Display Shelf|Potassium|Vitamins and Minerals|Serving Size Weight|Cups per Serving|
+--------------------+--------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|           100%_Bran|       Nabisco|   C|      70|          4|  1|   130|         10.0|  5.0|     6|            3|      280|                   25|                1.0|            0.33|
|   100%_Natural_Bran|   Quaker Oats|   C|     120|          3|  5|    15|          2.0|  8.0|     8|            3|      135|                    0|                1.0|            -1.0|
|            All-Bran|      Kelloggs|   C|      70|          4|  1|   260| 

In [16]:
# Inspect the column types that inferSchema derived from the CSV.
df.printSchema()

root
 |-- Cereal Name: string (nullable = true)
 |-- Manufacturer: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Calories: integer (nullable = true)
 |-- Protein (g): integer (nullable = true)
 |-- Fat: integer (nullable = true)
 |-- Sodium: integer (nullable = true)
 |-- Dietary Fiber: double (nullable = true)
 |-- Carbs: double (nullable = true)
 |-- Sugars: integer (nullable = true)
 |-- Display Shelf: integer (nullable = true)
 |-- Potassium: integer (nullable = true)
 |-- Vitamins and Minerals: integer (nullable = true)
 |-- Serving Size Weight: double (nullable = true)
 |-- Cups per Serving: double (nullable = true)



# Select()

In [19]:
# Project only the three columns of interest and display them.
(
    df
    .select('Cereal Name', 'Manufacturer', 'Calories')
    .show()
)

+--------------------+--------------+--------+
|         Cereal Name|  Manufacturer|Calories|
+--------------------+--------------+--------+
|           100%_Bran|       Nabisco|      70|
|   100%_Natural_Bran|   Quaker Oats|     120|
|            All-Bran|      Kelloggs|      70|
|All-Bran_with_Ext...|      Kelloggs|      50|
|      Almond_Delight|Ralston Purina|     110|
|Apple_Cinnamon_Ch...| General Mills|     110|
|         Apple_Jacks|      Kelloggs|     110|
|             Basic_4| General Mills|     130|
|           Bran_Chex|Ralston Purina|      90|
|         Bran_Flakes|          Post|      90|
|        Cap'n'Crunch|   Quaker Oats|     120|
|            Cheerios| General Mills|     110|
|Cinnamon_Toast_Cr...| General Mills|     120|
|            Clusters| General Mills|     110|
|         Cocoa_Puffs| General Mills|     110|
|           Corn_Chex|Ralston Purina|     110|
|         Corn_Flakes|      Kelloggs|     100|
|           Corn_Pops|      Kelloggs|     110|
|       Count

# withColumn()

In [21]:
# Cast 'Calories' to integer and write it back under the name 'calories'.
# The result is not assigned, so `df` itself is unchanged; only the resulting schema is printed.
# In the printed schema 'calories' takes the place of 'Calories' rather than being added as a new column.
(
    df
    .withColumn('calories', df['Calories'].cast("Integer"))
    .printSchema()
)

root
 |-- Cereal Name: string (nullable = true)
 |-- Manufacturer: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- Protein (g): integer (nullable = true)
 |-- Fat: integer (nullable = true)
 |-- Sodium: integer (nullable = true)
 |-- Dietary Fiber: double (nullable = true)
 |-- Carbs: double (nullable = true)
 |-- Sugars: integer (nullable = true)
 |-- Display Shelf: integer (nullable = true)
 |-- Potassium: integer (nullable = true)
 |-- Vitamins and Minerals: integer (nullable = true)
 |-- Serving Size Weight: double (nullable = true)
 |-- Cups per Serving: double (nullable = true)



# groupBy()

In [25]:
# Count how many cereals fall under each distinct calorie value.
(
    df
    .groupBy('calories')
    .count()
    .show()
)

+--------+-----+
|calories|count|
+--------+-----+
|     140|    3|
|     120|    9|
|     100|   17|
|     130|    2|
|      50|    3|
|      80|    1|
|     160|    1|
|      70|    2|
|      90|    7|
|     110|   29|
+--------+-----+



# orderBy()

In [27]:
# Sort by protein content (ascending, the orderBy default) and display the first rows.
df.orderBy('Protein (g)').show()

+--------------------+--------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|         Cereal Name|  Manufacturer|Type|Calories|Protein (g)|Fat|Sodium|Dietary Fiber|Carbs|Sugars|Display Shelf|Potassium|Vitamins and Minerals|Serving Size Weight|Cups per Serving|
+--------------------+--------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|        Cap'n'Crunch|   Quaker Oats|   C|     120|          1|  2|   220|          0.0| 12.0|    12|            2|       35|                   25|                1.0|            0.75|
|Cinnamon_Toast_Cr...| General Mills|   C|     120|          1|  3|   210|          0.0| 13.0|     9|            2|       45|                   25|                1.0|            0.75|
|         Cocoa_Puffs| General Mills|   C|     110|          1|  1|   180| 

In [28]:
# Sort by calories in ascending order and display up to 50 rows.
(
    df
    .orderBy('calories')
    .show(n=50)
)

+--------------------+--------------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|         Cereal Name|        Manufacturer|Type|Calories|Protein (g)|Fat|Sodium|Dietary Fiber|Carbs|Sugars|Display Shelf|Potassium|Vitamins and Minerals|Serving Size Weight|Cups per Serving|
+--------------------+--------------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|All-Bran_with_Ext...|            Kelloggs|   C|      50|          4|  0|   140|         14.0|  8.0|     0|            3|      330|                   25|                1.0|             0.5|
|         Puffed_Rice|         Quaker Oats|   C|      50|          1|  0|     0|          0.0| 13.0|     0|            3|       15|                    0|                0.5|             1.0|
|        Puffed_Wheat|         Quaker Oats|  

# Case When

In [29]:
from pyspark.sql.functions import when

In [37]:
# Label cereals whose 'Vitamins and Minerals' value is at least 25 as "rich in vitamins".
# There is no otherwise() branch, so rows that do not match get NULL.
# The threshold is the integer 25 because the column is an integer in the schema;
# a string literal would only work through Spark's implicit type coercion.
df.select(
    "Cereal Name",
    df['Vitamins and Minerals'],
    when(df['Vitamins and Minerals'] >= 25, "rich in vitamins"),
).show(50)

+--------------------+---------------------+-----------------------------------------------------------------+
|         Cereal Name|Vitamins and Minerals|CASE WHEN (Vitamins and Minerals >= 25) THEN rich in vitamins END|
+--------------------+---------------------+-----------------------------------------------------------------+
|           100%_Bran|                   25|                                                 rich in vitamins|
|   100%_Natural_Bran|                    0|                                                             NULL|
|            All-Bran|                   25|                                                 rich in vitamins|
|All-Bran_with_Ext...|                   25|                                                 rich in vitamins|
|      Almond_Delight|                   25|                                                 rich in vitamins|
|Apple_Cinnamon_Ch...|                   25|                                                 rich in vitamins|
|

# filter()

In [40]:
# Keep only cereals with at least 100 calories and display up to 50 of them.
# 'Calories' is an integer column, so it is compared with the integer 100
# instead of a string that Spark would have to cast implicitly.
df.filter(df.Calories >= 100).show(50)

+--------------------+--------------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|         Cereal Name|        Manufacturer|Type|Calories|Protein (g)|Fat|Sodium|Dietary Fiber|Carbs|Sugars|Display Shelf|Potassium|Vitamins and Minerals|Serving Size Weight|Cups per Serving|
+--------------------+--------------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|   100%_Natural_Bran|         Quaker Oats|   C|     120|          3|  5|    15|          2.0|  8.0|     8|            3|      135|                    0|                1.0|            -1.0|
|      Almond_Delight|      Ralston Purina|   C|     110|          2|  2|   200|          1.0| 14.0|     8|            3|       -1|                   25|                1.0|            0.75|
|Apple_Cinnamon_Ch...|       General Mills|  

# isNull() / isNotNull()

In [41]:
from pyspark.sql.functions import *

In [47]:
# List the rows whose 'Cereal Name' is missing; an empty table means no nulls were found.
(
    df
    .filter(df['Cereal Name'].isNull())
    .show()
)

+-----------+------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
|Cereal Name|Manufacturer|Type|Calories|Protein (g)|Fat|Sodium|Dietary Fiber|Carbs|Sugars|Display Shelf|Potassium|Vitamins and Minerals|Serving Size Weight|Cups per Serving|
+-----------+------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+
+-----------+------------+----+--------+-----------+---+------+-------------+-----+------+-------------+---------+---------------------+-------------------+----------------+

