<a href="https://colab.research.google.com/github/dmarcovini07/python-para-ciberseguranca/blob/main/aula01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark


In [None]:
# Import the SparkSession entry point from the pyspark library
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session named "ColabPySparkTutorial"
spark = (
    SparkSession.builder
    .appName("ColabPySparkTutorial")
    .getOrCreate()
)

In [None]:
from pyspark.sql import Row

# Fictitious sales dataset, one tuple per sale: (id, product, Price)
sales_records = [
    (1, "A", 10.1),
    (2, "B", 20.1),
    (3, "A", 30.2),
    (4, "C", 40.3),
    (5, "B", 50.5),
]

# Turn each tuple into a Row so Spark can infer column names and types
data = [
    Row(id=sale_id, product=product, Price=price)
    for sale_id, product, price in sales_records
]

df = spark.createDataFrame(data)
df.show()

+---+-------+-----+
| id|product|Price|
+---+-------+-----+
|  1|      A| 10.1|
|  2|      B| 20.1|
|  3|      A| 30.2|
|  4|      C| 40.3|
|  5|      B| 50.5|
+---+-------+-----+



In [None]:
# Print the DataFrame's schema: column names, types and nullability
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- product: string (nullable = true)
 |-- Price: double (nullable = true)



In [None]:
# Count how many rows have product "A"
conte_a = df.filter(df["product"] == "A").count()
print(conte_a)

2


In [None]:
# Sum of Price for each product
price_sum_by_product = df.groupBy("product").sum("Price")
price_sum_by_product.show()

+-------+----------+
|product|sum(Price)|
+-------+----------+
|      B|      70.6|
|      A|      40.3|
|      C|      40.3|
+-------+----------+



In [None]:
from pyspark.sql.functions import when, col

# Multiplier applied to the price: 0.9 when Price is above 30, otherwise 1
discount_factor = when(col("Price") > 30, 0.9).otherwise(1)

df_with_discount = df.withColumn("discount", discount_factor)
df_with_discount.show()

+---+-------+-----+--------+
| id|product|Price|discount|
+---+-------+-----+--------+
|  1|      A| 10.1|     1.0|
|  2|      B| 20.1|     1.0|
|  3|      A| 30.2|     0.9|
|  4|      C| 40.3|     0.9|
|  5|      B| 50.5|     0.9|
+---+-------+-----+--------+



In [None]:
# Final amount per row: the price multiplied by its discount factor
discounted_price = col("Price") * col("discount")

df_with_total = df_with_discount.withColumn("total", discounted_price)
df_with_total.show()

+---+-------+-----+--------+------------------+
| id|product|Price|discount|             total|
+---+-------+-----+--------+------------------+
|  1|      A| 10.1|     1.0|              10.1|
|  2|      B| 20.1|     1.0|              20.1|
|  3|      A| 30.2|     0.9|             27.18|
|  4|      C| 40.3|     0.9|36.269999999999996|
|  5|      B| 50.5|     0.9|             45.45|
+---+-------+-----+--------+------------------+



In [None]:
# Order the rows from the highest to the lowest total
total_descending = col("total").desc()

df_sorted = df_with_total.orderBy(total_descending)
df_sorted.show()

+---+-------+-----+--------+------------------+
| id|product|Price|discount|             total|
+---+-------+-----+--------+------------------+
|  5|      B| 50.5|     0.9|             45.45|
|  4|      C| 40.3|     0.9|36.269999999999996|
|  3|      A| 30.2|     0.9|             27.18|
|  2|      B| 20.1|     1.0|              20.1|
|  1|      A| 10.1|     1.0|              10.1|
+---+-------+-----+--------+------------------+



In [None]:
# Keep only the rows whose product is "A" or "B"
df_ab = df_sorted.filter(col("product").isin("A", "B"))
df_ab.show()

+---+-------+-----+--------+-----+
| id|product|Price|discount|total|
+---+-------+-----+--------+-----+
|  5|      B| 50.5|     0.9|45.45|
|  3|      A| 30.2|     0.9|27.18|
|  2|      B| 20.1|     1.0| 20.1|
|  1|      A| 10.1|     1.0| 10.1|
+---+-------+-----+--------+-----+



In [None]:
# Average Price for each product
price_avg_by_product = df.groupBy("product").avg("Price")
price_avg_by_product.show()

+-------+----------+
|product|avg(Price)|
+-------+----------+
|      B|      35.3|
|      A|     20.15|
|      C|      40.3|
+-------+----------+



In [None]:
# Return a new DataFrame with the "product" column renamed to "item"
df_renamed = df.withColumnRenamed("product", "item")
df_renamed.show()

+---+----+-----+
| id|item|Price|
+---+----+-----+
|  1|   A| 10.1|
|  2|   B| 20.1|
|  3|   A| 30.2|
|  4|   C| 40.3|
|  5|   B| 50.5|
+---+----+-----+



In [None]:
# Save the sorted rows as CSV. Spark's default save mode raises an error when
# the target path already exists, so re-running this cell would fail;
# "overwrite" makes the cell safe to run repeatedly.
df_sorted.write.mode("overwrite").csv("sorted_data.csv")

In [None]:
from pyspark.sql import Row

In [None]:
# Second fictitious dataset, one tuple per sale: (id, product, amount, category)
category_sales = [
    (1, "A", 10, "eletrônicos"),
    (2, "B", 20, "vestuário"),
    (3, "A", 30, "eletrônicos"),
    (4, "C", 40, "alimentos"),
    (5, "B", 50, "vestuário"),
]

# NOTE: this rebinds `data` and `df`, replacing the earlier Price-based dataset
data = [
    Row(id=sale_id, product=product, amount=amount, category=category)
    for sale_id, product, amount, category in category_sales
]

df = spark.createDataFrame(data)

df.show()

+---+-------+------+-----------+
| id|product|amount|   category|
+---+-------+------+-----------+
|  1|      A|    10|eletrônicos|
|  2|      B|    20|  vestuário|
|  3|      A|    30|eletrônicos|
|  4|      C|    40|  alimentos|
|  5|      B|    50|  vestuário|
+---+-------+------+-----------+

