# Importando dependências

In [None]:
import time
from functools import reduce

import pyspark.sql.functions as sf
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, lit
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

from feature_store import FeatureStore, Catalog

# Pré-Tratamento dos Dados

In [None]:
# Load the cleaned dataset from the parquet location.
df = (
    spark.read
    .parquet("gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean.parquet")
)

In [None]:
# Total number of rows in df; later cells use it as the denominator for percentages.
total = df.count()

                                                                                

In [None]:
# Display the total row count.
total

184050329

In [None]:
# Number of rows with flagBaixaRenda == 1.
qtd_baixa_renda = df.where(sf.col("flagBaixaRenda") == 1).count()
print(qtd_baixa_renda)



110168045


                                                                                

In [None]:
# Number of rows with flagMediaRenda == 1.
qtd_media_renda = df.where(sf.col("flagMediaRenda") == 1).count()
print(qtd_media_renda)



39328358


                                                                                

In [None]:
# Number of rows with flagAltaRenda == 1.
qtd_alta_renda = df.where(sf.col("flagAltaRenda") == 1).count()
print(qtd_alta_renda)



12096582


                                                                                

In [None]:
# Columns scanned for zeros. The helper column itself is excluded: df is
# reassigned at the end of this cell, so on a re-run df.columns would already
# contain "qtd_zeros" and the count would include the helper column.
# NOTE(review): every other column is scanned, including the flag* columns —
# confirm that is intended.
cols = [c for c in df.columns if c != "qtd_zeros"]

# Expression that adds 1 for every column whose value equals 0 in a row.
zero_count_expr = reduce(
    lambda a, b: a + b,
    [when(col(c) == 0, 1).otherwise(0) for c in cols]
)

# Attach the per-row zero count as "qtd_zeros".
df = df.withColumn("qtd_zeros", zero_count_expr)

In [None]:
# Share of the whole dataset (in %, 5 decimals) held by each qtd_zeros bucket.
proportion_pct = sf.round((sf.col('count') / sf.lit(total)) * 100, 5)

# Row count per qtd_zeros value, highest qtd_zeros first.
df_group = (
    df.groupBy('qtd_zeros')
    .count()
    .withColumn('proportion', proportion_pct)
    .orderBy(sf.col("qtd_zeros").desc())
)

In [None]:
# Running window from the highest qtd_zeros bucket down to the current row.
# There is no partitionBy, so Spark warns that all rows move to a single
# partition; df_group holds one row per distinct qtd_zeros value, so the
# window only sees the aggregated table.
janela = Window.orderBy(sf.desc("qtd_zeros")).rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Cumulative sum of the "proportion" column, rounded to the same 5 decimals as
# "proportion" so float noise (e.g. 50.658300000000004) does not reach the table.
df_group = df_group.withColumn(
    "proportion_acumulado",
    sf.round(sf.sum("proportion").over(janela), 5),
)

In [None]:
# Print the first 20 buckets (the default page size), with long cells truncated.
df_group.show(n=20, truncate=True)

25/04/10 18:54:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/10 18:54:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/10 18:54:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/10 18:55:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/10 18:55:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/10 18:55:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/10 1

+---------+--------+----------+--------------------+
|qtd_zeros|   count|proportion|proportion_acumulado|
+---------+--------+----------+--------------------+
|      112|      37|    2.0E-5|              2.0E-5|
|      111| 8762382|   4.76086|             4.76088|
|      110|22289631|  12.11062|             16.8715|
|      109| 9768417|   5.30747|            22.17897|
|      108|18928700|  10.28452|            32.46349|
|      107|17102955|   9.29254|            41.75603|
|      106|16384661|   8.90227|  50.658300000000004|
|      105|16783035|   9.11872|   59.77702000000001|
|      104|15209925|     8.264|            68.04102|
|      103|13153932|   7.14692|            75.18794|
|      102|11222677|   6.09761|            81.28555|
|      101| 9329709|   5.06911|            86.35466|
|      100| 7571079|   4.11359|            90.46825|
|       99| 5596837|   3.04093|            93.50918|
|       98| 3773757|   2.05039|   95.55957000000001|
|       97| 2483878|   1.34956|            96.

                                                                                

In [None]:
# Number of rows whose zero count is below 100.
df.where(sf.col('qtd_zeros') < 100).count()

                                                                                

17543189

In [None]:
# NOTE(review): 13674989 does not match any count shown in this notebook (the
# qtd_zeros < 100 count is 17543189) and this cell has no output — it looks like
# a leftover from an earlier run; confirm or remove.
(13674989/total)*100

In [None]:
# Share (%) of rows with fewer than 100 zero-valued columns. Computed from df
# instead of a literal pasted from an earlier output, so the figure stays
# correct when the input data changes (at the cost of one more Spark count).
(df.filter(sf.col('qtd_zeros') < 100).count() / total) * 100

9.531734659382217

In [None]:
# One filtered DataFrame per band flag (rows where that flag equals 1).
baixa_renda, media_renda, alta_renda = (
    df.where(sf.col(flag) == 1)
    for flag in ("flagBaixaRenda", "flagMediaRenda", "flagAltaRenda")
)

In [None]:
# Rows of the flagBaixaRenda subset with qtd_zeros below 100.
qtd_baixa_reduzido = baixa_renda.where(sf.col('qtd_zeros') < 100).count()
print(qtd_baixa_reduzido)



4783994


                                                                                

In [None]:
# Rows of the flagMediaRenda subset with qtd_zeros below 100.
qtd_media_reduzido = media_renda.where(sf.col('qtd_zeros') < 100).count()
print(qtd_media_reduzido)



7152189


                                                                                

In [None]:
# Rows of the flagAltaRenda subset with qtd_zeros below 100.
qtd_alta_reduzido = alta_renda.where(sf.col('qtd_zeros') < 100).count()
print(qtd_alta_reduzido)



5604459


                                                                                

In [None]:
# Persist the reduced version of each band: rows with qtd_zeros below 100,
# without the qtd_zeros helper column, overwriting any previous export.
# NOTE(review): the flag* columns are kept in these files, unlike the
# non-reduced exports later in the notebook — confirm this is intended.
exportacoes_reduzidas = (
    ("BAIXA RENDA", baixa_renda,
     "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_baixaRenda_reduzido.parquet"),
    ("MEDIA RENDA", media_renda,
     "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_mediaRenda_reduzido.parquet"),
    ("ALTA RENDA", alta_renda,
     "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_altaRenda_reduzido.parquet"),
)

for rotulo, segmento, destino in exportacoes_reduzidas:
    print(rotulo)
    reduzido = segmento.filter(sf.col('qtd_zeros')<100).drop("qtd_zeros")
    reduzido.write.mode("overwrite").parquet(destino)

BAIXA RENDA


                                                                                

MEDIA RENDA


                                                                                

ALTA RENDA


                                                                                

In [None]:
# Columns removed before persisting: the three band flags plus the "qtd_zeros"
# helper column. The helper is added to df earlier in the notebook and would
# otherwise leak into these files; DataFrame.drop ignores names that are not in
# the schema, so this is also safe if the cell runs before that column exists.
colunas_descartadas = ("flagBaixaRenda", "flagMediaRenda", "flagAltaRenda", "qtd_zeros")

# (label printed for progress, flag selecting the band, output location)
exportacoes = (
    ("BAIXA RENDA", "flagBaixaRenda",
     "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_baixaRenda.parquet"),
    ("MEDIA RENDA", "flagMediaRenda",
     "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_mediaRenda.parquet"),
    ("ALTA RENDA", "flagAltaRenda",
     "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_altaRenda.parquet"),
)

# Write one full (non-reduced) dataset per band, overwriting previous exports.
for rotulo, flag, destino in exportacoes:
    print(rotulo)
    (df
     .filter(sf.col(flag)==1)
     .drop(*colunas_descartadas)
     .write.mode("overwrite")
     .parquet(destino)
    )

BAIXA RENDA


                                                                                

MEDIA RENDA


                                                                                

ALTA RENDA


                                                                                