In [1]:
# Importando as bibliotecas necessárias
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, col
from pyspark.ml.feature import StringIndexer, VectorAssembler, Normalizer, PCA
from pyspark.ml.regression import LinearRegression


In [2]:
# Criando a Spark Session
spark = SparkSession.builder \
    .appName("Preparação de Dados para Modelos Preditivos") \
    .getOrCreate()

from pyspark.sql import SparkSession
from pyspark.sql.functions import month, col


In [3]:
# 1. Iniciando a SparkSession
spark = SparkSession.builder.master("local[*]").appName("Data Preparation").getOrCreate()

# Lendo o arquivo correto
df_video = spark.read.parquet("videos-comments-tratados.snappy.parquet")

# Confirmando o carregamento dos dados
df_video.printSchema()
df_video.show(5)


root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Sentiment: integer (nullable = true)
 |-- Likes Comment: integer (nullable = true)

+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments| Views|Interaction|Year|             Comment|Sentiment|Likes Comment|
+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|202

In [4]:
# 2. Adicionando a coluna 'Month'
df_video = df_video.withColumn("Month", month(col("Published At")))
df_video.show(5)


+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+-----+
|   Video ID|               Title|Published At|Keyword|Likes|Comments| Views|Interaction|Year|             Comment|Sentiment|Likes Comment|Month|
+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+-----+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Let's not forget ...|        1|           95|    8|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Here in NZ 50% of...|        0|           19|    8|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|I will forever ac...|        2|          161|    8|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Whenever I go to ...|        0

In [6]:
# 3. Adicionando a coluna "Keyword Index" com a transformação da coluna 'keyword' para valores numéricos
from pyspark.sql.functions import lit

# Adicionando uma coluna "keyword" com valores fictícios
df_video = df_video.withColumn("keyword", lit("default_value"))

from pyspark.ml.feature import StringIndexer

# Transformar a coluna "keyword" em valores numéricos
indexer = StringIndexer(inputCol="keyword", outputCol="Keyword Index")
df_video = indexer.fit(df_video).transform(df_video)

# Exibir os dados após a transformação
df_video.show(5)

from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# Converter a coluna "Year" para inteiro
df_video = df_video.withColumn("Year", col("Year").cast("int"))


+-----------+--------------------+------------+-------------+-----+--------+------+-----------+----+--------------------+---------+-------------+-----+-------------+
|   Video ID|               Title|Published At|      keyword|Likes|Comments| Views|Interaction|Year|             Comment|Sentiment|Likes Comment|Month|Keyword Index|
+-----------+--------------------+------------+-------------+-----+--------+------+-----------+----+--------------------+---------+-------------+-----+-------------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|default_value| 3407|     672|135612|     139691|2022|Let's not forget ...|        1|           95|    8|          0.0|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|default_value| 3407|     672|135612|     139691|2022|Here in NZ 50% of...|        0|           19|    8|          0.0|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|default_value| 3407|     672|135612|     139691|2022|I will forever ac...|        2|          161|    8|          0.0|
|wAZ

In [7]:
# Criar o vetor "Features"
assembler = VectorAssembler(inputCols=["Likes", "Views", "Year", "Month", "Keyword Index"], outputCol="Features")
df_video = assembler.transform(df_video)

# Normalizar os dados da coluna "Features"
scaler = MinMaxScaler(inputCol="Features", outputCol="Features Normal")
scaler_model = scaler.fit(df_video)
df_video = scaler_model.transform(df_video)

# Exibir o DataFrame resultante
df_video.select("Features", "Features Normal").show(truncate=False)


# Filtrando valores nulos para normalização
df_video = df_video.na.drop(subset=["Features"])


+----------------------------------+---------------------------------------------------------------------+
|Features                          |Features Normal                                                      |
+----------------------------------+---------------------------------------------------------------------+
|[3407.0,135612.0,2022.0,8.0,0.0]  |[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.5]|
|[3407.0,135612.0,2022.0,8.0,0.0]  |[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.5]|
|[3407.0,135612.0,2022.0,8.0,0.0]  |[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.5]|
|[3407.0,135612.0,2022.0,8.0,0.0]  |[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.5]|
|[3407.0,135612.0,2022.0,8.0,0.0]  |[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.5]|
|[3407.0,135612.0,2022.0,8.0,0.0]  |[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.5]|
|[3407.0,135612.0,2022.0,8.0,0.0]  |[

In [8]:
# 6. Adicionando a coluna "Features PCA" com a redução de 5 características para 1, utilizando o modelo PCA
pca = PCA(k=1, inputCol="Features", outputCol="Features PCA")
pca_model = pca.fit(df_video)
df_video = pca_model.transform(df_video)


In [9]:
# 7. Separando o dataframe df_video em 2 conjuntos: 80% para treinamento e 20% para teste
train_data, test_data = df_video.randomSplit([0.8, 0.2], seed=42)


In [10]:
# 8. Criando um modelo de regressão linear para estimar o valor do campo "Comments", utilizando a "Features Normal"
lr = LinearRegression(featuresCol="Features Normal", labelCol="Comments", predictionCol="Prediction")
lr_model = lr.fit(train_data)

# Avaliando o modelo
evaluation = lr_model.evaluate(test_data)
print(f"RMSE: {evaluation.rootMeanSquaredError}")
print(f"R2: {evaluation.r2}")


RMSE: 25380.24599243739
R2: 0.6599757712884147


In [12]:
# Finalizando a Spark Session
spark.stop()