# Mini projeto 3

Prevendo a cotação de criptomoedas em tempo real com PySpark e Machine Learning

In [1]:
# Importing libraries:
from platform import python_version
print('Versão Python:', python_version(), '\n')

import findspark
findspark.init() # Initialize findspark; this runs before the pyspark imports below

import pyspark
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pyspark import SparkConf, SparkContext # SparkConf: Configuração do Spark; SparkContext: Contexto do Spark
from pyspark.sql import SparkSession, SQLContext # SparkSession: Sessão do Spark; SQLContext: Contexto SQL
from pyspark.sql.types import * # Importando todos os tipos de dados do Spark
from pyspark.sql.functions import * # Importando todas as funções do Spark
from pyspark.ml.linalg import Vectors # Importando o tipo de dado vetor do Spark
from pyspark.ml.feature import StringIndexer # Importando o indexador de string do Spark
from pyspark.ml.regression import LinearRegression # Importando a regressão linear do Spark
from pyspark.mllib.evaluation import RegressionMetrics # Importando as métricas de avaliação do Spark
from pyspark.ml.linalg import Vectors # Importando o tipo de dado vetor do Spark
from pyspark.ml.feature import StringIndexer # Importando o indexador de string do Spark
from pyspark.ml.stat import Correlation # Importando a correlação do Spark
from pyspark.ml.feature import MinMaxScaler # Importando o escalonador do Spark
from pyspark.ml.feature import VectorAssembler # Importando o montador de vetores do Spark
from pyspark.ml import Pipeline # Importando o pipeline do Spark
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel # Importando o otimizador de hiperparâmetros do Spark
from pyspark.ml.feature import VectorAssembler, StandardScaler # Importando o montador de vetores e o escalonador do Spark
from pyspark.ml.evaluation import RegressionEvaluator # Importando o avaliador de regressão do Spark

# (Re)load the watermark IPython extension.
%reload_ext watermark
# Print the author name and the versions of the imported packages.
%watermark -a "gustavogzr" --iversions

Versão Python: 3.11.1 

Author: gustavogzr

seaborn   : 0.12.2
pandas    : 2.0.3
numpy     : 1.25.2
matplotlib: 3.7.2
decimal   : 1.70
pyspark   : 3.5.2
findspark : 2.0.1
sys       : 3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]



In [2]:
# Output formatting: widen the pandas display limits.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)
# Suppress warning messages from matplotlib's axes module.
# The logger is looked up by name through the public logging API instead of
# importing the private `_log` attribute from `matplotlib.axes._axes`.
import logging
matplotlib_axes_logger = logging.getLogger('matplotlib.axes._axes')
matplotlib_axes_logger.setLevel('ERROR') # Only ERROR and above are emitted

## Preparando o ambiente Spark

In [3]:
# Set the random seed so that results are reproducible.
rnd_seed = 23
# Call the seeding function. The previous code assigned the integer to
# `np.random.seed` and `np.random.set_state`, which replaced those functions
# with the number 23 and did not seed the generator at all.
np.random.seed(rnd_seed)

In [4]:
# Create the Spark context (or reuse the one that is already running).
# appName is the name given to this Spark application.
# getOrCreate keeps the cell re-runnable: calling SparkContext(...) again while
# a context is active raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setAppName('Mini-Projeto-3'))

In [5]:
# Obtain the Spark session through the builder entry point.
# Author's note: the difference between a context and a session is that the
# session wraps the context and adds information about the application.
spark_session = SparkSession.builder.getOrCreate()

In [6]:
# Display the session that was created:
spark_session

## Carregando os dados

In [7]:
# Load the CSV through the Spark session.
# header: the first line holds the column names.
# inferSchema: Spark infers the data type of each column.
# NOTE(review): the path starts with the dot-prefixed directory '.arquivos_DSA',
# not './arquivos_DSA' — confirm this is intentional.
dataset_path = '.arquivos_DSA/dados/dataset.csv'
df_spark = (
    spark_session.read
    .options(header=True, inferSchema=True)
    .csv(dataset_path)
)

In [8]:
# Object type of the loaded data:
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [9]:
# Preview the data:
df_spark.show()

+----------+----+----+----+-----+------------+-----------------+--------------+
| Timestamp|Open|High| Low|Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|
+----------+----+----+----+-----+------------+-----------------+--------------+
|1325317920|4.39|4.39|4.39| 4.39|  0.45558087|     2.0000000193|          4.39|
|1325319300| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319360| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319420| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319480| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319540| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319600| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319660| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325345040| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325345100| NaN| NaN| NaN|  NaN|       

In [10]:
# Show the metadata (column names and types):
df_spark.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume_(BTC): double (nullable = true)
 |-- Volume_(Currency): double (nullable = true)
 |-- Weighted_Price: double (nullable = true)



In [11]:
# Check the number of rows:
df_spark.count()

4856600

## Data Wrangling com SparkSQL (Manipulação de Dados)

In [12]:
# Create a temporary table (view) from the DataFrame:
df_spark.createOrReplaceTempView('dados_bitcoin')
# The temporary table is a temporary view that can be used to run SQL queries.
# Author's note: using the DataFrame directly for SQL queries is avoided.

In [13]:
# Run a SQL query against the temporary view, adding a `dateTime` column
# derived from the epoch `timestamp` column.
query_with_datetime = 'SELECT *, from_unixtime(timestamp) as `dateTime` FROM dados_bitcoin'
# Adjust the time zone of the new column.
# NOTE(review): from_unixtime formats the epoch in the Spark session time zone and
# from_utc_timestamp then treats that value as UTC. In the saved output,
# 1325317920 (07:52 UTC) is shown as 23:52 the previous day, i.e. 8 hours behind
# UTC rather than 6. The result is only true UTC-6 when the session time zone is
# UTC — confirm the intended zone.
df_bitcoin = spark_session.sql(query_with_datetime).withColumn(
    "dateTime",
    from_utc_timestamp(col("dateTime"), "UTC-6"),
)
df_bitcoin.show(5)

+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
| Timestamp|Open|High| Low|Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|           dateTime|
+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
|1325317920|4.39|4.39|4.39| 4.39|  0.45558087|     2.0000000193|          4.39|2011-12-30 23:52:00|
|1325319300| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 00:15:00|
|1325319360| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 00:16:00|
|1325319420| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 00:17:00|
|1325319480| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 00:18:00|
+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
only showing top 5 rows



In [14]:
# Check the object type of df_bitcoin:
type(df_bitcoin)

pyspark.sql.dataframe.DataFrame

In [15]:
# Remove rows with missing values (NaN): a row is dropped when any of its
# columns is missing.
df_bitcoin = df_bitcoin.na.drop(how='any')
df_bitcoin.show(5)

+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
| Timestamp|Open|High| Low|Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|           dateTime|
+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
|1325317920|4.39|4.39|4.39| 4.39|  0.45558087|     2.0000000193|          4.39|2011-12-30 23:52:00|
|1325346600|4.39|4.39|4.39| 4.39|        48.0|           210.72|          4.39|2011-12-31 07:50:00|
|1325350740| 4.5|4.57| 4.5| 4.57| 37.86229723|     171.38033753|  4.5264114983|2011-12-31 08:59:00|
|1325350800|4.58|4.58|4.58| 4.58|         9.0|            41.22|          4.58|2011-12-31 09:00:00|
|1325391360|4.58|4.58|4.58| 4.58|       1.502|          6.87916|          4.58|2011-12-31 20:16:00|
+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
only showing top 5 rows



In [16]:
# Check the number of rows:
df_bitcoin.count()

3613769

In [17]:
# Columns of the DataFrame:
df_bitcoin.columns

['Timestamp',
 'Open',
 'High',
 'Low',
 'Close',
 'Volume_(BTC)',
 'Volume_(Currency)',
 'Weighted_Price',
 'dateTime']

In [18]:
# Rename columns to make the data easier to manipulate.
# Each entry maps the current column name to its new name.
column_renames = {
    'Volume_(BTC)': 'Vol_BTC',
    'Volume_(Currency)': 'Vol_Currency',
}
for current_name, new_name in column_renames.items():
    df_bitcoin = df_bitcoin.withColumnRenamed(current_name, new_name)
df_bitcoin.show(5)

+----------+----+----+----+-----+-----------+------------+--------------+-------------------+
| Timestamp|Open|High| Low|Close|    Vol_BTC|Vol_Currency|Weighted_Price|           dateTime|
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-30 23:52:00|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31 07:50:00|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31 08:59:00|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31 09:00:00|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2011-12-31 20:16:00|
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+
only showing top 5 rows



In [19]:
# Split the dateTime column on the space character to extract the date
# (first piece) and the time (second piece) into their own columns.
date_time_parts = split(col('dateTime'), ' ')
df_data = (
    df_bitcoin
    .withColumn('date', date_time_parts.getItem(0))
    .withColumn('time', date_time_parts.getItem(1))
)
df_data.show(5)

+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+
| Timestamp|Open|High| Low|Close|    Vol_BTC|Vol_Currency|Weighted_Price|           dateTime|      date|    time|
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-30 23:52:00|2011-12-30|23:52:00|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31 07:50:00|2011-12-31|07:50:00|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31 08:59:00|2011-12-31|08:59:00|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31 09:00:00|2011-12-31|09:00:00|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2011-12-31 20:16:00|2011-12-31|20:16:00|
+----------+----+----+----+-----+-----------+------------+--------------+---------------

In [20]:
# Check the object type of df_data:
type(df_data)

pyspark.sql.dataframe.DataFrame

In [21]:
# Show the schema after adding the date and time columns:
df_data.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Vol_BTC: double (nullable = true)
 |-- Vol_Currency: double (nullable = true)
 |-- Weighted_Price: double (nullable = true)
 |-- dateTime: timestamp (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)



In [22]:
# Extract the hour: the first piece of the `time` column when split on ':'.
hour_expr = split(col('time'), ':').getItem(0)
df_data_hora = df_data.withColumn('hour', hour_expr)
df_data_hora.show(5)

+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+----+
| Timestamp|Open|High| Low|Close|    Vol_BTC|Vol_Currency|Weighted_Price|           dateTime|      date|    time|hour|
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+----+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-30 23:52:00|2011-12-30|23:52:00|  23|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31 07:50:00|2011-12-31|07:50:00|  07|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31 08:59:00|2011-12-31|08:59:00|  08|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31 09:00:00|2011-12-31|09:00:00|  09|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2011-12-31 20:16:00|2011-12-31|20:16:00|  20|
+----------+----+----+----+-----+-----------+---

In [23]:
# Show the schema after adding the hour column:
df_data_hora.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Vol_BTC: double (nullable = true)
 |-- Vol_Currency: double (nullable = true)
 |-- Weighted_Price: double (nullable = true)
 |-- dateTime: timestamp (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- hour: string (nullable = true)



In [24]:
# Stop the Spark session:
spark_session.stop()
# Stop the Spark context:
sc.stop()