In [0]:
import DadosIO as Db
import Function as ut_f
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:


df = ut_f.filtro_ultimas_particoes(spark, 
                                           'sandbox.silver_acoes_disney', 
                                           nm_particao='dt_carga', 
                                           qtd_dias_filtro=3, 
                                           delta=True)

print(df.head())


Row(adj_cluster=98.87271118164062, close=102.47000122070312, High=102.95999908447266, low=101.80999755859375, open=101.91000366210938, volume=7559200, dt_carga='20241105')


In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

feature_columns = df.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")
df = assembler.transform(df)


In [0]:
train, test = df.randomSplit([0.7, 0.3])


In [0]:
lr = LinearRegression(featuresCol = 'features', labelCol='volume')
lr_model = lr.fit(train)


In [0]:
predictions = lr_model.transform(test)
predictions.select("prediction","volume","features").show()

test_result = lr_model.evaluate(test)
print("R Squared (R^2) no conjunto de teste = %g" % test_result.r2)


+--------------------+--------+--------------------+
|          prediction|  volume|            features|
+--------------------+--------+--------------------+
|1.4867900000000011E7|14867900|[79.4235458374023...|
|   9372500.000000019| 9372500|[79.9593124389648...|
|1.0291200000000015E7|10291200|[80.0486068725586...|
|1.1034400000000015E7|11034400|[80.3760299682617...|
|1.1257600000000015E7|11257600|[80.4157180786132...|
|2.6579100000000004E7|26579100|[81.8742065429687...|
|1.1499500000000017E7|11499500|[81.9138946533203...|
|1.5375100000000013E7|15375100|[82.2909240722656...|
|1.4316800000000013E7|14316800|[82.5984954833984...|
| 1.659460000000001E7|16594600|[83.0251312255859...|
|1.1816600000000011E7|11816600|[83.6204299926757...|
|1.0808000000000013E7|10808000|[83.6898880004882...|
|   9554900.000000015| 9554900|[83.6898880004882...|
|1.1210700000000013E7|11210700|[83.7394943237304...|
|2.6942599999999996E7|26942600|[83.8387145996093...|
|1.0047000000000015E7|10047000|[84.01730346679

In [0]:
import mlflow
import mlflow.spark


In [0]:
experiment_name = "/teste_modelo_fiap"
mlflow.set_experiment(experiment_name)


2024/11/05 16:05:11 INFO mlflow.tracking.fluent: Experiment with name '/teste_modelo_fiap' does not exist. Creating a new experiment.


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/223413049705617', creation_time=1730822711939, experiment_id='223413049705617', last_update_time=1730822711939, lifecycle_stage='active', name='/teste_modelo_fiap', tags={'mlflow.experiment.sourceName': '/teste_modelo_fiap',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 't794746@f1rst.com.br',
 'mlflow.ownerId': '1353537123434921'}>

In [0]:
with mlflow.start_run():
    lr = LinearRegression(featuresCol = 'features', labelCol='volume')
    lr_model = lr.fit(train)

    mlflow.spark.log_model(lr_model, "model")

    test_result = lr_model.evaluate(test)
    mlflow.log_metric("r2", test_result.r2)

    #o codigo abaixo registra o modelo
    mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/model", "model")


2024/11/05 16:05:44 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


In [0]:

# !pip install featuretools
# dbutils.library.restartPython()  

# import featuretools as ft

# es = ft.EntitySet(id = 'volume')
# es.entity_from_dataframe(entity_id = 'data', dataframe = df, 
#                          make_index = True, index = 'index')

# features, feature_defs = ft.dfs(entityset = es, target_entity = 'data',
#                                  trans_primitives = ['add_numeric', 'multiply_numeric'])



[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
Collecting featuretools
  Obtaining dependency information for featuretools from https://files.pythonhosted.org/packages/d4/8c/9e000dfc47b28f482ca7742e11b2f44071cc1981429fd88c922eb1172167/featuretools-1.31.0-py3-none-any.whl.metadata
  Downloading featuretools-1.31.0-py3-none-any.whl.metadata (15 kB)
Collecting holidays>=0.17 (from featuretools)
  Obtaining dependency information for holidays>=0.17 from https://files.pythonhosted.org/packages/52/99/de904580147b409352a98b92ed7d756c500d0818f5f759df741c3b6fa319/holidays-0.60-py3-none-any.whl.metadata
  Downloading holidays-0.60-py3-none-any.whl.metadata (25 kB)
Collecting numpy>=1.25.0 (from featuretools)
  Obtaining dependency information for numpy>=1.25.0 from https://files.pythonhosted.org/packages/7a/f0/80811e836484262b236c684a75dfc4ba0424bc670e765afaa911468d9f39/numpy-2.1.3-cp311-cp311-manylinux_2_17_x86

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-223413049705618>, line 7[0m
[1;32m      4[0m [38;5;28;01mimport[39;00m [38;5;21;01mfeaturetools[39;00m [38;5;28;01mas[39;00m [38;5;21;01mft[39;00m
[1;32m      6[0m es [38;5;241m=[39m ft[38;5;241m.[39mEntitySet([38;5;28mid[39m [38;5;241m=[39m [38;5;124m'[39m[38;5;124mvolume[39m[38;5;124m'[39m)
[0;32m----> 7[0m es[38;5;241m.[39mentity_from_dataframe(entity_id [38;5;241m=[39m [38;5;124m'[39m[38;5;124mdata[39m[38;5;124m'[39m, dataframe [38;5;241m=[39m df, 
[1;32m      8[0m                          make_index [38;5;241m=[39m [38;5;28;01mTrue[39;00m, index [38;5;241m=[39m [38;5;124m'[39m[38;5;124mindex[39m[38;5;124m'[39m)
[1;32m     10[0m features, feature_defs [38;5;241m=[39m ft[38;5;241m.[39mdfs(entityset [38;5;241m=[39m es, target_enti