Clean logging #83

Merged (2 commits, Sep 30, 2021)
2 changes: 2 additions & 0 deletions .gitignore
@@ -10,6 +10,7 @@ coverage.xml
scala/tempo/target
scala/tempo/project/target/
scala/tempo/project/project/target/
.bsp

# local delta tables
**/spark-warehouse
@@ -19,6 +20,7 @@ scala/tempo/project/project/target/
**/dist
**/htmlcov
**/tempo.egg-info
**/dbl_tempo.egg-info

## Python related files
*.pyc
183 changes: 0 additions & 183 deletions python/dbl_tempo.egg-info/PKG-INFO

This file was deleted.

11 changes: 0 additions & 11 deletions python/dbl_tempo.egg-info/SOURCES.txt

This file was deleted.

1 change: 0 additions & 1 deletion python/dbl_tempo.egg-info/dependency_links.txt

This file was deleted.

3 changes: 0 additions & 3 deletions python/dbl_tempo.egg-info/requires.txt

This file was deleted.

1 change: 0 additions & 1 deletion python/dbl_tempo.egg-info/top_level.txt

This file was deleted.

10 changes: 6 additions & 4 deletions python/tempo/io.py
@@ -1,6 +1,9 @@
import pyspark.sql.functions as f
import os
import logging
from collections import deque

import pyspark.sql.functions as f

def write(tsdf, spark, tabName, optimizationCols = None):
"""
param: tsdf: input TSDF object to write
@@ -18,7 +21,6 @@ def write(tsdf, spark, tabName, optimizationCols = None):
else:
optimizationCols = ['event_time']

import os
useDeltaOpt = (os.getenv('DATABRICKS_RUNTIME_VERSION') != None)

view_df = df.withColumn("event_dt", f.to_date(f.col(ts_col))) \
@@ -33,9 +35,9 @@ def write(tsdf, spark, tabName, optimizationCols = None):
try:
spark.sql("optimize {} zorder by {}".format(tabName, "(" + ",".join(partitionCols + optimizationCols) + ")"))
except Exception as e:
print("Delta optimizations attempted, but was not successful.\nError: {}".format(e))
logging.error("Delta optimizations attempted, but was not successful.\nError: {}".format(e))
else:
print("Delta optimizations attempted on a non-Databricks platform. Switch to use Databricks Runtime to get optimization advantages.")
logging.warning("Delta optimizations attempted on a non-Databricks platform. Switch to use Databricks Runtime to get optimization advantages.")



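Note on consuming these log calls: the print statements in io.py are replaced with root-level logging.error and logging.warning, so the messages only surface once the caller configures logging. A minimal sketch (not part of this PR, names are illustrative) of how a user of tempo might do that:

```python
import logging

# Minimal sketch: route tempo's root-logger output to the console.
# basicConfig only takes effect if the root logger has no handlers yet.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)

# After this, a call such as tempo.io.write(tsdf, spark, "my_table") would emit
# its optimize warning/error through the configured handler instead of stdout.
```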
7 changes: 4 additions & 3 deletions python/tempo/resample.py
@@ -1,10 +1,11 @@
import tempo

import logging

import pyspark.sql.functions as f
from datetime import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# define global frequency options
import tempo

SEC = 'sec'
MIN = 'min'
16 changes: 9 additions & 7 deletions python/tempo/tsdf.py
@@ -1,8 +1,13 @@
import pyspark.sql.functions as f
from pyspark.sql.window import Window
import tempo.resample as rs
import tempo.io as tio

import logging
from functools import reduce

import pyspark.sql.functions as f
from pyspark.sql.window import Window


class TSDF:

def __init__(self, df, ts_col="event_ts", partition_cols=None, sequence_col = None):
@@ -61,7 +66,6 @@ def __addPrefixToColumns(self,col_list,prefix):
"""
Add prefix to all specified columns.
"""
from functools import reduce

df = reduce(lambda df, idx: df.withColumnRenamed(col_list[idx], '_'.join([prefix,col_list[idx]])),
range(len(col_list)), self.df)
@@ -74,7 +78,6 @@ def __addColumnsFromOtherDF(self, other_cols):
"""
Add columns from some other DF as lit(None), as pre-step before union.
"""
from functools import reduce
new_df = reduce(lambda df, idx: df.withColumn(other_cols[idx], f.lit(None)), range(len(other_cols)), self.df)

return TSDF(new_df, self.ts_col, self.partitionCols)
@@ -87,7 +90,6 @@ def __combineTSDF(self, ts_df_right, combined_ts_col):
return TSDF(combined_df, combined_ts_col, self.partitionCols)

def __getLastRightRow(self, left_ts_col, right_cols, sequence_col, tsPartitionVal):
from functools import reduce
"""Get last right value of each right column (inc. right timestamp) for each self.ts_col value

self.ts_col, which is the combined time-stamp column of both left and right dataframe, is dropped at the end
@@ -118,7 +120,7 @@ def __getLastRightRow(self, left_ts_col, right_cols, sequence_col, tsPartitionVal):
any_blank_vals = (df.agg({column: 'min'}).collect()[0][0] == 0)
newCol = column.replace("non_null_ct", "")
if any_blank_vals:
print("Column " + newCol + " had no values within the lookback window. Consider using a larger window to avoid missing values. If this is the first record in the data frame, this warning can be ignored.")
logging.warning("Column " + newCol + " had no values within the lookback window. Consider using a larger window to avoid missing values. If this is the first record in the data frame, this warning can be ignored.")
df = df.drop(column)


@@ -216,7 +218,7 @@ def asofJoin(self, right_tsdf, left_prefix=None, right_prefix="right", tsPartiti
"""

if (tsPartitionVal is not None):
print("WARNING: You are using the skew version of the AS OF join. This may result in null values if there are any values outside of the maximum lookback. For maximum efficiency, choose smaller values of maximum lookback, trading off performance and potential blank AS OF values for sparse keys")
logging.warning("You are using the skew version of the AS OF join. This may result in null values if there are any values outside of the maximum lookback. For maximum efficiency, choose smaller values of maximum lookback, trading off performance and potential blank AS OF values for sparse keys")

# Check whether partition columns have same name in both dataframes
self.__checkPartitionCols(right_tsdf)
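As with io.py, tsdf.py now logs through the root logger via logging.warning. A per-module logger is a common alternative that lets downstream code filter tempo's output separately; the sketch below is hypothetical and not part of this change:

```python
import logging

# Hypothetical alternative to the root-logger calls above (not in this PR):
# a named logger scoped to the module, e.g. "tempo.tsdf".
logger = logging.getLogger(__name__)

def warn_skew_join():
    # Same message as the asofJoin warning, routed through the module logger so
    # callers can silence it with logging.getLogger("tempo.tsdf").setLevel(...).
    logger.warning(
        "You are using the skew version of the AS OF join. "
        "This may result in null values if there are any values "
        "outside of the maximum lookback."
    )
```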
3 changes: 2 additions & 1 deletion python/tests/tests.py
@@ -1,3 +1,4 @@
import logging
import unittest

import pyspark.sql.functions as F
@@ -573,7 +574,7 @@ def test_write_to_delta(self):
# using lookback of 20 minutes
#featured_df = tsdf_left.resample(freq = "min", func = "closest_lead").df
tsdf_left.write(self.spark, "my_table")
print('delta table count ' + str(self.spark.table("my_table").count()))
logging.info('delta table count ' + str(self.spark.table("my_table").count()))

# should be equal to the expected dataframe
assert self.spark.table("my_table").count() == 7
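Because the test now emits a log record rather than printing, unittest's assertLogs can verify it. An illustrative sketch with a stand-in log call in place of the real write and count:

```python
import logging
import unittest

class LoggingAssertionExample(unittest.TestCase):
    def test_row_count_is_logged(self):
        # assertLogs fails if nothing is logged at INFO or above inside the block.
        with self.assertLogs(level="INFO") as captured:
            # Stand-in for tsdf_left.write(...) followed by the logged table count.
            logging.info("delta table count 7")
        self.assertTrue(any("delta table count" in line for line in captured.output))

if __name__ == "__main__":
    unittest.main()
```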