#### Notebook for data preparation.
#### Current pipeline:
* Feature extraction
  * Cast date
  * Extract day
  * Extract Month
  * Extract Year
  * Extract Week day
  * Extract whether the day is a weekend
  * Extract month begin
  * Extract month end
  * Extract year quarter
  * Extract year day
* Normalize data
  * Min-max scaler
* Create series (each series is 1 month long; series alternate days)

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

In [3]:
%run ./custom_transformers

In [4]:
# Load the raw train and test tables as Spark DataFrames.
# `spark` is the session object supplied by the notebook runtime.
train_data, test_data = (
    spark.sql(query)
    for query in (
        "select * from store_item_demand_train_csv",
        "select * from store_item_demand_test_csv",
    )
)

In [5]:
# Tag each row with its origin (`set`: 0 = train, 1 = test) and set the
# placeholder value -1 for `id` on train rows and for `sales` on test rows,
# so that both frames expose the same columns.
# NOTE(review): `F` is not imported in the visible cells; presumably it is
# brought in by `%run ./custom_transformers` — confirm.
df_train = train_data.withColumn('set', F.lit(0)).withColumn('id', F.lit(-1))
df_test = test_data.withColumn('set', F.lit(1)).withColumn('sales', F.lit(-1))

# `union` matches columns by position, so the train frame is first projected
# onto the test frame's column order.
joined = df_test.union(df_train.select(df_test.columns))

# Add the `previousyear` column on the combined frame so the transformer sees
# train and test rows together.
# NOTE(review): presumably this looks up `sales` one year earlier for the same
# (store, item) — confirm in custom_transformers.
pyi = PreviousYearInputer(
    idCol=['store', 'item'],
    dateCol='date',
    inputCol='sales',
    outputCol='previousyear',
)
joined = pyi.transform(joined)

# Separate the combined frame back into its two origins using the `set` tag.
train_data = joined.filter('set == 0')
test_data = joined.filter('set == 1')

In [6]:
# Random 80/20 row-level split into train and validation; the fixed seed
# keeps the split reproducible between runs.
train, validation = train_data.randomSplit(weights=[0.8, 0.2], seed=1234)

In [7]:
# --- Feature extraction -----------------------------------------------------
# `dc` writes the converted date to `dateFormated`; the extractors that take
# an explicit `inputCol` read from that column. The remaining extractors are
# built with no arguments and therefore use whatever defaults they define.
dc = DateConverter(inputCol='date', outputCol='dateFormated')
dex = DayExtractor(inputCol='dateFormated')
mex = MonthExtractor(inputCol='dateFormated')
yex = YearExtractor(inputCol='dateFormated')
wdex = WeekDayExtractor(inputCol='dateFormated')
wex = WeekendExtractor()
mbex = MonthBeginExtractor()
meex = MonthEndExtractor()
yqex = YearQuarterExtractor()
ydex = YearDayExtractor(inputCol='dateFormated')

# --- Data processing --------------------------------------------------------
# TODO: make the derived columns ('day', 'month', 'year', 'weekday',
# 'weekend', ...) dynamic by using each transformer's output column instead
# of hard-coding the names here.
feature_columns = [
    'store',
    'item',
    'day',
    'month',
    'year',
    'weekday',
    'weekend',
    'monthbegin',
    'monthend',
    'yearquarter',
    'yearday',
    'previousyear',
]
va = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Min-max scaling of the assembled vector.
# Alternative kept for reference:
#   StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# --- Serialization ----------------------------------------------------------
# Group the scaled features into series per (store, item).
# NOTE(review): serieSize=30 presumably means 30 days per series — confirm.
sm = SerieMaker(
    inputCol='scaledFeatures',
    dateCol='date',
    idCol=['store', 'item'],
    serieSize=30,
)

# NOTE(review): `Pipeline` is not imported in the visible cells; presumably it
# comes from `%run ./custom_transformers` — confirm.
pipeline = Pipeline(
    stages=[
        dc,
        dex,
        mex,
        yex,
        wdex,
        wex,
        mbex,
        meex,
        yqex,
        ydex,
        va,
        scaler,
        sm,
    ]
)

In [8]:
# Fit the pipeline on the training split only, so that any fitted stage
# (e.g. the MinMaxScaler) learns its parameters from `train` and not from the
# validation or test rows.
# NOTE(review): `pipiline_model` is a misspelling of "pipeline_model"; the name
# is kept because the following cells reference it.
pipiline_model = pipeline.fit(train)

In [9]:
# Run the fitted pipeline over both splits, train first and then validation.
train_transformed, validation_transformed = map(
    pipiline_model.transform, (train, validation)
)

In [10]:
# Persist the transformed splits as tables, replacing any existing table of
# the same name.
train_transformed.write.mode('overwrite').saveAsTable('train_transformed_py_30')
validation_transformed.write.mode('overwrite').saveAsTable('validation_transformed_py_30')

In [11]:
# Apply the pipeline fitted on `train` to the test rows, then persist the
# result, replacing any existing table of the same name.
test_transformed = pipiline_model.transform(test_data)
test_transformed.write.mode('overwrite').saveAsTable('test_transformed_py_30')

In [12]:
# Report the row count of each input frame before the pipeline is applied.
# Each `count()` triggers a Spark job.
for label_template, frame in (
    ('Train raw: %s', train),
    ('Validation raw: %s', validation),
    ('Test raw: %s', test_data),
):
    print(label_template % frame.count())

In [13]:
# Report the row count of each frame after the pipeline has been applied, for
# comparison with the raw counts. Each `count()` triggers a Spark job.
for label_template, frame in (
    ('Train transformed: %s', train_transformed),
    ('Validation transformed: %s', validation_transformed),
    ('Test transformed: %s', test_transformed),
):
    print(label_template % frame.count())