In [1]:
# Preprocessing of historical data and writing the results to HDFS

# hdfs dfs -copyFromLocal etc/jupyter/symlinks_for_jupyterlab_widgets/Local%20Disk/home/jowike/weather hdfs://cluster-bda2-m/user/root/history

In [2]:
from typing import List
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.sql.types import FloatType
import re 

In [3]:
# Auxiliary functions

def load_weather_batches(spark, fpaths: List[str]):
    """Read several weather CSV batches and return them as one DataFrame.

    Args:
        spark: Active SparkSession used to read the files.
        fpaths: Paths of the CSV files; each is read with a header row and
            schema inference. Must contain at least one path, because the
            frames are combined with ``reduce`` without an initial value.

    Returns:
        The batches stacked with ``DataFrame.unionAll`` (columns are matched
        by position, so every batch is expected to share one column layout),
        with the ``"Date time"`` column renamed to ``date`` and parsed as a
        timestamp using the ``MM/dd/yyyy HH:mm:ss`` pattern.
    """
    frames = [
        spark.read.csv(fpath, inferSchema="true", header="true") for fpath in fpaths
    ]
    combined = reduce(DataFrame.unionAll, frames)

    # Rename first, then overwrite the column in place so it keeps its position.
    renamed = combined.withColumnRenamed("Date time", "date")
    parsed_date = F.to_timestamp(F.col("date"), "MM/dd/yyyy HH:mm:ss")
    return renamed.withColumn("date", parsed_date)


def load_ggtrends(spark, path):
    """Read the ggtrends CSV and return it as a two-column ``(date, y)`` frame.

    Args:
        spark: Active SparkSession used to read the file.
        path: Location of a CSV file with a header row and at least the
            columns ``date`` and ``value``.

    Returns:
        DataFrame with ``date`` parsed as a timestamp using the ``MM-dd-yyyy``
        pattern and the ``value`` column exposed under the name ``y``.
    """
    raw = spark.read.csv(path, inferSchema="true", header="true")

    parsed_date = F.to_timestamp(F.col("date"), "MM-dd-yyyy").alias("date")
    target = F.col("value").alias("y")
    return raw.select(parsed_date, target)


def merge_sources(weather_df, website_df):
    """Join weather and target data and derive lag features plus a day/night label.

    Args:
        weather_df: Weather DataFrame with ``date``, ``Address`` and the
            weather measurement columns selected below.
        website_df: DataFrame with ``date`` and the target column ``y``.

    Returns:
        DataFrame with the weather columns, ``y``, the lag features ``lag1``
        and ``lag24`` and a string column ``filter_col`` that is ``"day"``
        when the time of day lies between 07:00:00 and 23:00:00 (both ends
        inclusive) and ``"night"`` otherwise. ``Address`` is not part of the
        result.
    """
    selected = [
        "date",
        "Address",
        "Temperature",
        "Relative Humidity",
        "Wind Speed",
        "Visibility",
        "Cloud Cover",
        "y",
    ]
    joined = weather_df.join(website_df, how="inner", on="date").select(*selected)

    # Lags are counted in rows within each Address, ordered by time
    # (presumably 1 and 24 hours for hourly data; verify against the input).
    by_address = Window.partitionBy("Address").orderBy(F.col("date").asc())
    with_lags = joined.withColumn("lag1", F.lag("y", 1).over(by_address)).withColumn(
        "lag24", F.lag("y", 24).over(by_address)
    )

    # Rows without a full lag history are removed before the remaining
    # nulls are replaced by 0.
    cleaned = (
        with_lags.drop("Address").na.drop(subset=["lag1", "lag24"]).na.fill(0)
    )

    # The time of day is compared as a zero-padded "HH:mm:ss" string.
    time_of_day = F.date_format(F.col("date"), "HH:mm:ss")
    is_day = time_of_day.between("07:00:00", "23:00:00")
    return cleaned.withColumn("filter_col", F.when(is_day, "day").otherwise("night"))


def split_days_and_nights(df):
    """Split a frame into its day rows and its night rows.

    Args:
        df: DataFrame with a string column ``filter_col`` whose value is
            ``"day"`` or ``"night"``.

    Returns:
        Tuple ``(df_day, df_night)``. The helper column ``filter_col`` is
        removed from both frames.
    """
    # DataFrame.drop silently ignores names that are not in the schema, so the
    # column name has to match exactly for the label column to be removed.
    df_day = df.filter(F.col("filter_col") == "day").drop("filter_col")
    df_night = df.filter(F.col("filter_col") == "night").drop("filter_col")
    return df_day, df_night


def train_test_split(df, columns):
    """Split a frame chronologically into a training and a test part.

    Args:
        df: DataFrame with a ``date`` column and every column in ``columns``.
        columns: Names of the columns to keep in both outputs.

    Returns:
        Tuple ``(df_train, df_test)``: rows whose percent rank by ``date`` is
        at most 0.8 form the training part, the rest the test part. Both are
        restricted to ``columns`` and sorted by ``date``.
    """
    # A window without partition keys ranks all rows of the frame together.
    chronological = Window.partitionBy().orderBy("date")
    ranked = df.withColumn("rank", F.percent_rank().over(chronological))

    def _subset(condition):
        # Keep the rows matching the rank condition, trimmed and ordered.
        return ranked.where(condition).select(*columns).sort("date")

    return _subset("rank <= .8"), _subset("rank > .8")


def scale_df(df, columns_to_scale, fit_df=None):
    """Min-max scale selected columns and return them together with ``ds`` and ``y``.

    Every column is assembled into its own one-element vector and scaled by
    its own ``MinMaxScaler``; the scaled value is then unpacked back into a
    plain float column.

    Args:
        df: DataFrame to transform. Must contain ``date``, ``y`` and every
            column named in ``columns_to_scale``.
        columns_to_scale: Names of the numeric columns to rescale.
        fit_df: Optional DataFrame the scalers learn their min/max from.
            Defaults to ``df`` itself. Pass the training split here when
            scaling a test split so both splits share the same scale.

    Returns:
        DataFrame holding the scaled columns under their original names
        (float), followed by ``ds`` (the former ``date`` column) and ``y``.
    """
    reference_df = df if fit_df is None else fit_df

    assemblers = [
        VectorAssembler(inputCols=[col], outputCol=col + "_vec")
        for col in columns_to_scale
    ]
    scalers = [
        MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled")
        for col in columns_to_scale
    ]
    pipeline = Pipeline(stages=assemblers + scalers)

    # Fit on the reference frame, apply to the frame that was asked for.
    scaler_model = pipeline.fit(reference_df)
    transformed = scaler_model.transform(df)

    # Maps "<col>_scaled" back to "<col>"; it also fixes the output column order.
    names = {col + "_scaled": col for col in columns_to_scale}

    # Each scaled column is a one-element vector; pull out its only value.
    first_element = F.udf(lambda v: float(v[0]), FloatType())
    unpacked = transformed.select(
        [F.col("date"), F.col("y")]
        + [first_element(scaled).alias(scaled) for scaled in names]
    )

    # Added after the unpacking step so that date and y stay untouched by the UDF.
    names["date"] = "ds"
    names["y"] = "y"

    return unpacked.select(
        [F.col(source).alias(target) for source, target in names.items()]
    )


def trim_colnames(df1):
    """Return ``df1`` with its column names reduced to letters, digits and commas.

    Args:
        df1: DataFrame (or any object offering ``columns`` and ``toDF``)
            whose column names should be cleaned.

    Returns:
        The result of ``df1.toDF`` called with the cleaned names, in the
        original column order.
    """
    # NOTE(review): commas survive the clean-up; confirm that the downstream
    # writer accepts them in column names.
    disallowed = re.compile("[^a-zA-Z0-9,]")
    cleaned_names = [disallowed.sub("", name) for name in df1.columns]
    return df1.toDF(*cleaned_names)

In [4]:
# Parameter settings

# HDFS locations of the historical weather CSV batches for Hamburg.
weather_batches = [
    "hdfs://cluster-bda2-m/user/root/history/weather/merged_historic_Hamburg_batch1.csv",
    "hdfs://cluster-bda2-m/user/root/history/weather/merged_historic_Hamburg_batch2.csv",
    "hdfs://cluster-bda2-m/user/root/history/weather/merged_historic_Hamburg_batch3.csv",
    "hdfs://cluster-bda2-m/user/root/history/weather/merged_historic_Hamburg_batch4.csv",
]

# HDFS location of the ggtrends CSV that provides the target series.
ggtrends_fpath = (
    "hdfs://cluster-bda2-m/user/root/history/ggtrends/Instagram_Hamburg_historic.csv"
)

# Columns kept for modelling: timestamp first, target last, features in between.
columns = [
    "date",
    "Temperature",
    "Relative Humidity",
    "Wind Speed",
    "Visibility",
    "lag1",
    "lag24",
    "y",
]

# Everything between the leading timestamp and the trailing target gets scaled.
columns_to_scale = columns[1:-1]

In [5]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Time series data analysis with Spark")
    .config("spark.redis.ssl", "true")
    .getOrCreate()
)

In [6]:
# Reading historical data from HDFS: the weather CSV batches are combined into
# one frame, and the ggtrends CSV supplies the target column `y`.
weather_df = load_weather_batches(spark, weather_batches)
ggtrends_df = load_ggtrends(spark, ggtrends_fpath)

In [7]:
# Source data integration: inner join of weather and ggtrends data on `date`,
# plus the lag1/lag24 features and the day/night label column `filter_col`.
df = merge_sources(weather_df, ggtrends_df)

In [8]:
# Day and night data separation, based on the `filter_col` label.
df_day, df_night = split_days_and_nights(df)

In [9]:
# Train/test split: rows are ranked by `date`; those with a percent rank of at
# most 0.8 form the training set, the rest the test set. Only `columns` are kept.
df_day_train, df_day_test = train_test_split(df_day, columns)
df_night_train, df_night_test = train_test_split(df_night, columns)

In [10]:
# Scaling: min-max scale the feature columns of every split.
# NOTE(review): scale_df fits its scalers on the frame it receives, so each
# test split is scaled with its own min/max rather than with the values learned
# from the matching training split. Confirm that this is intended.
df_day_train_scaled = scale_df(df_day_train, columns_to_scale)
df_day_test_scaled = scale_df(df_day_test, columns_to_scale)
df_night_train_scaled = scale_df(df_night_train, columns_to_scale)
df_night_test_scaled = scale_df(df_night_test, columns_to_scale)

In [11]:
# Persist every scaled split as parquet on HDFS, replacing earlier output.
# Column names are cleaned with trim_colnames before writing.
parquet_targets = [
    (
        df_day_train_scaled,
        "hdfs://cluster-bda2-m/user/root/modeling/in/df_day_train_scaled.parquet",
    ),
    (
        df_day_test_scaled,
        "hdfs://cluster-bda2-m/user/root/modeling/in/df_day_test_scaled.parquet",
    ),
    (
        df_night_train_scaled,
        "hdfs://cluster-bda2-m/user/root/modeling/in/df_night_train_scaled.parquet",
    ),
    (
        df_night_test_scaled,
        "hdfs://cluster-bda2-m/user/root/modeling/in/df_night_test_scaled.parquet",
    ),
]

for scaled_split, target_path in parquet_targets:
    trim_colnames(scaled_split).write.mode("overwrite").parquet(target_path)