In [52]:
from pyspark.sql import SQLContext
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, StandardScaler, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics

# Keras / Deep Learning
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import optimizers, regularizers
from keras.optimizers import Adam

# Elephas for Deep Learning on Spark
from elephas.ml_model import ElephasEstimator

In [2]:
%%time
df = spark.read.csv("hdfs://master:9000/user/data/ATKH_Oplus_TWGKHHPSK1MSB04_memory_usage_2020_11.csv", inferSchema=True, header=True)
# inferSchema referring to the type of the column
df.show(5)

+----------+--------+---------------+-------+
|      Date|    time|Allocate memory|  %used|
+----------+--------+---------------+-------+
|2020/11/30|23:57:45|     1606751865|31.6945|
|2020/11/30|23:54:46|     1606751686|31.9328|
|2020/11/30|23:51:45|     1606751505|32.8929|
|2020/11/30|23:48:45|     1606751325|34.5618|
|2020/11/30|23:45:45|     1606751145|  32.14|
+----------+--------+---------------+-------+
only showing top 5 rows

CPU times: user 4.22 ms, sys: 4.28 ms, total: 8.5 ms
Wall time: 16.5 s


In [5]:
# Remove the 'Allocate memory' column; only Date, time and %used are kept
df = df.drop("Allocate memory")

In [7]:
# Confirm the column was dropped by previewing the first five rows
df.show(n=5)

+----------+--------+-------+
|      Date|    time|  %used|
+----------+--------+-------+
|2020/11/30|23:57:45|31.6945|
|2020/11/30|23:54:46|31.9328|
|2020/11/30|23:51:45|32.8929|
|2020/11/30|23:48:45|34.5618|
|2020/11/30|23:45:45|  32.14|
+----------+--------+-------+
only showing top 5 rows



In [47]:
# Helper function to select the feature columns that will be scaled
def select_features_to_scale(df=None, lower_skew=-2, upper_skew=2, drop_cols=['Date','time']):
    """Return the names of the columns of ``df`` that should be scaled.

    Every column of ``df`` that is not listed in ``drop_cols`` is returned,
    in the DataFrame's own column order. Only ``df.columns`` is read, so no
    rows are collected to the driver.

    Parameters
    ----------
    df : DataFrame, optional
        Any object exposing a ``columns`` sequence of column names (a Spark
        or pandas DataFrame). When omitted, the notebook-level variable
        ``df`` is looked up at call time.
    lower_skew, upper_skew : numeric, optional
        Accepted for interface compatibility only. No skew/kurtosis filter
        is applied at the moment, so these bounds have no effect.
    drop_cols : str or list of str, optional
        Column name(s) to exclude from the result. ``None`` or an empty
        list excludes nothing. The default list is never mutated.

    Returns
    -------
    list of str
        Names of the columns to scale.

    Raises
    ------
    ValueError
        If ``df`` is omitted and no notebook-level ``df`` exists.
    KeyError
        If any name in ``drop_cols`` is not a column of ``df``.
    """
    if df is None:
        # Resolve the fallback at call time rather than at definition time,
        # so a later re-assignment of the notebook's `df` is picked up and
        # the cell can be defined before the data is loaded.
        df = globals().get("df")
        if df is None:
            raise ValueError("select_features_to_scale: no DataFrame was passed and no global 'df' exists")

    # Accept a single column name as well as a collection of names
    if drop_cols is None:
        drop_cols = []
    elif isinstance(drop_cols, str):
        drop_cols = [drop_cols]
    excluded = set(drop_cols)

    column_names = list(df.columns)

    # Fail loudly on unknown names instead of silently ignoring a typo
    missing = sorted(excluded.difference(column_names), key=str)
    if missing:
        raise KeyError("{} not found in DataFrame columns".format(missing))

    # Return feature list to scale
    return [name for name in column_names if name not in excluded]

In [53]:
# Columns to scale: everything except the helper's default drop columns
unscaled_features = select_features_to_scale(df=df, lower_skew=-2, upper_skew=2)

# Pack the selected columns into a single vector column
unscaled_assembler = VectorAssembler(
    inputCols=unscaled_features,
    outputCol="unscaled_features",
)

# Rescale that vector column into the [0.0, 1.0] range
scaler = MinMaxScaler(
    min=0.0,
    max=1.0,
    inputCol="unscaled_features",
    outputCol="scaled_features",
)

# Pipeline stages, in execution order
stages = [unscaled_assembler, scaler]

In [54]:
# Build the pipeline from the stages assembled above
pipeline = Pipeline(stages=stages)

# Fit on `df`, then apply the fitted pipeline to the same DataFrame;
# the result carries the extra columns produced by the stages
pipeline_model = pipeline.fit(df)
df_transform = pipeline_model.transform(df)

In [57]:
# Bring five transformed rows to pandas and display only the scaled vectors
df_transform.limit(5).toPandas().loc[:, "scaled_features"]

0    [0.20730249328605418]
1    [0.21053631501739048]
2     [0.2235652375698705]
3    [0.24621284599382007]
4    [0.21334809790758305]
Name: scaled_features, dtype: object