In [1]:
# Enable the autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [42]:
from pyspark import SparkContext

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import asc, desc
from pyspark.sql.functions import avg, round

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

from utils import *
import numpy as np

In [3]:
# Create (or reuse) the Spark session. postgresql-42.2.18.jar is placed on the
# driver classpath, and executor/driver memory, executor cores and executor
# instance count are requested explicitly.
ss = (
    SparkSession.builder
    .config('spark.driver.extraClassPath', 'postgresql-42.2.18.jar')
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", 8)
    .config('spark.executor.instances', 4)
    .getOrCreate()
)

In [4]:
# Re-acquire the session with no extra options (presumably the one configured
# in the previous cell is returned — confirm) and keep a handle on its
# SparkContext for RDD-level APIs.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

## Load Normalized Assembled Data

In [7]:
# Read the pre-built dataset from parquet; the preview below shows it carries
# a `features` vector column and a numeric `label` column.
df_norm = ss.read.parquet("TimeSeriesWISDM/norm_WISDM.parquet")

In [8]:
# Preview the first five rows, cutting each cell off at 80 characters.
df_norm.show(n=5, truncate=80)

+--------------------------------------------------------------------------------+-----+
|                                                                        features|label|
+--------------------------------------------------------------------------------+-----+
|[2.969466847344476,3.7009700854153924,-3.4912654538885954,-4.698013064420892,...| 13.0|
|[0.8117654053992945,-1.3366482829686235,-0.6593023567251973,1.365790589729112...| 13.0|
|[2.969466847344476,-0.13350195004853757,1.4315522596928316,-3.450757985153246...| 13.0|
|[1.1718308563116413,-1.2188522724876578,1.1611477159999124,1.28930901656362,1...| 13.0|
|[0.46951760799278314,-0.24006077865929895,0.9271614251556844,2.42184824595895...| 13.0|
+--------------------------------------------------------------------------------+-----+
only showing top 5 rows



## Train and validation split

In [11]:
# 80/20 random partition into training and validation data.
# The seed is fixed so that re-running the notebook yields the same partition
# and therefore comparable metrics; without it every run draws a new split.
splits = df_norm.randomSplit([0.8, 0.2], seed=42)

In [12]:
# Mark both partitions for caching: the first one is the training set, the
# second one the validation set. Both are reused several times below.
train, valid = (part.cache() for part in splits)

## Develop a Logistic Regression Model

In [15]:
# Base estimator for the search below. An intercept term is fitted; maxIter
# and regParam are supplied later through the parameter grid.
lr = LogisticRegression(fitIntercept=True)

In [14]:
# Evaluator built with library defaults; it is used both for model selection
# in cross-validation and for scoring the validation set.
# NOTE(review): the default metric is presumably F1, not accuracy — confirm
# against the pyspark docs and consider setting metricName explicitly.
evaluator = MulticlassClassificationEvaluator()

In [18]:
# Search space and cross-validation settings.
max_iter = [10, 100, 150]            # candidate values for lr.maxIter
reg_params = [0.001, 0.005, 0.0001]  # candidate values for lr.regParam
n_fold = 5                           # number of cross-validation folds

In [17]:
# Full grid over every (maxIter, regParam) combination of the candidate lists.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, max_iter)
    .addGrid(lr.regParam, reg_params)
    .build()
)

In [19]:
# Cross-validated grid search: `lr` is fitted for every entry of `paramGrid`
# on `n_fold` folds and scored with `evaluator`.
# The seed pins the fold assignment so the selected hyper-parameters are
# reproducible across notebook runs.
cv = (
    CrossValidator()
    .setEstimator(lr)
    .setEvaluator(evaluator)
    .setNumFolds(n_fold)
    .setEstimatorParamMaps(paramGrid)
    .setSeed(42)
)

In [20]:
# Run the search on the training partition. This is the expensive step of the
# notebook: one fit per grid entry per fold.
cvmodel = cv.fit(train)

## Interpret the Model

In [22]:
# Model with the best cross-validation score among the grid entries.
best_model = cvmodel.bestModel

In [26]:
# Persist the selected model. Going through the writer with overwrite() keeps
# this cell re-runnable: a plain save() raises once the target path exists,
# which breaks Restart & Run All on the second run.
best_model.write().overwrite().save("models/LogisticRegression_Best")

In [28]:
# Disabled on purpose.
# NOTE(review): `coefficients` appears to be defined only for binomial models;
# for this multiclass model `coefficientMatrix` is presumably the attribute to
# inspect — confirm before re-enabling.
# print(best_model.coefficients)

In [None]:
# Per-class intercepts of the fitted model. The labels are multiclass, so the
# model is multinomial; the scalar `intercept` attribute is only available for
# binomial models and raises here, whereas `interceptVector` holds one
# intercept per class.
print(best_model.interceptVector)

In [29]:
# maxIter value of the selected model (one of the `max_iter` candidates).
print(best_model.getMaxIter())

10


In [30]:
# regParam value of the selected model (one of the `reg_params` candidates).
print(best_model.getRegParam())

0.0001


## Apply the model to the validation set

In [31]:
# Score the selected model on the held-out validation partition using the same
# evaluator that drove model selection.
# NOTE(review): this is the evaluator's default metric (presumably F1), which
# is why it differs from the accuracy reported further down — confirm.
evaluator.evaluate(best_model.transform(valid))

0.11613895060464555

In [32]:
# Validation predictions; per the preview below, transform() appends the
# rawPrediction, probability and prediction columns.
pred = best_model.transform(valid)

In [36]:
# Preview five prediction rows, cutting each cell off at 20 characters.
pred.show(n=5, truncate=20)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[-3.1275297223239...| 15.0|[-0.1014755147440...|[0.04393321530965...|      16.0|
|[-3.1272159646002...| 13.0|[0.49274797061774...|[0.08775399792062...|       2.0|
|[-3.1216014460869...| 16.0|[-1.1492996962829...|[0.01045463172687...|      16.0|
|[-3.1216014460869...| 16.0|[-1.0338454833150...|[0.01361711095525...|      16.0|
|[-3.1216014460869...| 16.0|[-1.2077170839487...|[0.01232998987772...|      16.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [41]:
# RDD of (prediction, label) rows for the RDD-based metrics API below.
# NOTE(review): the column order is presumably significant for
# MulticlassMetrics (prediction first, label second) — confirm.
prediction_label = pred.select("prediction", "label").rdd

In [43]:
# Multiclass metrics computed from the validation (prediction, label) pairs.
metrics = MulticlassMetrics(prediction_label)

In [44]:
# Confusion matrix of the validation predictions. Despite the variable name
# this holds the matrix itself; it is not displayed in this section.
confusionMetrics = metrics.confusionMatrix()

In [49]:
# Overall accuracy on the validation partition.
# NOTE(review): the recorded value (~0.15) is very low — worth investigating
# the features/labels or the model choice before relying on this model.
accuracy = metrics.accuracy
print(accuracy)

0.15401535581255085
