# Load Model and Pipeline

<font color='steelblue'>
<h3>
<span style="font-family:Comic sans MS; font-size:1.5em;">
Using saved pipeline and model<br>


 </span>
</h3>
</font>

<font color='gray'>
<span style="font-family:Comic sans MS; font-size:1.2em;">
The following processing steps are performed:<br>
    <ol>
        <li><strong>Load New Data:</strong> Load the new dataset on which predictions need to be made</li>
        <li><strong>Load pipeline:</strong> Load the pipeline saved during pre-processing of the data</li>
        <li><strong>Load Model:</strong> Load the Logistic Regression Model saved during model creation</li>
        <li><strong>Transform:</strong> Transform the newly loaded dataframe using the pipeline model</li>
        <li><strong>Predictions:</strong> Make predictions using the loaded model on new data</li>
        <li><strong>Save Results:</strong> Write the predictions to a csv file</li>
    </ol>
</span>
</font>

In [None]:
# Set up the environment for using pyspark.
# NOTE(review): findspark.init() presumably locates the local Spark install and
# makes pyspark importable — confirm; if so it has to run before any pyspark import.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler

In [None]:
# Create the notebook's Spark session (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("Load Pipeline and Model")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel("Warn")

## Load new data

In [None]:
# Read the new dataset from CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('../datasets/agent-0.csv')
)

In [None]:
# Preview the first five rows of the loaded data as a pandas DataFrame
df.limit(5).toPandas()

In [None]:
# Save the original column names so that they can be written to the output
# (captured before the pipeline transforms the dataframe)
dColumns = df.columns

## Load the pipeline for data processing<br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">
    <ul>
<li>Load the saved pipeline into a pipelinemodel.</li>
<li>Transform the dataframe using the pipeline (this applies all the preprocessing that was done during the original processing of the dataframe)</li>
    </ul>
</span>

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

In [None]:
# Load the saved preprocessing pipeline; the argument must be the folder
# where the pipeline was stored
pipelineModel = PipelineModel.load('projPipeline')

In [None]:
# Run the loaded pipeline over the new data; the result is bound to a new
# dataframe name, so df itself is still available afterwards.
# NOTE(review): the pipeline is expected to add a features column for the
# model — confirm against the stages of the saved pipeline.
preppedDF = pipelineModel.transform(df)

In [None]:
# Preview the first five rows of the pipeline output as a pandas DataFrame
preppedDF.limit(5).toPandas()

## Load the Logistic Regression Model<br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">
    <ul>
        <li>Load the saved logistic regression model</li>
        <li>Transform the processed dataframe and make predictions</li>
        <li>Prepare a new dataframe with predictions and display it</li>
    </ul>
</span>

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel

In [None]:
# Load the saved logistic regression model from the 'projlrModel' folder
lrModel = LogisticRegressionModel.load('projlrModel')

In [None]:
# Score the preprocessed dataframe with the loaded model; the 'prediction'
# column of this result is what gets selected for output further down
predictions = lrModel.transform(preppedDF)

In [None]:
# Build the output view: the original input columns plus the model's
# 'prediction' column. Everything else added by the pipeline and the model
# is dropped simply by not being selected.
# The membership check keeps this cell safe to re-run: an unconditional
# append would add 'prediction' again on every execution and the select
# below would then produce a duplicated column.
if 'prediction' not in dColumns:
    dColumns.append('prediction')
selected = predictions.select(dColumns)

# limit(10) already caps the rows brought to the driver, so no extra head()
# is needed on the pandas side.
selected.limit(10).toPandas()

## Remove the results folder if it exists

In [None]:
import os
import shutil
# Delete the previous output folder, if there is one. Failure to delete
# (for example because the folder does not exist yet) is reported as an
# informational message rather than stopping the notebook.
dirpath = './results'
try:
    shutil.rmtree(dirpath)
except OSError as err:
    print("Error/Info: %s : %s" % (dirpath, err.strerror))

## Write the predictions as CSV (into the results folder)

In [None]:
# Save the selected columns as CSV, including a header row, under 'results'.
(
    selected.write
    .format('csv')
    .option('header', True)
    .save('results')
)