# Task - fit Linear regression

* take the query for the average number of answers per week that we solved in the interactive-analytics notebook
* fit linear regression on the data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, unix_timestamp, when, lit, ceil

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

import os
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Obtain the Spark session for this notebook (getOrCreate reuses a running one).
spark = SparkSession.builder.appName('Forecast Analytics I').getOrCreate()

In [None]:
# Directory the notebook is executed from.
base_path = os.getcwd()

# The project root is three directory levels above the working directory.
# Walk up with os.path instead of splitting on '/', so the platform's own
# separator is respected and a shallow working directory resolves to the
# filesystem root rather than to an empty (relative) path.
project_path = os.path.abspath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Input location of the answers dataset.
answers_input_path = os.path.join(project_path, 'data', 'answers')

# Location of the transformed questions dataset.
questions_input_path = os.path.join(project_path, 'output', 'questions-transformed')

In [None]:
# Read the answers dataset. No format is specified, so the reader uses
# Spark's configured default data source.
answersDF = spark.read.option('path', answers_input_path).load()

#### Take the query for average number of answers per week

* Add a new column `time` that converts `window.start` to the long data type. This will serve as the feature for the linear regression.

In [None]:
# Weekly answer counts: keep only rows with a user_id, bucket them into
# 1-week windows on creation_date and count the rows in each bucket.
groupedDF = (
    answersDF
    .where(col('user_id').isNotNull())
    .groupBy(window(col('creation_date'), '1 week'))
    .agg(count('*').alias('answers'))
    .select(
        'window',
        'answers',
        # window start as a date
        col('window.start').cast('date').alias('date'),
        # window start cast to long
        col('window.start').cast('long').alias('time'),
    )
    .sort('window')
)

In [None]:
# Print the column names and types of the weekly aggregate.
groupedDF.printSchema()

In [None]:
# Preview the first five weekly rows.
groupedDF.show(5)

#### Build the pipeline

Hint:
* use VectorAssembler to convert the feature into vector
* use LinearRegression model
* fit the pipeline

In [None]:
# Columns used as predictors; here only the numeric 'time' column.
features = ['time']

# Packs the predictor columns into the single vector column 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=features)

# Linear regression reading the 'features' vector, with 'answers' as the label.
lr = LinearRegression(featuresCol='features', labelCol='answers')

# Chain both stages and fit them on the weekly aggregate.
modelLr = (
    Pipeline(stages=[assembler, lr])
    .fit(groupedDF)
)

#### Make the predictions and visualize the result

Hint:
* call transform
* collect to driver using toPandas()
* visualize by calling plot

In [None]:
# Apply the fitted pipeline to the same data it was trained on and
# collect the result to the driver as a pandas DataFrame.
local_prediction = (
    modelLr
    .transform(groupedDF)
    .toPandas()
)

In [None]:
# Draw the fitted line first (this call creates the axes), then overlay
# the observed weekly counts on the same axes.
ax = local_prediction.plot(
    figsize=(12, 6),
    title='Number of answers per week',
    x='date',
    y='prediction',
    legend=False,
)
local_prediction.plot(ax=ax, x='date', y='answers', legend=False)

ax.set_xlabel('Date')
ax.set_ylabel('Number of answers')
plt.show()

#### Use scikit-learn to compare the prediction

* import LinearRegression from sklearn
* fit the data locally
* compare the fits in a plot

In [None]:
from sklearn.linear_model import LinearRegression as linR

In [None]:
# Collect the weekly aggregate to the driver as a pandas DataFrame.
local_data = groupedDF.toPandas()

# Unfitted scikit-learn linear regression estimator.
model = linR()

In [None]:
# Select the feature and the target by column name rather than by position,
# so the fit cannot silently pick up the wrong columns if the column order
# of the aggregated frame ever changes.
# The double brackets keep both arrays 2-D with shape (n_rows, 1), exactly
# as the previous reshape(-1, 1) did.
X = local_data[['time']].values
Y = local_data[['answers']].values
model.fit(X, Y)

In [None]:
# In-sample predictions: X is the same array the model was fitted on.
Y_pred = model.predict(X)  # make predictions

In [None]:
# Attach the scikit-learn predictions next to the Spark ones.
local_prediction['local'] = Y_pred

# Spark fit in black (this call creates the axes) ...
ax = local_prediction.plot(
    x='date', y='prediction', color='black', legend=False,
    figsize=(12, 6), title='Number of answers per week',
)
# ... observed weekly counts in the default colour ...
local_prediction.plot(x='date', y='answers', legend=False, ax=ax)
# ... and the scikit-learn fit in red.
local_prediction.plot(x='date', y='local', color='red', legend=False, ax=ax)

ax.set_xlabel('Date')
ax.set_ylabel('Number of answers')
plt.show()

In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()