# spark-ts-trends

Computes trends of time series by fitting a low-order polynomial.

WARNING: This component currently only supports local execution (not Kubeflow/Airflow)  
WARNING: This component currently only supports Copernicus climate data


Future work    
[ ] Generalize component  
[ ] Make component run on Kubeflow/Airflow pipelines

In [None]:
%%bash
# Install the PySpark release that matches the kernel's Python version,
# together with the pinned wget and pyspark2pmml packages.
# The version is reduced to "<major><minor>" (e.g. 3.8.10 -> 38).
# `2>&1` also captures interpreters that print the version on stderr.
version=$(python --version 2>&1 | awk '{print $2}' | awk -F"." '{print $1$2}')

echo "$version"

# Pick the PySpark release for the detected Python version.
case "$version" in
    36|37)
        pyspark_version='2.4.8'
        ;;
    38|39)
        pyspark_version='3.1.2'
        ;;
    *)
        echo 'Currently only python 3.6, 3.7 , 3.8 and 3.9 are supported, in case you need a different version please open an issue at https://github.com/IBM/claimed/issues'
        exit 1
        ;;
esac

echo 'Starting installation...'
# stdout and stderr must share one file descriptor (`2>&1`): redirecting both
# to install.log independently makes the two streams overwrite each other.
if pip3 install "pyspark==${pyspark_version}" wget==3.2 pyspark2pmml==0.5.1 > install.log 2>&1; then
    echo 'Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)'
else
    echo 'Installation failed, please check log:'
    cat install.log
fi

In [None]:
import wget
# Download the shared CLAIMED helper module at runtime so that the import
# below can resolve it (presumably saved into the current working directory
# — confirm against the wget package documentation).
# NOTE(review): the URL tracks the `master` branch, so the downloaded code is
# not pinned to a fixed revision.
wget.download(
    'https://raw.githubusercontent.com/IBM/claimed/master/component-library/claimed_utils.py'
)
from claimed_utils import parse_args_to_parameters
import numpy as np
import os
import pandas as pd
import pickle
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
import shutil

In [None]:
# Component parameters. Each value is read from the environment variable of
# the same name and falls back to the default passed to os.environ.get.

# data_parquet path and parquet file name (default: data.parquet)
data_parquet = os.environ.get('data_parquet', 'data.parquet')

# master url of the Spark master (default: local mode, i.e. local[*])
master = os.environ.get('master', "local[*]")

# data_dir temporary data storage for local execution (default: ../../data/)
data_dir = os.environ.get('data_dir', '../../data/')

# output_result_filename parquet file name of result (default: trends.parquet)
output_result_filename = os.environ.get('output_result_filename', 'trends.parquet')

# NOTE(review): presumably lets command-line arguments override the values
# above; the helper is defined in claimed_utils — confirm there.
parse_args_to_parameters()

In [None]:
# Configure Spark for the requested master with 8g of executor and driver
# memory and Arrow-based pandas conversion switched on, then create (or
# reuse) the context and the session.
config = (
    SparkConf()
    .setMaster(master)
    .set('spark.executor.memory', '8g')
    .set('spark.driver.memory', '8g')
    .set('spark.sql.execution.arrow.pyspark.enabled', 'true')
)
sc = SparkContext.getOrCreate(conf=config)
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the input time series from the parquet data set located in data_dir.
input_path = data_dir + data_parquet
df = spark.read.parquet(input_path)

In [None]:
# Expose the DataFrame to Spark SQL under the table name 'df'.
df.createOrReplaceTempView(name='df')

In [None]:
# Build the list of distinct [lon, lat] pairs. The list is cached as a pickle
# in data_dir: it is loaded from there when present, otherwise computed with
# Spark and written to the cache.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# point data_dir at a location whose contents are trusted.
lon_lat_path = data_dir + 'lon_lat.p'
if os.path.exists(lon_lat_path):
    with open(lon_lat_path, 'rb') as cache_file:
        lon_lat = pickle.load(cache_file)
else:
    # Plain `group by` over all rows yields one row per distinct location.
    lon_lat = spark.sql('''
        select lon, lat from df group by lon, lat
    ''').rdd.map(lambda x: [x.lon, x.lat]).collect()
    with open(lon_lat_path, 'wb') as cache_file:
        pickle.dump(lon_lat, cache_file)

In [None]:
# Resume from the results of a previous run when its checkpoint file exists
# in data_dir; otherwise start with an empty mapping.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# point data_dir at a location whose contents are trusted.
if os.path.exists(data_dir + 'lon_lat_slope.p'):
    with open(data_dir + "lon_lat_slope.p", "rb") as checkpoint_file:
        lon_lat_slope = pickle.load(checkpoint_file)
else:
    lon_lat_slope = dict()

In [None]:
# Fit a first-order polynomial to the time-ordered `sm` series of every
# (lon, lat) location that has no entry in lon_lat_slope yet. The mapping
# "<lon>:<lat>" -> [slope] is written to lon_lat_slope.p after each location,
# so an interrupted run can resume where it stopped.
#
# NOTE: np.polyfit returns coefficients highest power first, so for degree 1
# the slope is coefficients[0] and the intercept is coefficients[1]. A
# lon_lat_slope.p file produced while the intercept was being stored has to
# be deleted so that the trends are recomputed.
checkpoint_path = data_dir + "lon_lat_slope.p"

for lon, lat in lon_lat:
    key = f'{lon}:{lat}'
    if key in lon_lat_slope:
        print('already processed')
        continue

    # lon_lat holds [lon, lat] pairs: the first element filters `lon`, the
    # second filters `lat`.
    rows = spark.sql(
        "select sm from df where sm <> 'null'"
        f" and lon='{lon}' and lat='{lat}'"
        " order by time asc").collect()
    if not rows:
        print('skipping, empty data')
        continue
    if len(rows) < 2:
        # A single observation does not determine a slope.
        print('skipping, not enough data to fit a trend')
        continue

    try:
        # The conversion is inside the try block so that non-numeric values
        # are reported and skipped instead of aborting the whole run.
        sm = np.array([row.sm for row in rows]).astype(float)
        coefficients, residuals, _, _, _ = np.polyfit(
            np.arange(len(sm)), sm, 1, full=True)
        slope = coefficients[0]

        # polyfit reports no residual when the fit is exact (two points).
        squared_error = residuals[0] if len(residuals) else 0.0
        mse = squared_error / len(sm)
        value_range = sm.max() - sm.min()
        # The normalised error is undefined for a constant series.
        nrmse = np.sqrt(mse) / value_range if value_range else float('nan')
        print('Slope ' + str(slope))
        print('NRMSE: ' + str(nrmse))

        lon_lat_slope[key] = [slope]
        with open(checkpoint_path, "wb") as checkpoint_file:
            pickle.dump(lon_lat_slope, checkpoint_file)
    except ValueError:
        print("Could not convert data")
    except Exception as err:
        # Best effort: report the problem and continue with the next location.
        print('skipping, Generic Error: ' + repr(err))

In [None]:
# Re-read the complete set of results from the checkpoint file in data_dir,
# closing the file handle once the data is loaded.
with open(data_dir + "lon_lat_slope.p", "rb") as checkpoint_file:
    lon_lat_slope = pickle.load(checkpoint_file)

In [None]:
# Flatten the "<lon>:<lat>" -> [slope] mapping into one 1-D array laid out as
# [lon, lat, slope, lon, lat, slope, ...]. The values are collected in a list
# and converted once, instead of re-allocating the array for every entry.
# lon and lat are strings (they come from str.split), so NumPy stores every
# element of the array, including the slope, as a string.
flat_values = []
for location_key, slopes in lon_lat_slope.items():
    parts = location_key.split(':')
    flat_values.extend([parts[0], parts[1], slopes[0]])
result = np.array(flat_values)

In [None]:
# Regroup the flat array into rows of three values each.
result = result.reshape((len(result) // 3, 3))

In [None]:
# Wrap the array in a pandas DataFrame with named columns.
column_names = ['lon', 'lat', 'trend']
result = pd.DataFrame(result, columns=column_names)

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame.
result_df = spark.createDataFrame(data=result)

In [None]:
# Write the trends as parquet into data_dir, removing the output directory of
# an earlier run first.
output_path = data_dir + output_result_filename
if os.path.exists(output_path):
    shutil.rmtree(output_path)
result_df.write.parquet(output_path)