# Parse Time Series in SPSS Modeler with an open-source extension node

In [2]:
import pandas as pd

In [3]:
# Load the mock access log. As the previews below show, "first access" can
# hold several space-separated timestamps in a single cell.
df = pd.read_csv("mock access file.csv")

In [15]:
df[["name", "first access"]].head()

Unnamed: 0,name,first access
0,bob,2020-05-19T02:13:42.321Z 2020-05-19T03:31:53.0...
1,sue,2020-05-19T11:37:14.237Z
2,harry,2020-05-19T02:02:51.339Z
3,paul,2020-05-19T11:44:04.810Z
4,sam,2020-05-19T11:16:53.992Z 2020-05-19T11:30:31.395Z


In [9]:
# Print the full (untruncated) "first access" value of the first two rows,
# leaving a blank line after each one.
for raw_timestamps in df["first access"][0:2]:
    print(raw_timestamps, end="\n\n")

2020-05-19T02:13:42.321Z 2020-05-19T03:31:53.080Z 2020-05-19T03:42:00.881Z 2020-05-19T03:46:31.780Z 2020-05-19T03:53:00.850Z 2020-05-19T04:04:55.125Z 2020-05-19T08:30:08.468Z

2020-05-19T11:37:14.237Z



In [17]:
df_after[["name", "first access"]].head()

Unnamed: 0,name,first access
0,bob,2020-05-19T02:13:42.321Z
0,bob,2020-05-19T03:31:53.080Z
0,bob,2020-05-19T03:42:00.881Z
0,bob,2020-05-19T03:46:31.780Z
0,bob,2020-05-19T03:53:00.850Z


In [11]:
#https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

def tidy_split(df, column, sep='|', keep=False):
    """
    Expand a delimited string column so each piece lands on its own row.

    Rows whose `column` value is missing are dropped first. Every remaining
    value is converted to `str` and split on `sep`; the source row is then
    repeated once per piece, so all other columns (and the original index
    labels) are duplicated alongside each piece.

    Params
    ------
    df : pandas.DataFrame
        dataframe holding the column to expand
    column : str
        name of the column whose values are split
    sep : str
        delimiter passed to `str.split`
    keep : bool
        when True, a value that splits into more than one piece also
        contributes its unsplit form as its own row, placed before the pieces

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns as `df`.
    """
    present = df.dropna(subset=[column])

    row_positions = []
    pieces = []
    for position, raw in enumerate(present[column].astype(str)):
        parts = raw.split(sep)
        # The unsplit value is only worth keeping when it differs from its
        # single piece, i.e. when the split actually produced several parts.
        emitted = [raw] + parts if keep and len(parts) > 1 else parts
        row_positions.extend([position] * len(emitted))
        pieces.extend(emitted)

    # Positional lookup repeats each source row once per emitted value.
    expanded = present.iloc[row_positions, :].copy()
    expanded[column] = pieces
    return expanded

In [12]:
# Split the space-separated timestamps so every access gets its own row.
df_after = tidy_split(df, 'first access', sep=' ')

In [19]:
df_after.head()

Unnamed: 0,name,count,first access
0,bob,7,2020-05-19T02:13:42.321Z
0,bob,7,2020-05-19T03:31:53.080Z
0,bob,7,2020-05-19T03:42:00.881Z
0,bob,7,2020-05-19T03:46:31.780Z
0,bob,7,2020-05-19T03:53:00.850Z


# Move to Modeler

In [None]:
import spss.pyspark.runtime
from pyspark.sql.types import *

cxt = spss.pyspark.runtime.getContext() 

def tidy_split(df, column, sep='|', keep=False):
    """
    Split the values of `column` on `sep` and return one row per piece.

    Rows where `column` is missing are dropped. Each remaining value is cast
    to `str` and split; the row it came from is repeated for every piece.
    With `keep=True`, a value that yields more than one piece is also
    emitted unsplit, ahead of its pieces.

    Returns a new pandas.DataFrame with the same columns as `df`.
    """
    df = df.dropna(subset=[column])

    # (row position, value) pairs, in the order they appear in the result.
    expanded = []
    for position, original in enumerate(df[column].astype(str)):
        tokens = original.split(sep)
        if keep and len(tokens) > 1:
            expanded.append((position, original))
        expanded.extend((position, token) for token in tokens)

    new_df = df.iloc[[position for position, _ in expanded], :].copy()
    new_df[column] = [value for _, value in expanded]
    return new_df

# Output data model of this extension node. It is defined once so that the
# schema reported to Modeler and the columns of the data actually returned
# are driven by the same field list.
outputSchema = StructType([
    StructField('name', StringType(), True),
    StructField('count', IntegerType(), True),
    StructField('login', StringType(), True),
])

if cxt.isComputeDataModelOnly():
    # Only the data model is being requested: report the schema, skip the data.
    cxt.setSparkOutputSchema(outputSchema)
else:
    df = cxt.getSparkInputData()
    pd_df = df.toPandas()
    # One row per timestamp instead of a space-separated list per user.
    pd_df = tidy_split(pd_df, 'first access', sep=' ')
    # The declared schema calls the expanded field 'login', so the data must
    # carry that name too; previously it was still named 'first access'.
    pd_df = pd_df.rename(columns={'first access': 'login'})
    # Emit exactly the declared fields, in the declared order.
    # NOTE(review): assumes the input provides 'name' and 'count' columns, as
    # in the notebook preview - confirm against the upstream Modeler node.
    pd_df = pd_df[[field.name for field in outputSchema.fields]]
    output_df = cxt.getSparkSQLContext().createDataFrame(pd_df)
    cxt.setSparkOutputData(output_df)