## Date and datetime features with sklearn2pmml

Based on https://openscoring.io/blog/2020/03/08/sklearn_date_datetime_pmml/

In [1]:
from pandas import DataFrame
from sklearn_pandas import DataFrameMapper
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import Alias, DateDomain, DateTimeDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import DaysSinceYearTransformer, ExpressionTransformer, SecondsSinceMidnightTransformer, SecondsSinceYearTransformer

import joblib
import pandas

In [2]:
# Apollo Lunar Missions
# https://nssdc.gsfc.nasa.gov/planetary/lunar/apollo.html
#
# One row per mission: launch, moon landing and return as UTC ("Z") ISO 8601 strings,
# plus a success flag. Missions without a landing carry None in "moon landing".
#
# NOTE(review): the Apollo 8 launch is dated 1968-01-01, almost a year before its
# return on 1968-12-27, whereas every other mission lasts about one to two weeks.
# It looks like a placeholder value - confirm against the NASA page above.
apollo_columns = ["launch", "moon landing", "return", "success"]
apollo_rows = [
    ["1968-01-01T12:51:00Z", None, "1968-12-27T15:51:42Z", True], # Apollo 8
    ["1969-05-18T16:49:00Z", None, "1969-05-26T16:52:23Z", True], # Apollo 10
    ["1969-07-16T13:32:00Z", "1969-07-20T20:17:40Z", "1969-07-24T16:50:35Z", True], # Apollo 11
    ["1969-11-14T16:22:00Z", "1969-11-19T06:54:35Z", "1969-11-24T20:58:24Z", True], # Apollo 12
    ["1970-04-11T19:13:00Z", None, "1970-04-17T18:07:41Z", False], # Apollo 13
    ["1971-01-31T21:03:02Z", "1971-02-05T09:18:11Z", "1971-02-09T21:05:00Z", True], # Apollo 14
    ["1971-07-26T13:34:00Z", "1971-07-30T22:16:29Z", "1971-08-07T20:45:53Z", True], # Apollo 15
    ["1972-04-16T17:54:00Z", "1972-04-21T02:23:35Z", "1972-04-27T19:45:05Z", True], # Apollo 16
    ["1972-12-07T05:33:00Z", "1972-12-11T19:54:57Z", "1972-12-19T19:24:59Z", True], # Apollo 17
]
df = DataFrame(apollo_rows, columns = apollo_columns)
df

Unnamed: 0,launch,moon landing,return,success
0,1968-01-01T12:51:00Z,,1968-12-27T15:51:42Z,True
1,1969-05-18T16:49:00Z,,1969-05-26T16:52:23Z,True
2,1969-07-16T13:32:00Z,1969-07-20T20:17:40Z,1969-07-24T16:50:35Z,True
3,1969-11-14T16:22:00Z,1969-11-19T06:54:35Z,1969-11-24T20:58:24Z,True
4,1970-04-11T19:13:00Z,,1970-04-17T18:07:41Z,False
5,1971-01-31T21:03:02Z,1971-02-05T09:18:11Z,1971-02-09T21:05:00Z,True
6,1971-07-26T13:34:00Z,1971-07-30T22:16:29Z,1971-08-07T20:45:53Z,True
7,1972-04-16T17:54:00Z,1972-04-21T02:23:35Z,1972-04-27T19:45:05Z,True
8,1972-12-07T05:33:00Z,1972-12-11T19:54:57Z,1972-12-19T19:24:59Z,True


In [3]:
def awarestr_to_naivestr(x, tzinfo):
    """Re-express time zone aware datetime strings as naive local-time strings.

    The values are parsed with ``pandas.to_datetime``, converted to the
    ``tzinfo`` time zone, stripped of their time zone information and finally
    formatted as ``%Y-%m-%dT%H:%M:%S``.

    Parameters
    ----------
    x
        Datetime strings; expected to be a pandas Series, because the ``.dt``
        accessor is used on the parsed result.
    tzinfo
        Target time zone, passed straight to ``Series.dt.tz_convert``.

    Returns
    -------
    The result of ``Series.dt.strftime`` on the converted values.
    """
    local_wall_time = (
        pandas.to_datetime(x)       # parse the aware strings
        .dt.tz_convert(tzinfo)      # bring every value into the same time zone
        .dt.tz_localize(None)       # drop the time zone, keeping the wall-clock time
    )
    return local_wall_time.dt.strftime("%Y-%m-%dT%H:%M:%S")

In [4]:
# Time zone whose wall-clock time the naive strings will express
tzinfo = "Europe/Tallinn"

# NOTE(review): the columns are overwritten in place, so re-running this cell feeds
# already-naive strings back into awarestr_to_naivestr (presumably an error at the
# tz_convert step) - re-run the DataFrame construction cell first.
df["launch"] = awarestr_to_naivestr(x = df["launch"], tzinfo = tzinfo)

# Missions without a landing: map the "NaT" placeholder text back to None
landing_local = awarestr_to_naivestr(x = df["moon landing"], tzinfo = tzinfo)
df["moon landing"] = landing_local.replace({"NaT" : None})

df["return"] = awarestr_to_naivestr(x = df["return"], tzinfo = tzinfo)

In [5]:
# Inspect the column dtypes after the string conversion above
df.dtypes

launch          object
moon landing    object
return          object
success           bool
dtype: object

In [30]:
def fit_convert(mapper, data = None, target = "success"):
    """Fit a one-step PMMLPipeline around ``mapper`` and return the transformed features.

    Despite the name, nothing is converted or exported here: the pipeline is
    only fitted and its ``fit_transform`` result is handed back.

    Parameters
    ----------
    mapper
        Transformer installed as the pipeline's only step (named "mapper").
    data
        Frame to fit on. When omitted, the notebook-level ``df`` is used, which
        is exactly what the one-argument call has always done.
    target
        Name of the column in ``data`` that is passed as the label vector.

    Returns
    -------
    Whatever ``PMMLPipeline.fit_transform`` returns for ``data``.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook-level frame
        data = df
    pipeline = PMMLPipeline([
        ("mapper", mapper)
    ])
    return pipeline.fit_transform(data, data[target])

In [35]:
# Both timestamp columns go through the same two steps:
# DateTimeDomain followed by DaysSinceYearTransformer anchored at 1968
timestamp_columns = ["launch", "return"]
days_since_1968_steps = [DateTimeDomain(), DaysSinceYearTransformer(year = 1968)]

mapper = DataFrameMapper([(timestamp_columns, days_since_1968_steps)])

resl = fit_convert(mapper)

In [36]:
# Show the transformed "launch" and "return" columns as assigned to resl by the preceding cell
resl

array([[   0,  361],
       [ 503,  511],
       [ 562,  570],
       [ 683,  693],
       [ 831,  837],
       [1127, 1136],
       [1302, 1314],
       [1567, 1578],
       [1802, 1814]])

In [37]:
# Both timestamp columns go through the same two steps:
# DateTimeDomain followed by SecondsSinceMidnightTransformer
timestamp_columns = ["launch", "return"]
seconds_since_midnight_steps = [DateTimeDomain(), SecondsSinceMidnightTransformer()]

mapper = DataFrameMapper([(timestamp_columns, seconds_since_midnight_steps)])

resl = fit_convert(mapper)

In [38]:
# Show the transformed "launch" and "return" columns as assigned to resl by the preceding cell
resl

array([[57060, 67902],
       [71340, 71543],
       [59520, 71435],
       [69720, 86304],
       [79980, 76061],
       [  182,   300],
       [59640, 85553],
       [75240, 81905],
       [30780, 80699]])