In [1]:
import pandas as pd
from numpy import dtype
from ludwig.api import LudwigModel
from new_api import (add_entity, create_new_entity_set, add_aggregation_features, get_training_df, get_prediction_df,
add_entity_df, list_features, get_training_df_el, generate_base_model_definition)


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [11]:
# Load the raw Titanic passenger table from the local CSV file.
passenger_df = pd.read_csv('titanic.csv')

# Entity description passed to the project-local `new_api` helper; the
# "index" entry names PassengerId as the entity's key column.
passenger_entity = {"name": "passengers", "table": "passengers", "type": "primary", "index": "PassengerId"}

# Register the passenger frame under that entity description.
add_entity_df(passenger_df, passenger_entity)

# The PassengerId/Survived observation frame (`eol_df`) is built in the cell
# that assembles the training features, right before it is used, so it is
# intentionally not created here.
passenger_df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
from datetime import datetime, timedelta
from pytz import timezone, utc
import numpy as np

# Synthetic "activity" events: one record per passenger per day.
# Passenger ids are taken from the passenger table itself so the event entity
# always lines up with the primary entity (no hard-coded row count).
passengers = passenger_df["PassengerId"].tolist()

# Four consecutive UTC midnights starting 2012-04-10, most recent day first.
first_day = datetime(2012, 4, 10, tzinfo=utc)
days = [first_day + timedelta(days=offset) for offset in reversed(range(4))]

n_records = len(passengers) * len(days)

# Dedicated seeded generator: the synthetic columns (and everything derived
# from them downstream) are reproducible on Restart & Run All, and the global
# np.random state is left untouched.
activity_rng = np.random.RandomState(42)

passenger_activity_df = pd.DataFrame(
    {
        "id": list(range(n_records)),
        # Full passenger list repeated once per day ...
        "PassengerId": passengers * len(days),
        # ... paired with each day repeated once per passenger.
        "date": [day for day in days for _ in passengers],
        # Uniform floats in [0, 10).
        "buffet": activity_rng.rand(n_records) * 10,
        # Uniform integers in [0, 10); dtype pinned so it is platform-independent.
        "ballroom": activity_rng.randint(10, size=n_records, dtype="int64"),
    }
)


# Entity description for the event table, keyed on the synthetic "id" column.
passenger_activity = {"name": "passenger_activity", "table": "passenger_activity", "type": "event", "index": "id"}

add_entity_df(passenger_activity_df, passenger_activity)

# Aggregations requested over each passenger's activity records.
passenger_activity_features = {
    "total_buffet": {"feature": "buffet", "function": "sum", "name": "total_buffet", "time_window": "full_history"},
    "max_buffet": {"feature": "buffet", "function": "max", "name": "max_buffet", "time_window": "full_history"},
    "total_ballroom": {"feature": "ballroom", "function": "sum", "name": "total_ballroom", "time_window": "full_history"},
}
add_aggregation_features("passenger_activity", passenger_activity_features)

passenger_activity_df

Unnamed: 0,id,PassengerId,date,buffet,ballroom
0,0,1,2012-04-13 00:00:00+00:00,8.052401,2
1,1,2,2012-04-13 00:00:00+00:00,0.605412,1
2,2,3,2012-04-13 00:00:00+00:00,4.203537,6
3,3,4,2012-04-13 00:00:00+00:00,0.685746,1
4,4,5,2012-04-13 00:00:00+00:00,8.340162,2
...,...,...,...,...,...
3559,3559,887,2012-04-10 00:00:00+00:00,1.276697,3
3560,3560,888,2012-04-10 00:00:00+00:00,0.776263,7
3561,3561,889,2012-04-10 00:00:00+00:00,4.660930,9
3562,3562,890,2012-04-10 00:00:00+00:00,2.577083,0


In [9]:
# Single relationship linking the two registered entities on PassengerId.
# NOTE(review): tuple layout is presumably (kind, "one" side, "many" side) —
# confirm against new_api.
relationships = [
    (
        "one_to_many",
        dict(name="passengers", index="PassengerId"),
        dict(name="passenger_activity", index="PassengerId"),
    ),
]


# Create the "titanic" entity set from both entities and the relationship above.
create_new_entity_set(
    name="titanic",
    entities=["passengers", "passenger_activity"],
    relationships=relationships,
)




In [12]:

# Observation frame: the passenger key together with the Survived label.
eol_df = passenger_df[["PassengerId", "Survived"]]

# Names of the key column and the label column inside `eol_df`.
eol = dict(pk="PassengerId", label="Survived")

# Feature request: raw passenger columns plus the aggregated activity
# features, resolved against the "titanic" entity set.
features = {
    "entity_set": "titanic",
    "target_entity": "passengers",
    "features": {
        "passengers": [
            "Pclass", "Sex", "Age", "SibSp", "Parch",
            "Ticket", "Fare", "Cabin", "Embarked",
        ],
        "passenger_activity": ["total_buffet", "max_buffet", "total_ballroom"],
    },
    "observations": {"type": "el", "eol": eol, "data": eol_df},
}

# Materialise the training frame for that request.
training_df = get_training_df_el(features)

training_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,total_buffet,max_buffet,total_ballroom
0,1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S,25.847066,9.158444,17.0
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,17.236785,6.422057,9.0
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,24.168323,9.872204,25.0
3,4,1,1,female,35.0,1,0,113803,53.1000,C123,S,8.658106,5.195514,14.0
4,5,0,3,male,35.0,0,0,373450,8.0500,,S,32.779833,9.427541,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,211536,13.0000,,S,5.122756,1.571405,19.0
887,888,1,1,female,19.0,0,0,112053,30.0000,B42,S,16.571679,9.351057,22.0
888,889,0,3,female,,1,2,W./C. 6607,23.4500,,S,11.814999,5.374823,15.0
889,890,1,1,male,26.0,0,0,111369,30.0000,C148,C,17.221847,8.674646,22.0


In [13]:
# Derive a starting model definition from the `features` request built above.
# NOTE(review): presumably writes model_definition.yaml, which the next cell
# loads — confirm in new_api.
generate_base_model_definition(features)

In [None]:
# %load model_definition.yaml
# Baseline model definition as pulled in by the %load magic above.
# NOTE(review): the body is YAML, not Python, so executing this cell would
# fail; it appears to be kept only as a reference copy of the generated file.
# Every available column is listed as an input (including PassengerId);
# Survived is the single binary output.
input_features:
- name: PassengerId
  type: numerical
- name: Pclass
  type: numerical
- name: Sex
  type: category
- name: Age
  type: numerical
- name: SibSp
  type: numerical
- name: Parch
  type: numerical
- name: Ticket
  type: category
- name: Fare
  type: numerical
- name: Cabin
  type: category
- name: Embarked
  type: category
- name: total_buffet
  type: numerical
- name: max_buffet
  type: numerical
- name: total_ballroom
  type: numerical
output_features:
- name: Survived
  type: binary


## Iterate on the model in the cell below. Results are written to a new directory on every run.

In [31]:
%%writefile model_definition.yaml
# Hand-tuned model definition (overwrites the generated baseline file).
# Relative to the baseline, PassengerId and Embarked are not used as inputs.
input_features:
  - name: Pclass
    type: numerical
  - name: Sex
    type: category
  - name: Age
    type: numerical
    # Age has gaps: impute with the mean, then standardise.
    preprocessing:
      missing_value_strategy: fill_with_mean
      normalization: zscore
  - name: SibSp
    type: numerical
  - name: Parch
    type: numerical
  - name: Ticket
    type: category
  - name: Fare
    type: numerical
  - name: Cabin
    type: category
  - name: total_buffet
    type: numerical
  - name: max_buffet
    type: numerical
  - name: total_ballroom
    type: numerical
output_features:
  - name: Survived
    type: binary
    num_fc_layers: 2
    fc_size: 128
training:
  batch_size: 32
  epochs: 200


Overwriting model_definition.yaml


In [32]:
# Build the model from the YAML definition written by the cell above and
# train it on the assembled training frame.
ludwig_model = LudwigModel(model_definition_file='model_definition.yaml')
train_status = ludwig_model.train(data_df=training_df)

# Highest value in the validation "combined" accuracy series.
validation_accuracy = train_status['validation']['combined']['accuracy']
max(validation_accuracy)

0.7530864197530864

In [None]:
# NOTE(review): `deploy_model_to_moc` is not imported or defined anywhere in
# the visible notebook — confirm where it comes from before running this cell.
deploy_model_to_moc(ludwig_model)
# Deployment notes:
# - saves the model to a tmp directory (or locates the experiment in the results directory) and packages it as a tar file
# - generates a model conformance Python file (same every time?) that just unpacks the tar file
# - uses the input features and output features to generate a schema definition file
# - uploads everything to FastScore using the Python SDK
# - model_definition.yaml gets tagged and stored, so re-training on new data can be automated and the model is guaranteed to be reproducible
