In [1]:
import sys
sys.path.append('/Users/aitomatic/src/github/h1st-ai/h1st/h1st/model/kswe')

from kswe_modeler import KSWEModeler
from segmentor import CombinationSegmentor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split

def load_data():
    df_raw = datasets.load_iris(as_frame=True).frame
    df_raw.columns = ['sepal_length','sepal_width','petal_length','petal_width', 'species']
    df_raw['sepal_size'] = df_raw['sepal_length'] * df_raw['sepal_width']
    df_raw['sepal_aspect_ratio'] =  df_raw['sepal_width'] / df_raw['sepal_length'] 
    
    X_cols = list(df_raw.columns)
    # X_cols.remove('species')
    X_train, X_test, y_train, y_test = train_test_split(
        df_raw[X_cols], df_raw['species'], test_size=0.4, random_state=1)    
    return {
        'dataframe': {
            'X_train': X_train, 
            'y_train': y_train, 
            'X_test': X_test,
            'y_test': y_test,
        }
    }

In [3]:
from typing import Any, Dict

from h1st.model.ml_model import MLModel
from h1st.model.ml_modeler import MLModeler
from h1st.model.rule_based_modeler import RuleBasedModeler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as sk_metrics
import pandas as pd

from segmentor import CombinationSegmentor
from ensemble import MajorityVotingEnsemble

class MySubModel(MLModel):
    def predict(self, input_data: Dict) -> Dict:
        y = self.base_model.predict(input_data['X'])
        return {'predictions': y}


class MySubModelModeler(MLModeler):
    def __init__(self, model_class=MySubModel):
        self.model_class = model_class
        self.stats = {}
    
    def evaluate_model(self, data: Dict, model: MLModel) -> Dict:
        # super().evaluate_model(data, model)
        if 'X_test' not in data:
            print('No test data found. evaluating training results')
            X, y_true = data['X_train'], data['y_train']
        else:
            X, y_true = data['X_test'], data['y_test']
        y_pred = pd.Series(model.predict({'X': X})['predictions'])
        return {'r2_score': sk_metrics.r2_score(y_true, y_pred)}

    def train_base_model(self, data: Dict[str, Any]) -> Any:
        X, y = data['X_train'], data['y_train']
        model = LogisticRegression(random_state=0)
        model.fit(X, y)
        return model

In [4]:
data = load_data()
segmentation_features = {
    'sepal_size': [(None, 18.5), (18.5, None)],
    'sepal_aspect_ratio': [(None, 0.65), (0.65, None)],
    'species': [[0, 1], [1, 2]]
}
kswe_modeler = KSWEModeler()
kswe = kswe_modeler.build_model(
    input_data=data,
    segmentation_features=segmentation_features, 
    min_data_size=30,
    segmentor=CombinationSegmentor(), 
    sub_model_modeler=MySubModelModeler(),
    ensemble_modeler=RuleBasedModeler(model_class=MajorityVotingEnsemble)
)
X_features = list(data['dataframe']['X_test'].columns)
for item in segmentation_features.keys(): X_features.remove(item)

pred = kswe.predict({'X': data['dataframe']['X_test'][X_features]})['predictions']


INFO:root:sub model segment_0_lvl_1 training resluts based on 60 samples: {'r2_score': 0.9712092130518234}
INFO:root:sub model segment_0_lvl_1 test resluts based on 60 samples: {'r2_score': 0.897392047883711}
INFO:root:sub model segment_1_lvl_1 training resluts based on 30 samples: {'r2_score': 0.9424184261036468}
INFO:root:sub model segment_1_lvl_1 test resluts based on 60 samples: {'r2_score': 0.9230440359127832}
INFO:root:sub model segment_2_lvl_1 training resluts based on 68 samples: {'r2_score': 0.9384893713251923}
INFO:root:sub model segment_2_lvl_1 test resluts based on 60 samples: {'r2_score': 0.9486960239418555}
INFO:root:sub model segment_4_lvl_1 training resluts based on 60 samples: {'r2_score': 1.0}
INFO:root:sub model segment_4_lvl_1 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
INFO:root:sub model segment_5_lvl_1 training resluts based on 59 samples: {'r2_score': 0.7965517241379311}
INFO:root:sub model segment_5_lvl_1 test resluts based on 60 samples:

No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
No test data found. evaluating training results
kswe test results {'r2_score': 0.9486960239418555}


In [5]:
import os
import tempfile
from kswe import KSWE

def test_kswe(kswe, data):
    X, y_true = data['X_test'], data['y_test']
    y_pred = pd.Series(kswe.predict({'X': X})['predictions'])
    return {'r2_score': metrics.r2_score(y_true, y_pred)}   
    

with tempfile.TemporaryDirectory() as path:
    os.environ['H1ST_MODEL_REPO_PATH'] = path
    print(test_kswe(kswe, data['dataframe']))
    kswe.persist('my_v1')

    kswe = None
    kswe = KSWE(
        segmentor=CombinationSegmentor(),
        sub_model=MySubModel,
        ensemble=MajorityVotingEnsemble()
    )
    kswe.load_params('my_v1')
    print(test_kswe(kswe, data['dataframe']))

INFO:h1st.model.repository.model_repository:Model persistence currently supports only stats, model and metrics properties.
INFO:h1st.model.repository.model_repository:Make sure you store stastistic in stats property, models in model property and model metrics in metrics one.
INFO:h1st.model.repository.model_repository:Saving metrics property...
INFO:h1st.model.repository.model_repository:Saving stats property...
INFO:h1st.model.repository.model_repository:Saving model property...


{'r2_score': 0.9486960239418555}
my_v1_segment_0_lvl_1


INFO:h1st.model.repository.model_repository:Saving metrics property...
INFO:h1st.model.repository.model_repository:Saving stats property...
INFO:h1st.model.repository.model_repository:Saving model property...
INFO:h1st.model.repository.model_repository:Saving metrics property...
INFO:h1st.model.repository.model_repository:Saving stats property...
INFO:h1st.model.repository.model_repository:Saving model property...
INFO:h1st.model.repository.model_repository:Saving metrics property...
INFO:h1st.model.repository.model_repository:Saving stats property...
INFO:h1st.model.repository.model_repository:Saving model property...
INFO:h1st.model.repository.model_repository:Saving metrics property...
INFO:h1st.model.repository.model_repository:Saving stats property...
INFO:h1st.model.repository.model_repository:Saving model property...
INFO:h1st.model.repository.model_repository:Saving metrics property...
INFO:h1st.model.repository.model_repository:Saving stats property...
INFO:h1st.model.reposito

my_v1_segment_1_lvl_1
my_v1_segment_2_lvl_1
my_v1_segment_4_lvl_1
my_v1_segment_5_lvl_1
my_v1_segment_6_lvl_2
my_v1_segment_10_lvl_2
my_v1_segment_11_lvl_2
my_v1_segment_14_lvl_2
my_v1_segment_15_lvl_2
my_v1_segment_18_lvl_3
my_v1_segment_19_lvl_3


TypeError: 'MySubModel' object is not callable

In [None]:
test_data = {
    'X_test': data['dataframe']['X_test'][X_features],
    'y_test': data['dataframe']['y_test']
}
for name, model in kswe.sub_models.items():
    
    metrics = MySubModelModeler().evaluate_model(test_data, model)
    print(f'sub model {name} test resluts based on {test_data["X_test"].shape[0]} samples: {metrics}')

sub model segment_0_lvl_1 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_1_lvl_1 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_2_lvl_1 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_4_lvl_1 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_5_lvl_1 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_6_lvl_2 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_10_lvl_2 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_11_lvl_2 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_14_lvl_2 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_15_lvl_2 test resluts based on 60 samples: {'r2_score': 0.4869602394185548}
sub model segment_18_lvl_3 test resluts based on 60 samples: {'r2_

In [None]:
kswe.sub_model

<__main__.MySubModel at 0x7f9489639820>

In [None]:
assert 1 == 0

AssertionError: 

In [None]:
data = load_data()

In [None]:
data['dataframe'].keys()

dict_keys(['X_train', 'y_train', 'X_test', 'y_test'])

In [None]:
'dataframe' not in data

False

In [None]:
if 'dataframe' not in data or 'json' not in data:
    raise KeyError('key "dataframe" or "json" is not in your input_data')       

KeyError: 'key "dataframe" or "json" is not in your input_data'

In [None]:
# df_0['sepal_aspect_ratio'].hist(bins=20

In [None]:
# df_0['sepal_size'].hist(bins=20)

In [None]:
segmentation_features = {
    'sepal_size': [(None, 18.5), (18.5, None)],
    'sepal_aspect_ratio': [(None, 0.65), (0.65, None)],
    # 'species': [[0]]
}

In [None]:
cs = CombinationSegmentor()

In [None]:
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

results, filter_combinations = cs.process(
    data, 
    by=segmentation_features,
    min_data_size=40,
    levels=[2]
)

In [None]:
filter_combinations['segment_0_lvl_2']

[('sepal_size', (None, 18.5)), ('sepal_aspect_ratio', (None, 0.65))]

In [None]:
for k, v in results.items():
    print(k, v['X'].shape)

segment_0_lvl_2 (69, 4)
segment_2_lvl_2 (44, 4)


In [None]:
filter_combinations

{'segment_0_lvl_2': [('sepal_size', (None, 18.5)),
  ('sepal_aspect_ratio', (None, 0.65))],
 'segment_1_lvl_2': [('sepal_size', (None, 18.5)),
  ('sepal_aspect_ratio', (0.65, None))],
 'segment_2_lvl_2': [('sepal_size', (18.5, None)),
  ('sepal_aspect_ratio', (None, 0.65))],
 'segment_3_lvl_2': [('sepal_size', (18.5, None)),
  ('sepal_aspect_ratio', (0.65, None))]}

In [None]:
temp = {'df': 2, 'ddd': 6}

In [None]:
list(temp.values())[0]

2

In [None]:
X = results['segment_0_lvl_2'].iloc[:, :-1]
X.shape

In [None]:
df_all['species'].loc[X.index]

In [None]:
def save_coco(file, info, licenses, images, annotations, categories):
    with open(file, 'wt', encoding='UTF-8') as coco:
        json.dump({'info': info, 'licenses': licenses, 'images': images,
                   'annotations': annotations, 'categories': categories}, coco, indent=2, sort_keys=True)

    
def filter_annotations(annotations, images):
    image_ids = funcy.lmap(lambda i: int(i['id']), images)
    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)

In [None]:

def main(args):
    with open(args.annotations, 'rt', encoding='UTF-8') as annotations:
        coco = json.load(annotations)
        info = coco['info']
        licenses = coco['licenses']
        images = coco['images']
        annotations = coco['annotations']
        categories = coco['categories']

        number_of_images = len(images)

        images_with_annotations = funcy.lmap(lambda a: int(a['image_id']), annotations)

        if args.having_annotations:
            images = funcy.lremove(lambda i: i['id'] not in images_with_annotations, images)

        x, y = train_test_split(images, train_size=args.split)

        save_coco(args.train, info, licenses, x, filter_annotations(annotations, x), categories)
        save_coco(args.test, info, licenses, y, filter_annotations(annotations, y), categories)

        print("Saved {} entries in {} and {} in {}".format(len(x), args.train, len(y), args.test))

In [None]:
import json

data_path = '/Users/aitomatic/Desktop/dataset/furuno/sample_15mins/annotations.json'
with open(data_path, 'r', encoding='UTF-8') as annotations:
    coco = json.load(annotations)
info = coco['info']
licenses = coco['licenses']
images = coco['images']
annotations = coco['annotations']
categories = coco['categories']

In [None]:
logic_example = filter_combinations['segment_0_lvl_2']
logic_example

In [None]:
import funcy

1. create new features and save that in annotation
ex) 
- depth_of_bb
- aspect_ratio_of_bb
- size_of_bb
- datetime

2. make logics in this format. [('depth_of_bb', (None, 200)), ('aspect_ratio_of_bb', (None, 0.65))]

3. make a function get_segments_from_json(JSON, segmentation_logics) -> return segmented JSONs
- make sure images, annotations, and categories are synchronized. 

4. save those JSONs and move around files based on that json

In [None]:
def create_sample_features():
    annotation_json_path = '/Users/aitomatic/Desktop/dataset/furuno/sample_15mins/annotations.json'
    with open(annotation_json_path, 'r', encoding='UTF-8') as annotations:
        coco = json.load(annotations)
    info = coco['info']
    licenses = coco['licenses']
    images = coco['images']
    annotations = coco['annotations']
    categories = coco['categories']

    for idx in range(len(annotations)):
        print(idx)

In [None]:
images[0]

In [None]:
annotations[0]