## Load Required Libraries
 - ```pip install pandas```<br>
 - ```pip install scikit-learn```<br> 
 
## Set up Raw Data

In [20]:
import pandas as pd
import numpy as np

# Demo frame: an integer column plus a categorical column with repeated labels
# ('a' and 'b' appear twice; 'd' and 'e' only appear in the last rows).
df = pd.DataFrame(
    {
        'col1': np.arange(7),
        'col2': list('abcabde'),
    }
)
df.head()

Unnamed: 0,col1,col2
0,0,a
1,1,b
2,2,c
3,3,a
4,4,b


## Import Library

In [21]:
import py_topping.data_preparation as prep

***
## Create Lag
***

In [None]:
# Signature reference for create_lag. Not runnable as written: df_in, col_in and
# lag_range are placeholders for the caller's own values (this cell was not executed).
#   df_in     : DataFrame to add lag columns to
#   col_in    : list of column names to lag
#   lag_range : number of lags; the examples with lag_range = 2 produce lags 1 and 2
#   lag_name  : infix of the new column names, "<column>_<lag_name>_<k>"
#   drop_null : True drops the leading rows whose lag values are missing; False keeps them
#   debug     : NOTE(review): effect is not shown in this notebook - confirm in py_topping
prep.create_lag(df_in 
                , col_in 
                , lag_range 
                , lag_name = 'lag' 
                , drop_null = True
                , debug = False)

### Standard Lagging

In [23]:
# Lag both columns by 1 and 2 rows, leaving lag_name and drop_null at their defaults.
prep.create_lag(
    df_in=df,
    col_in=['col1', 'col2'],
    lag_range=2,
)

Unnamed: 0,col1,col2,col1_lag_1,col2_lag_1,col1_lag_2,col2_lag_2
2,2,c,1.0,b,0.0,a
3,3,a,2.0,c,1.0,b
4,4,b,3.0,a,2.0,c
5,5,d,4.0,b,3.0,a
6,6,e,5.0,d,4.0,b


### Edit Name

In [24]:
# Same lagging as before, but the new columns use 'GAG' instead of 'lag' in their names.
prep.create_lag(
    df_in=df,
    col_in=['col1', 'col2'],
    lag_range=2,
    lag_name='GAG',
)

Unnamed: 0,col1,col2,col1_GAG_1,col2_GAG_1,col1_GAG_2,col2_GAG_2
2,2,c,1.0,b,0.0,a
3,3,a,2.0,c,1.0,b
4,4,b,3.0,a,2.0,c
5,5,d,4.0,b,3.0,a
6,6,e,5.0,d,4.0,b


### Keep Null Value

In [25]:
# Keep the leading rows whose lag values are missing instead of dropping them.
prep.create_lag(
    df_in=df,
    col_in=['col1', 'col2'],
    lag_range=2,
    drop_null=False,
)

Unnamed: 0,col1,col2,col1_lag_1,col2_lag_1,col1_lag_2,col2_lag_2
0,0,a,,,,
1,1,b,0.0,a,,
2,2,c,1.0,b,0.0,a
3,3,a,2.0,c,1.0,b
4,4,b,3.0,a,2.0,c
5,5,d,4.0,b,3.0,a
6,6,e,5.0,d,4.0,b


***
## Encoder
***

### Create Encoder

In [None]:
# Signature reference for create_encoder. Not runnable as written: df_in, col_in and
# folder_in are placeholders (this cell was not executed).
#   df_in     : DataFrame holding the columns to build encoders from
#   col_in    : list of column names to encode
#   folder_in : folder in which the encoder models are stored (pkl format)
#   debug     : NOTE(review): effect is not shown in this notebook - confirm in py_topping
prep.create_encoder(df_in 
                    , col_in
                    , folder_in 
                    , debug = False)

### Encode Column

In [None]:
# Signature reference for encode_col. Not runnable as written: df_in, col_in and
# folder_in are placeholders (this cell was not executed).
#   df_in     : DataFrame whose columns should be encoded
#   col_in    : list of column names to encode
#   folder_in : folder that holds the encoders written by create_encoder
#   debug     : NOTE(review): effect is not shown in this notebook - confirm in py_topping
# Returns the encoded frame; in the example below, values not seen when the encoder
# was created are encoded as (max known code + 1).
prep.encode_col(df_in 
                , col_in 
                , folder_in 
                , debug = False)

### Decode Column

In [None]:
# Signature reference for decode_col. Not runnable as written: df_in, col_in and
# folder_in are placeholders (this cell was not executed).
#   df_in     : DataFrame whose encoded columns should be decoded
#   col_in    : list of column names to decode
#   folder_in : folder that holds the encoders written by create_encoder
#   debug     : NOTE(review): effect is not shown in this notebook - confirm in py_topping
# Returns the decoded frame; in the example below, codes that were not produced from
# the training data are decoded as "unknown".
prep.decode_col(df_in 
                , col_in 
                , folder_in 
                , debug = False)

### Encoder in Action

Rows 1 - 3 are the train data set<br>
Rows 4 - 7 are the test data set

In [26]:
# Label-based slice: .loc includes the end label, so this keeps index 0, 1 and 2.
df_train = df.loc[:2, :]
df_train

Unnamed: 0,col1,col2
0,0,a
1,1,b
2,2,c


In [27]:
# Everything from index label 3 onwards forms the test set.
df_test = df.loc[3:, :]
df_test

Unnamed: 0,col1,col2
3,3,a
4,4,b
5,5,d
6,6,e


**Create Encoder from train set**

In [28]:
# Build the encoder for col2 from the train rows only and store it in ./encoder.
prep.create_encoder(
    df_train,
    col_in=['col2'],
    folder_in='encoder',
)
print('Created')

Created


The encoder is stored as sklearn's LabelEncoder model <br>
in the folder named "encoder", in pkl format

**Transform train set**

In [29]:
# Encode col2 of the train set with the encoder stored in ./encoder.
df_train2 = prep.encode_col(
    df_in=df_train,
    col_in=['col2'],
    folder_in='encoder',
)
df_train2

Unnamed: 0,col1,col2
0,0,0
1,1,1
2,2,2


**Decode encoded train set**

In [30]:
# Round-trip check: decoding the encoded train set gives back the original labels.
prep.decode_col(
    df_in=df_train2,
    col_in=['col2'],
    folder_in='encoder',
)

Unnamed: 0,col1,col2
0,0,a
1,1,b
2,2,c


**Transform test set** <br>
Note that the test set contains "d" and "e", which are not in the train set

In [34]:
# Encode col2 of the test set with the encoder that was built from the train set.
df_test2 = prep.encode_col(
    df_in=df_test,
    col_in=['col2'],
    folder_in='encoder',
)
df_test2

Unnamed: 0,col1,col2
3,3,0
4,4,1
5,5,3
6,6,3


encode_col automatically uses max number + 1 for values it has never seen before
<br><br><br>
**Decode encoded test set**<br>
Note that the encoded test set contains "3", which is not in the train set

In [35]:
# Decode the encoded test set, including the code that has no matching train label.
prep.decode_col(
    df_in=df_test2,
    col_in=['col2'],
    folder_in='encoder',
)

Unnamed: 0,col1,col2
3,3,a
4,4,b
5,5,unknown
6,6,unknown


decode_col automatically decodes "3", which is not in the train set, as "unknown"

# Auto Explain Decision Tree Logic

# Article about this function
 - https://faun.pub/explain-decision-trees-logic-d1b06369bf3d
 
## Create Simple model

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

# Load iris, replace the spaces in the feature names with underscores,
# and fit a depth-limited decision tree on the full data set.
iris = load_iris()
col_name = [feature.replace(' ', '_') for feature in iris['feature_names']]
X = pd.DataFrame(iris['data'], columns=col_name)
y = iris['target']
# fit() returns the estimator itself, so construction and fitting can be chained.
model = DecisionTreeClassifier(random_state=0, max_depth=4).fit(X, y)
model

DecisionTreeClassifier(max_depth=4, random_state=0)

## Import Function

In [2]:
from py_topping.data_preparation import lazy_tree_logic

## Create explainer

In [4]:
# Build the explainer from the fitted tree and the names of the frame's columns.
tree_explain = lazy_tree_logic(model, X.columns.tolist())

## Explain the logic

In [5]:
# Explain every row using only the feature columns (col_name). Applying over all of X
# would, on a re-run of this cell, also pass the previously added 'reason' column to
# explain(); selecting the features keeps the cell idempotent.
X['reason'] = X[col_name].apply(tree_explain.explain, axis=1)
X.head()

Unnamed: 0,sepal_length_(cm),sepal_width_(cm),petal_length_(cm),petal_width_(cm),reason
0,5.1,3.5,1.4,0.2,"[[petal_width_(cm), <=, 0.8000], class: 0]"
1,4.9,3.0,1.4,0.2,"[[petal_width_(cm), <=, 0.8000], class: 0]"
2,4.7,3.2,1.3,0.2,"[[petal_width_(cm), <=, 0.8000], class: 0]"
3,4.6,3.1,1.5,0.2,"[[petal_width_(cm), <=, 0.8000], class: 0]"
4,5.0,3.6,1.4,0.2,"[[petal_width_(cm), <=, 0.8000], class: 0]"


In [6]:
# Show the full, untruncated explanation for the first row.
X.at[0, 'reason']

[['petal_width_(cm)', '<=', '0.8000'], 'class: 0']