In [None]:
%load_ext kedro.extras.extensions.ipython

In [None]:
%reload_kedro

In [None]:
import pandas as pd
import numpy as np

In [None]:
catalog.list()

In [None]:
articles = context.catalog.load('articles_sample')

In [None]:
customers = context.catalog.load('customers_sample')

In [None]:
transactions = context.catalog.load('transactions_sample')

### check results

In [None]:
auto_articles = context.catalog.load('automated_articles_features')

In [None]:
auto_customers = context.catalog.load('automated_customers_features')

In [None]:
auto_articles.head()

In [None]:
auto_articles.shape

In [None]:
auto_customers.head()

In [None]:
auto_customers.shape

In [None]:
# Show the automated article features with the columns in sorted name order.
ordered_article_cols = sorted(auto_articles.columns)
auto_articles.loc[:, ordered_article_cols]

In [None]:
# Customer features with columns in sorted name order; rows are ordered by
# the 'COUNT(transactions)_all' column, largest first.
ordered_customer_cols = sorted(auto_customers.columns)
(
    auto_customers
    .loc[:, ordered_customer_cols]
    .sort_values(by='COUNT(transactions)_all', ascending=False)
)

In [None]:
from featuretools.selection import (
    # remove_highly_correlated_features() does not work with Booleans in current version of featuretools
    #remove_highly_correlated_features,
    remove_highly_null_features,
    remove_single_value_features,
)

### my custom correlation

In [None]:
# Absolute value of the pairwise column correlations of the automated
# article features (pandas' default correlation method).
cor_matrix = auto_articles.corr().abs()

In [None]:
# Zero the self-correlations. `.values` is not guaranteed to be a writable
# view of the frame's data (it can be a copy, or read-only under pandas
# Copy-on-Write), so fill the diagonal on an explicit copy and rebuild the
# frame. Re-running this cell gives the same result.
cor_values = cor_matrix.to_numpy(copy=True)
np.fill_diagonal(cor_values, 0)
cor_matrix = pd.DataFrame(cor_values, index=cor_matrix.index, columns=cor_matrix.columns)

In [None]:
cor_matrix

upper = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool_))

In [None]:
def my_correlation(df, threshold=0.8, verbose=True):
    """Greedily flag the redundant columns of a correlation matrix.

    Columns are visited in their existing order. A column that has not been
    flagged yet is kept, and every *other* label whose entry in that column
    is at least ``threshold`` is flagged as correlated. Columns that are
    already flagged are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Square matrix whose index and columns carry the same labels, such as
        the result of ``DataFrame.corr().abs()``. The diagonal does not need
        to be zeroed beforehand.
    threshold : float, default 0.8
        Minimum value for a pair of columns to count as correlated.
    verbose : bool, default True
        Print the number of flagged and kept columns.

    Returns
    -------
    set
        Labels of the columns flagged as correlated with a kept column.
    """
    correlated_columns = set()
    non_correlated_columns = set()
    for col in df.columns:
        if col in correlated_columns:
            continue
        non_correlated_columns.add(col)

        above_threshold = df.index[df.loc[:, col].ge(threshold)]
        # A column always correlates perfectly with itself; skip it so that
        # a kept column is never flagged just because the diagonal is 1.
        correlated_columns.update(label for label in above_threshold if label != col)
    if verbose:
        print(f'{len(correlated_columns)=}')
        print(f'{len(non_correlated_columns)=}')
    return correlated_columns

In [None]:
my_correlation(cor_matrix)

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
# Missing-value count per customer feature, shown as a frame whose 'index'
# column holds the feature names and is used as the sort key.
customer_null_counts = auto_customers.isna().sum()
customer_null_counts.reset_index().sort_values(by='index')

In [None]:
auto_articles.isna().sum()

In [None]:
pd.set_option('display.max_rows', 30)

In [None]:
articles.shape, customers.shape, transactions.shape

In [None]:
pd.set_option('display.max_columns', None)

### featuretools

In [None]:
import featuretools as ft

In [None]:
es = ft.EntitySet(id="kaggle_hm_data")

In [None]:
import woodwork as ww

ww.list_logical_types()

In [None]:
from woodwork.logical_types import Categorical, Boolean, AgeNullable, Double, NaturalLanguage

In [None]:
customers.head()

In [None]:
# Register the customers table in the entity set, indexed by customer_id,
# with an explicit woodwork logical type for each listed column.
customer_logical_types = {
    "customer_id": Categorical,
    "FN": Boolean,
    "Active": Boolean,
    "club_member_status": Categorical,
    "fashion_news_frequency": Categorical,
    "age": AgeNullable,
}
es = es.add_dataframe(
    dataframe=customers,
    dataframe_name="customers",
    index="customer_id",
    logical_types=customer_logical_types,
)

In [None]:
# Copy the row index into a regular '_index' column; that column is what the
# transactions add_dataframe call names as the table's index.
# Note: this modifies the loaded `transactions` frame in place.
transactions['_index'] = transactions.index

In [None]:
# Register the transactions table: '_index' is the row identifier and
# 't_dat' is the time index.
transaction_logical_types = {
    "customer_id": Categorical,
    "article_id": Categorical,
    "price": Double,
    "sales_channel_id": Categorical,
}
es = es.add_dataframe(
    dataframe=transactions,
    dataframe_name="transactions",
    index="_index",
    time_index="t_dat",
    logical_types=transaction_logical_types,
)

In [None]:
# Register the articles table, indexed by article_id; the free-text
# description column is typed as NaturalLanguage.
article_logical_types = {
    "article_id": Categorical,
    "detail_desc": NaturalLanguage,
}
es = es.add_dataframe(
    dataframe=articles,
    dataframe_name="articles",
    index="article_id",
    logical_types=article_logical_types,
)

In [None]:
es

In [None]:
# Link each transaction to its customer through customer_id.
es = es.add_relationship(
    parent_dataframe_name="customers",
    parent_column_name="customer_id",
    child_dataframe_name="transactions",
    child_column_name="customer_id",
)

In [None]:
# Link each transaction to its article through article_id.
es = es.add_relationship(
    parent_dataframe_name="articles",
    parent_column_name="article_id",
    child_dataframe_name="transactions",
    child_column_name="article_id",
)

In [None]:
es

In [None]:
es['articles'].ww.schema

In [None]:
es['articles'].ww.schema

### articles

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
ft.primitives.list_primitives()

In [None]:
pd.set_option('display.max_rows', 30)

In [None]:
# Deep feature synthesis with articles as the target table, using the
# aggregation primitives listed here.
article_agg_primitives = [
    "sum",
    "mean",
    "median",
    "max",
    "min",
    "time_since_last",
    "count",
    "time_since_first",
]
articles_feature_matrix, articles_feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="articles",
    agg_primitives=article_agg_primitives,
)

In [None]:
list(enumerate(articles_feature_defs))

In [None]:
articles_feature_defs[0].get_name()

In [None]:
# Names of the article feature definitions that have at least one base feature.
feats = [
    feature_def.get_name()
    for feature_def in articles_feature_defs
    if feature_def.base_features
]

In [None]:
feats

In [None]:
articles_feature_matrix[feats]

In [None]:
articles_feature_defs[21].base_features

In [None]:
articles_feature_defs[10].base_features

In [None]:
dir(articles_feature_matrix)

In [None]:
ft.describe_feature(articles_feature_defs[30])

In [None]:
list(enumerate(articles_feature_defs))

### customers

In [None]:
# Deep feature synthesis with customers as the target table; the articles
# table is excluded from the synthesis.
customer_agg_primitives = [
    "sum",
    "mean",
    "median",
    "max",
    "min",
    "time_since_last",
    "count",
    "time_since_first",
    "avg_time_between",
]
customers_feature_matrix, customers_feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    ignore_dataframes=["articles"],
    agg_primitives=customer_agg_primitives,
)

In [None]:
list(customers_feature_matrix.ww.select(exclude=[Boolean]).ww.schema.columns.keys())

In [None]:
fm_to_check = customers_feature_matrix.ww.select(include=[Boolean])

In [None]:
ft.primitives.list_primitives()

In [None]:
customers_feature_matrix

In [None]:
list(enumerate(customers_feature_defs))

### transactions

transactions features won't be used

In [None]:
# Deep feature synthesis with transactions as the target table; no primitives
# are passed, so featuretools' defaults apply.
transactions_feature_matrix, transactions_feature_defs = ft.dfs(entityset=es, target_dataframe_name="transactions")

In [None]:
# First value, by position, of the row labelled 0. The row is a Series
# indexed by feature-matrix column labels, so a bare `[0]` relies on pandas'
# deprecated positional fallback; `.iloc[0]` states the intent explicitly.
transactions_feature_matrix.loc[0, :].iloc[0]

In [None]:
transactions[transactions.customer_id=='0090852ad6a446ccf02a7d57ec7385c360ab33a813b870e7e725beae6a76ddd4']

In [None]:
transactions_feature_matrix

In [None]:
transactions_feature_defs

### remove

In [None]:
customers_feature_matrix

In [None]:
articles_feature_matrix

In [None]:
from featuretools.selection import (
    remove_highly_correlated_features,
    remove_highly_null_features,
    remove_single_value_features,
)

In [None]:
remove_highly_null_features(customers_feature_matrix, pct_null_threshold=0.2)

In [None]:
new_fm, new_features = remove_single_value_features(customers_feature_matrix, features=customers_feature_defs)

In [None]:
new_fm.shape

In [None]:
# Names of the feature definitions that are in customers_feature_defs but not
# in new_features, i.e. the ones the selection step dropped. The previous
# new-minus-old difference is empty whenever new_features is a filtered
# subset of customers_feature_defs. Comparing by name avoids relying on
# equality between feature objects.
{f.get_name() for f in customers_feature_defs} - {f.get_name() for f in new_features}

In [None]:
customers_feature_matrix

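Bug in featuretools: `remove_highly_correlated_features` can't call `.corr()` on Boolean features, so the call below does not work on the full customers feature matrix.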

https://github.com/alteryx/featuretools/issues/2229

In [None]:
# Demonstrates the known failure on Boolean features (see the issue linked
# above). The error is caught and displayed so that "Run All" can continue
# past this cell; the exact exception type is not pinned down here.
try:
    new_fm, new_features = remove_highly_correlated_features(
        customers_feature_matrix, features=customers_feature_defs
    )
except Exception as err:
    print(f"remove_highly_correlated_features failed: {type(err).__name__}: {err}")

In [None]:
cols = list(customers_feature_matrix.ww.select(exclude=[Boolean]).ww.schema.columns.keys())

In [None]:
cols

In [None]:
customers_feature_matrix.ww.select(exclude=[Boolean])

In [None]:
# Run the correlation filter on the non-Boolean columns only.
# `features` must be the feature definitions that match the matrix columns,
# not the column-name strings, so keep the definitions whose name survives
# the Boolean filter.
non_boolean_fm = customers_feature_matrix.ww.select(exclude=[Boolean])
non_boolean_names = set(non_boolean_fm.columns)
non_boolean_defs = [f for f in customers_feature_defs if f.get_name() in non_boolean_names]
new_fm, new_features = remove_highly_correlated_features(
    non_boolean_fm,
    features=non_boolean_defs,
)

In [None]:
list(customers_feature_defs[2:])

In [None]:
# Same filter, skipping the leading columns and their feature definitions.
# NOTE(review): presumably the first two features are the Boolean ones; confirm.
n_leading_skipped = 2
new_fm, new_features = remove_highly_correlated_features(
    customers_feature_matrix.iloc[:, n_leading_skipped:],
    features=customers_feature_defs[n_leading_skipped:],
)

In [None]:
new_fm