* fft

* Catboost on fft only

* keras conv1d

* ts-fresh

* cum sum first n-columns

* groupby columns

In [None]:
pip install tsfresh

In [None]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import find_peaks
from scipy.fft import fft, fftfreq

from catboost import CatBoostClassifier, Pool
import optuna
import shap
shap.initjs()

from IPython.display import FileLink, display

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import f1_score

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

pd.set_option("max_info_columns", 500)

Config

In [None]:
class Config:
    """Notebook-wide switches read by the cells below."""
    # When True, the K-fold training cell and the fold-averaged submission cell run.
    cv: bool = False
    # Not read by any code visible in this notebook.
    single_model: bool = True
    # When True, the tsfresh feature matrices replace the raw frames for
    # the train/validation split, the CatBoost pools and the submission pool.
    ts_fresh_feat: bool = True

In [None]:
def is_kaggle_gpu_enabled():
    """Return True when TensorFlow lists at least one local GPU device.

    The device list is inspected by type rather than by length: a device count
    greater than one does not by itself prove that one of them is a GPU
    (NOTE(review): some TensorFlow builds are reported to list extra non-GPU
    devices such as XLA_CPU - confirm for the target image).

    Returns
    -------
    bool
        True if any entry of ``device_lib.list_local_devices()`` has
        ``device_type == 'GPU'``.
    """
    # Imported lazily so that TensorFlow is only loaded when the check is used.
    from tensorflow.python.client import device_lib

    return any(
        device.device_type == 'GPU'
        for device in device_lib.list_local_devices()
    )

### Считывание данных

In [None]:
# Single place to change when the notebook runs outside the Kaggle input mount.
DATA_DIR = "/kaggle/input/smit-tgt-2023/data"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# train_aligndf.info();
# train_df.columns
train_df.head(5)

In [None]:
def get_features(df):
    """Append row-wise summary statistics of the signal columns to ``df``.

    Every statistic is computed from one fixed snapshot of the signal columns,
    i.e. all columns of ``df`` except ``'label'`` and except the feature columns
    this function itself creates. Consequently the target never leaks into a
    feature, train and test frames get identically defined features, and
    calling the function twice on the same frame yields the same values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the signal columns (must include ``'data_300'``) and,
        optionally, a ``'label'`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns ``nans_c``, ``mean``, ``max``,
        ``min``, ``med``, ``std``, ``q_5`` ... ``q_95`` and ``cum_sum`` set.

    Raises
    ------
    KeyError
        If the signal columns do not contain ``'data_300'``.
    """
    quantile_levels = {
        'q_5': 0.5, 'q_6': 0.6, 'q_65': 0.65, 'q_7': 0.7, 'q_75': 0.75,
        'q_8': 0.8, 'q_85': 0.85, 'q_9': 0.9, 'q_95': 0.95,
    }
    generated = {'nans_c', 'mean', 'max', 'min', 'med', 'std', 'cum_sum', *quantile_levels}

    # Selecting with a list returns a new frame, so the columns assigned to
    # `df` below cannot feed back into later statistics.
    signal = df[[col for col in df.columns if col != 'label' and col not in generated]]

    df['nans_c'] = signal.isna().sum(axis=1).values
    df['mean'] = signal.mean(axis=1, skipna=True).values
    df['max'] = signal.max(axis=1, skipna=True).values
    df['min'] = signal.min(axis=1, skipna=True).values
    df['med'] = signal.median(axis=1, skipna=True).values
    df['std'] = signal.std(axis=1, skipna=True).values

    for column, level in quantile_levels.items():
        df[column] = signal.quantile(q=level, axis=1).values

    # Running sum (NaN treated as 0) of the signal columns up to and including 'data_300'.
    df['cum_sum'] = signal.fillna(0).cumsum(axis=1, skipna=True)['data_300'].values
    return df


def get_fft_features(row, full=False, exclude=('label',)):
    """Return the single-sided FFT amplitude spectrum of one sample row.

    Parameters
    ----------
    row : pandas.Series
        One row of the data frame, as passed by ``DataFrame.apply(axis=1)``.
    full : bool, default False
        When True return every bin of the spectrum, otherwise only its last
        30 entries.
    exclude : iterable of str, default ('label',)
        Index entries removed from ``row`` before the transform, so that the
        target column of a training row is not treated as a signal sample.
        Entries that are absent from ``row`` are ignored.

    Returns
    -------
    numpy.ndarray
        ``2 / N * |FFT(values)|`` for the first ``N // 2`` bins, where ``N`` is
        the number of remaining values (NaN replaced by 0); truncated to the
        last 30 bins unless ``full`` is True. Empty when the row has no values.
    """
    signal = row.drop(labels=list(exclude), errors='ignore')
    values = np.nan_to_num(signal.values)  # NaN samples are transformed as 0

    n_samples = values.size
    if n_samples == 0:
        # Avoid dividing by zero in the amplitude scaling below.
        return np.zeros(0)

    spectrum = 2.0 / n_samples * np.abs(fft(values)[:n_samples // 2])
    return spectrum if full else spectrum[-30:]
    

def get_fft_peaks_features(row):
    """Return the bin indices of the last (up to) 10 peaks of the row's spectrum.

    The full spectrum from ``get_fft_features`` is searched with
    ``find_peaks(height=0.02, distance=4)``. The result always has length 10:
    peak indices are right-aligned and the unused leading slots hold 0.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(10,)``.
    """
    n_slots = 10

    spectrum = get_fft_features(row, full=True)
    peak_idx, _ = find_peaks(spectrum, height=0.02, distance=4)
    peak_idx = peak_idx[-n_slots:]  # keep only the last n_slots peaks

    padded = np.zeros(n_slots)
    padded[n_slots - peak_idx.shape[0]:] = peak_idx
    return padded.astype(int)

def get_ts_fresh_features(df, filtered=True, y=None):
    """Extract tsfresh features from the first 300 columns of ``df``.

    Each row of ``df`` is treated as one time series: the slice is transposed
    and melted into long format, where ``'variable'`` holds the original row
    label and is used as the tsfresh series id. NaN values are replaced by 0
    before extraction and the extracted matrix is imputed in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first 300 columns are the signal samples.
    filtered : bool, default True
        When True, keep only the features that ``select_features`` retains
        with respect to the target.
    y : pandas.Series, optional
        Target used for the selection. Defaults to ``df['label']`` so the
        labels always belong to the rows that were actually passed in.

    Returns
    -------
    pandas.DataFrame
        Extracted (and optionally filtered) feature matrix.

    Raises
    ------
    ValueError
        If ``filtered`` is True, ``y`` is not given and ``df`` has no
        ``'label'`` column.
    """
    long_format = df.iloc[:, 0:300].T.melt().fillna(0)
    extracted_feat = extract_features(long_format, column_id='variable')
    impute(extracted_feat)

    if not filtered:
        return extracted_feat

    if y is None:
        if 'label' not in df.columns:
            raise ValueError(
                "filtered=True needs a target: pass y or include a 'label' column in df"
            )
        y = df['label']

    return select_features(extracted_feat, y=y)

In [None]:
# train_df.iloc[0, 0:10].rolling(5).mean() # axis=4

In [None]:
# The peak and FFT arrays are computed first, while the frames still hold only
# the columns as loaded; get_features() below adds columns to both frames.
# No active code in the visible notebook consumes these arrays: the concat
# that would attach them to the frames is commented out further down.

# Peaks
train_df_peaks = np.array(train_df.apply(get_fft_peaks_features, axis=1).values.tolist())
test_df_peaks = np.array(test_df.apply(get_fft_peaks_features, axis=1).values.tolist())

# FFT
train_df_fft = np.array(train_df.apply(get_fft_features, axis=1).values.tolist())
test_df_fft = np.array(test_df.apply(get_fft_features, axis=1).values.tolist())

# Other features
# get_features modifies the frame it receives and returns that same frame.
train_df = get_features(train_df)
test_df = get_features(test_df)

# # Concat # FFT
# train_df = pd.concat([train_df, pd.DataFrame(train_df_fft)], axis=1)
# test_df = pd.concat([test_df, pd.DataFrame(test_df_fft)], axis=1)
# # Peaks
# train_df = pd.concat([train_df, pd.DataFrame(train_df_peaks, columns=['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9' , 'p10'])], axis=1)
# test_df = pd.concat([test_df, pd.DataFrame(test_df_peaks, columns=['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9' , 'p10'])], axis=1)

Features from tsfresh

In [None]:
# Build the tsfresh feature matrices only when the switch is on.
if Config.ts_fresh_feat:
    # Train: extract and keep only the features selected against the label.
    train_ts_feat = get_ts_fresh_features(train_df, filtered=True)
    # Test: extract everything (no labels available for selection) ...
    test_ts_feat = get_ts_fresh_features(test_df, filtered=False)
    # ... then keep the same columns, in the same order, as the train matrix.
    test_ts_feat = test_ts_feat.loc[:, train_ts_feat.columns]

In [None]:
# train_df.isna().sum(axis=1).valuesaggregate[0:15]

In [None]:
# train_df.isna().sum(axis=1).value_counts()

In [None]:
# np.searchsorted(train_df.isna().sum(axis=1).values, [52, 46])

In [None]:
# train_df['label'].values[train_df['label'].values == 0].size, train_df['label'].values[train_df['label'].values == 1].size

### Данные

#### Split

In [None]:
# Hold out 33% of the labelled rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=['label']),
    train_df['label'],
    test_size=0.33,
    random_state=43,
)

In [None]:
# When tsfresh features are enabled, replace the split made in the previous
# cell with one over the tsfresh matrix (same test_size and random_state).
if Config.ts_fresh_feat:
    X_train, X_test, y_train, y_test = train_test_split(train_ts_feat, train_df['label'], test_size=0.33, random_state=43)

In [None]:
# Sanity check: the test frame is expected to contain exactly 450 rows.
assert test_df.shape[0] == 450

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_test, y_test)

# Build exactly one submission pool, over the same kind of features that the
# train/validation pools use (no throwaway pool over the raw test frame).
submit_pool = Pool(test_ts_feat) if Config.ts_fresh_feat else Pool(test_df)

### Модель

In [None]:
# Keyword arguments for CatBoostClassifier; used both for the single model and
# for every model created by get_model(). Commented-out entries are options
# that were tried and are currently disabled.
params = {
    'iterations': 700, # 15000 # 750
    'depth': 7,
#     'learning_rate': 0.2,
#     'loss_function': 'MultiClass', # MultiClassOneVsAll
    'auto_class_weights': 'Balanced',
    'random_seed': 42, 
    'verbose': 100,
    'l2_leaf_reg': 50,
    'random_strength': 0.2,
#     'bagging_temperature': 1,
#     'grow_policy' : 'Depthwise',
    'eval_metric': 'TotalF1',
#     'custom_metric': 'Recall',
}

# Request GPU training only when the helper reports that a GPU is available.
if is_kaggle_gpu_enabled():
    params.update({'task_type': 'GPU', 'devices': '0:1'})

In [None]:
def optimize_hp(trial):
    """Optuna objective: train one CatBoost model and return its validation F1.

    Hyper-parameters are sampled from ``trial``; the model is fitted on the
    notebook-level ``train_pool`` with ``val_pool`` as evaluation set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float
        Micro-averaged F1 of the predictions on ``val_pool``, scored against
        the labels stored in that same pool.
    """
    cb_params = {
        'iterations': 10000,
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 1.0, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 100, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 20.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1.0, 2.0),
        'depth': trial.suggest_int('depth', 3, 9),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'use_best_model': True,
        'task_type': 'GPU',
        'loss_function': 'MultiClass',  # alternative: MultiClassOneVsAll
        'random_seed': 42,
        'eval_metric': 'TotalF1',
        'auto_class_weights': 'Balanced',
#         'custom_metric': 'Recall',
    }

    model = CatBoostClassifier(**cb_params)
    model.fit(train_pool, eval_set=val_pool, verbose=1000)
    y_pred = model.predict(val_pool)

    # Take the ground truth from the pool that was predicted on, so the score
    # stays valid even if `val_pool` was rebuilt after `y_test` was created.
    return f1_score(val_pool.get_label(), np.ravel(y_pred), average='micro')


# if Config.optuna:
# #     Commented out because it takes a long time to run. Running it yields the tuned hyperparameters.
#     study = optuna.create_study(direction="maximize")
#     study.optimize(optimize_hp, n_trials=10)
#     print('Trials:', len(study.trials))
#     print('Best parameters:', study.best_trial.params)
#     print('Best score:', study.best_value)
    
#     params.update(study.best_trial.params)
#     model = CatBoostClassifier(**params)

In [None]:
cb_clf = CatBoostClassifier(**params)

### Обучение

In [None]:
# Fit the single model on train_pool, with val_pool as the evaluation set.
# use_best_model=True is passed so the evaluation set decides which iteration
# is kept (see the CatBoost documentation for the exact rule).
cb_clf.fit(train_pool,use_best_model=True, eval_set = val_pool)
# cb_clf.fit(train_pool)

In [None]:
cb_clf.get_all_params()

#### CrossVal

In [None]:
def get_model():
    """Create a new, unfitted CatBoostClassifier from the notebook-level ``params``."""
    classifier = CatBoostClassifier(**params)
    return classifier

def get_pool(train_index, test_index):
    """Build the CatBoost pools for one cross-validation fold.

    Parameters
    ----------
    train_index, test_index : array-like of int
        Positional row indices of the training and validation part of the fold.

    Returns
    -------
    tuple of Pool
        ``(train_pool, val_pool, submit_pool)``. The first two are cut from the
        tsfresh matrix when ``Config.ts_fresh_feat`` is set, otherwise from
        ``train_df`` without its ``'label'`` column; the third always covers
        the whole test set.
    """
    assert test_df.shape[0] == 450

    # Pick the feature source once, then slice it for both parts of the fold.
    if Config.ts_fresh_feat:
        fold_features, submit_features = train_ts_feat, test_ts_feat
    else:
        fold_features, submit_features = train_df.drop(columns=['label']), test_df

    labels = train_df['label']
    fit_pool = Pool(fold_features.iloc[train_index, :], labels.iloc[train_index])
    holdout_pool = Pool(fold_features.iloc[test_index, :], labels.iloc[test_index])

    return fit_pool, holdout_pool, Pool(submit_features)

def val_metric(val_true, val_pred):
    """Print the binary-averaged F1 score of ``val_pred`` against ``val_true``."""
    score = f1_score(val_true, val_pred, average='binary')
    print(score)

In [None]:
if Config.cv:
    val_preds = []
    submit_preds = []
    submit_probas = []

    kf = KFold(n_splits=3, random_state=42, shuffle=True)
    # rkf = RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124)
    for i, (train_index, test_index) in enumerate(kf.split(train_df)):
        print(f'fold: {i}')

        model = get_model()
        # Fold-local names: the notebook-level train_pool / val_pool /
        # submit_pool are still needed unchanged by the cells further down.
        fold_train_pool, fold_val_pool, fold_submit_pool = get_pool(train_index, test_index)
        model.fit(fold_train_pool, use_best_model=True, eval_set=fold_val_pool)

        val_pred = model.predict(fold_val_pool)
        val_preds.append(val_pred)
        val_metric(fold_val_pool.get_label(), val_pred)

        # Predict the submission set with the model trained on THIS fold, so
        # that averaging over folds combines different models.
        submit_preds.append(model.predict(fold_submit_pool))
        submit_probas.append(model.predict_proba(fold_submit_pool)[:, 0])

Average the fold results and predict

In [None]:
if Config.cv:
    # Soft voting: average the first probability column over the folds and
    # threshold it. The mapping below treats that column as label 0 and the
    # other class as label 1 (assumes the labels are 0/1 - TODO confirm).
    predicted_probas = np.mean(submit_probas, axis=0)
    predicted_label = np.where(predicted_probas > 0.5, 0, 1)

    file_name = "./cb_cv_tsfr.csv"

    result_df = pd.DataFrame({"label":predicted_label})
    result_df.to_csv(file_name, index=False)
    display(FileLink(file_name))

In [None]:
# `predicted_label` is only assigned by the CV cells at this point of the
# notebook; guard the lookup so a fresh run with Config.cv = False does not
# stop here with a NameError.
predicted_label.shape if Config.cv else None

### Валидация

In [None]:
# Rebuild the hold-out pool from X_test / y_test so the predictions and the
# ground truth used for scoring are guaranteed to describe the same rows,
# whatever `val_pool` currently refers to.
holdout_pool = Pool(X_test, y_test)

val_preds = cb_clf.predict(holdout_pool)
# val_preds = np.where(cb_clf.predict_proba(val_pool)[:, 0] > 0.35, 0, 1)

# One inference pass; both probability columns are sliced from it.
val_class_probas = cb_clf.predict_proba(holdout_pool)
val_preds_proba = val_class_probas[:, 0]
val_preds_proba_1 = val_class_probas[:, 1]

print(f1_score(y_test, val_preds, average='binary'))
print(val_preds_proba.mean(), val_preds_proba.std())
print(val_preds_proba_1.mean(), val_preds_proba_1.std())

In [None]:
# val_preds_proba[val_preds_proba < 0.75].sum()

### FE

In [None]:
def plot_feature_importance(importance, names, limit=20):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    importance : array-like
        Importance value of each feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    limit : int, default 20
        Number of top-ranked features to show.
    """
    ranked = (
        pd.DataFrame({
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        })
        .sort_values(by=['feature_importance'], ascending=False)
        .iloc[:limit, :]
    )

    plt.figure(figsize=(10,8))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

In [None]:
plot_feature_importance(cb_clf.get_feature_importance(), X_train.columns)

SHAP

In [None]:
# Explain the fitted single model; SHAP values are computed for the rows of
# train_pool, which the next cell plots against X_train.
explainer = shap.TreeExplainer(cb_clf)
shap_values = explainer.shap_values(train_pool)

In [None]:
shap.summary_plot(shap_values, X_train,max_display=30)

### Сабмит

In [None]:
# Run inference on the submission pool once and slice both columns from it.
# The variable names keep their original spelling in case they are referenced
# from a part of the notebook that is not visible here.
submit_class_probas = cb_clf.predict_proba(submit_pool)

sumbit_probas = submit_class_probas[:, 0]
print(sumbit_probas.mean(), sumbit_probas.std())

sumbit_probas_1 = submit_class_probas[:, 1]
sumbit_probas_1.mean(), sumbit_probas_1.std()

In [None]:
# Write the single-model submission: one `label` column, no index column.
file_name = "./cb_tsfr.csv"

predicted_label = cb_clf.predict(submit_pool)

result_df = pd.DataFrame({"label":predicted_label})
result_df.to_csv(file_name, index=False)
# Last expression of the cell: rendered as a link to the written file.
FileLink(file_name)

Others

FFT

In [None]:
# sample_id = 61
# values = train_df[train_df.label == 1].iloc[sample_id, 0:300].values
# # values = values[~np.isnan(values)]
# values = np.nan_to_num(values)

# # Number of sample points
# N = values.size
# # sample spacing
# T = 1.0 / 117200
# x = np.linspace(0.0, N*T, N, endpoint=False)
# y = values
# yf = fft(y)
# xf = fftfreq(N, T)[:N//2]

# psd = 2.0/N * np.abs(yf[0:N//2])

# plt.plot(xf, 2.0/N * np.abs(yf[0:N//2]))
# plt.grid()
# plt.show()

Peaks

In [None]:
# func_values = psd # [12:24]
# peaks, _ = find_peaks(func_values, height=0.02, distance=4)
# print("Количество пиков: ", len(peaks))
# print(peaks)

# a = np.zeros(10)
# a[(a.shape[0] - peaks.shape[0]):] = peaks
# a = a.astype(int)