In [None]:
from operator import itemgetter

In [None]:
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
from basedir import SAMPLE
from info import id_cols
from utils import to_feather, from_feather, starts, dropcols, float64

In [None]:
# Load the three datasets used below via the project's `from_feather` helper:
# `x_trn`/`x_tst` feed the scaler and feature extractors, `y_trn` supplies the
# 'surface' labels. (Presumably read from cached feather files — TODO confirm.)
x_trn, y_trn, x_tst = from_feather('x_trn', 'y_trn', 'x_tst')

In [None]:
def kmeans_cluster(ser_id, feat_cols, data, k=4, normalize=True, random_state=None):
    """Summarise one series by the centroids of a k-means fit on its features.

    Parameters
    ----------
    ser_id : hashable
        Identifier returned unchanged so the caller can match the vector to
        its series.
    feat_cols : list-like of column labels
        Columns of ``data`` to cluster on.
    data : DataFrame
        Rows belonging to a single series; only ``data[feat_cols]`` is used.
    k : int, default 4
        Number of clusters.
    normalize : bool, default True
        Accepted for backward compatibility; currently ignored.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans``. Pass an int for reproducible centroids.

    Returns
    -------
    tuple
        ``(ser_id, vec)`` where ``vec`` is a 1-D array holding the ``k``
        centroids laid end to end, sorted lexicographically by coordinate.
    """
    kmeans = KMeans(n_clusters=k, random_state=random_state).fit(data[feat_cols])
    centers = np.asarray(kmeans.cluster_centers_)
    # k-means numbers its clusters arbitrarily, so without a canonical order
    # the same position in the flattened vector would describe a different
    # cluster from one series (or one run) to the next. np.lexsort treats its
    # LAST key as primary, hence the reversed transpose to sort by column 0
    # first, then column 1, and so on.
    order = np.lexsort(centers.T[::-1])
    return ser_id, centers[order].flatten()

In [None]:
def cluster(data, func, **params):
    """Run ``func`` once per series and stack the returned vectors row-wise.

    Parameters
    ----------
    data : DataFrame
        Long-format frame containing the id column.
    func : callable
        Called as ``func(ser_id, group_without_id_col, **params)`` and
        expected to return a ``(ser_id, vector)`` pair.
    **params
        Forwarded verbatim to ``func``. The optional ``id_col`` entry
        (default ``'series_id'``) also selects the grouping column.
        NOTE(review): ``id_col`` is read but not removed, so when supplied
        it is forwarded to ``func`` as well — confirm that is intended.

    Returns
    -------
    numpy.ndarray
        One row per series, ordered by ascending series id.

    Raises
    ------
    ValueError
        If ``data`` yields no groups.
    """
    id_col = params.get('id_col', 'series_id')
    with joblib.Parallel(n_jobs=1, backend='loky') as parallel:
        results = parallel(
            joblib.delayed(func)(ser_id, dropcols(group, [id_col]), **params)
            for ser_id, group in data.groupby(id_col))
    if not results:
        # Previously surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError('cluster() got no groups to process: data is empty')
    ordered = sorted(results, key=itemgetter(0))
    # np.vstack is the supported spelling of the deprecated np.row_stack alias.
    return np.vstack([vec for _, vec in ordered])

In [None]:
class ColumnsScaler(BaseEstimator, TransformerMixin):
    """Standardise a subset of DataFrame columns and pass the rest through.

    Parameters
    ----------
    cols : list-like of column labels
        Columns to standardise with ``StandardScaler``.

    Attributes
    ----------
    scaler_ : StandardScaler or None
        The fitted scaler; ``None`` until ``fit`` has been called.
    """

    def __init__(self, cols):
        self.cols = cols
        self.scaler_ = None

    def fit(self, X, y=None):
        """Learn the scaling statistics of ``X[self.cols]``; returns ``self``.

        The columns go through the project helper ``float64`` first
        (presumably a cast to float64 — TODO confirm).
        """
        scaler = StandardScaler()
        scaler.fit(float64(X[self.cols]))
        self.scaler_ = scaler
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.cols`` replaced by scaled values.

        Column order and the index of ``X`` are preserved; ``X`` itself is
        not modified.
        """
        # Label the scaler output with the index and column names it was
        # computed from. Assigning by label keeps every value in its own
        # column even when `self.cols` is ordered differently from
        # `X.columns`, and keeps rows aligned when `X` has a non-default index.
        scaled = pd.DataFrame(
            np.asarray(self.scaler_.transform(float64(X[self.cols]))),
            index=X.index,
            columns=list(self.cols))
        out = X.copy()
        for col in scaled.columns:
            # Replaces the column in place, so its position in the frame is kept.
            out[col] = scaled[col]
        return out

In [None]:
class ClusterFeatures(BaseEstimator, TransformerMixin):
    """Turn a long per-series frame into one clustering-based vector per series.

    Parameters
    ----------
    id_col : column label
        Column whose values identify the series to group by.
    feat_cols : list-like of column labels
        Feature columns handed to ``func``.
    func : callable, default ``kmeans_cluster``
        Called as ``func(ser_id, feat_cols, group, **cluster_params)`` and
        expected to return a ``(ser_id, vector)`` pair.
    cluster_params : dict or None, default None
        Extra keyword arguments for ``func``; ``None`` means no extras.
    n_jobs : int, default 12
        Number of joblib workers used in ``transform``.
    """

    def __init__(self, id_col, feat_cols, func=kmeans_cluster, cluster_params=None, n_jobs=12):
        self.id_col = id_col
        self.feat_cols = feat_cols
        self.func = func
        # Stored exactly as given: sklearn.base.clone rejects estimators whose
        # constructor replaces a parameter, which `cluster_params or {}` did
        # whenever the value was None or an empty dict.
        self.cluster_params = cluster_params
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing is learned; present for the transformer API. Returns ``self``."""
        return self

    def transform(self, X):
        """Apply ``func`` to each series and stack the vectors row-wise.

        Returns a 2-D array with one row per series, ordered by ascending
        series id. Raises ``ValueError`` if ``X`` yields no groups.
        """
        params = self.cluster_params or {}
        with joblib.Parallel(n_jobs=self.n_jobs, backend='loky') as parallel:
            results = parallel(
                # Pass the actual group key. The id column *name* used to be
                # passed here, which made the sort below a no-op on a
                # constant key.
                joblib.delayed(self.func)(ser_id, self.feat_cols, group, **params)
                for ser_id, group in X.groupby(self.id_col))
        if not results:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError('ClusterFeatures.transform got no groups: X is empty')
        ordered = sorted(results, key=itemgetter(0))
        # np.vstack is the supported spelling of the deprecated np.row_stack alias.
        return np.vstack([vec for _, vec in ordered])

In [None]:
# Standardise every float32 column. The statistics are learned on the training
# frame only and then re-used unchanged for the test frame.
float_cols = x_trn.select_dtypes(np.float32).columns
scaler = ColumnsScaler(cols=float_cols).fit(x_trn)
x_trn_scaled = scaler.transform(x_trn)
x_tst_scaled = scaler.transform(x_tst)

In [None]:
# One extractor per sensor family; each clusters only the columns picked out
# by `starts` for its name prefix.
clust_orient, clust_ang, clust_lin = (
    ClusterFeatures('series_id', starts(x_trn_scaled, prefix))
    for prefix in ('orient', 'ang', 'lin')
)

In [None]:
# Training matrix: the three per-series feature blocks placed side by side,
# in the order orient, ang, lin.
x_trn_vec = np.column_stack([
    extractor.fit_transform(x_trn_scaled)
    for extractor in (clust_orient, clust_ang, clust_lin)
])

In [None]:
# Test matrix, built with the same extractors and block order as the training
# matrix. Only `transform` is called: test data must never go through `fit`,
# even though ClusterFeatures.fit currently learns nothing.
x_tst_vec = np.column_stack([
    clust_orient.transform(x_tst_scaled),
    clust_ang.transform(x_tst_scaled),
    clust_lin.transform(x_tst_scaled)
])

In [None]:
# Map the surface names to integer class ids. `enc` is kept so predictions can
# be decoded back to names when the submission is written.
enc = LabelEncoder().fit(y_trn['surface'])
y_enc = enc.transform(y_trn['surface'])

In [None]:
# Seed the forest so repeated runs on the same feature matrix give the same
# model. The k-means features upstream are still stochastic unless they are
# seeded separately.
RANDOM_STATE = 42
model = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE)

In [None]:
# Train on the per-series cluster features against the integer-encoded surfaces.
model.fit(x_trn_vec, y_enc)

In [None]:
# Integer class ids for the test series; they are decoded back to surface
# names with `enc` when the submission is written.
preds = model.predict(x_tst_vec)

In [None]:
# Fill the sample submission with the decoded predictions and save it.
# NOTE(review): assumes the sample file lists series in the same ascending
# series-id order as the rows of the prediction array — confirm.
submit = pd.read_csv(SAMPLE)
submit['surface'] = enc.inverse_transform(preds)
submit.to_csv('submit.csv', index=False)

In [None]:
# Upload submit.csv to the `career-con-2019` competition through the Kaggle CLI.
# NOTE(review): this performs a real submission every time the cell runs
# (including "Run All"); presumably needs configured Kaggle credentials — confirm.
!kaggle c submit career-con-2019 -f 'submit.csv' -m "One more attempt with simple clustering"