In [1]:
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.cluster import KMeans
import xgboost as xgb

# Shared random seed; passed as random_state to TruncatedSVD and KMeans further down.
RND_STATE = 874

In [2]:
#!pip install watermark
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [3]:
%watermark -v -m -p numpy,scipy,pandas,matplotlib,statsmodels,sklearn,xgboost,seaborn -g

CPython 3.6.3
IPython 5.3.0

numpy 1.14.2
scipy 1.0.0
pandas 0.20.1
matplotlib 2.0.2
statsmodels 0.8.0
sklearn 0.19.1
xgboost 0.7
seaborn 0.8.1

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 69 Stepping 1, GenuineIntel
CPU cores  : 4
interpreter: 64bit
Git hash   :


In [4]:
# Read the contest training and test tables from the current working directory.
train_data, test_data = (pd.read_csv(csv_name)
                         for csv_name in ('contest_train.csv', 'contest_test.csv'))

# Columns by datatype

In [5]:
# Columns marked for removal. The list is not referenced by the visible pipeline cells;
# these columns are left out simply because no selector ever picks them.
col_drop = ['FEATURE_3', 'FEATURE_144', 'FEATURE_249', 'FEATURE_256'] # to be dropped

# Columns judged redundant; like col_drop, this list is not referenced by the visible pipeline cells.
col_redundant = ['FEATURE_117', 'FEATURE_118', 'FEATURE_149',
                 'FEATURE_150', 'FEATURE_155', 'FEATURE_163',
                 'FEATURE_164', 'FEATURE_168', 'FEATURE_182',
                 'FEATURE_184', 'FEATURE_203', 'FEATURE_205',
                 'FEATURE_206', 'FEATURE_227', 'FEATURE_241',
                 'FEATURE_37', 'FEATURE_49', 'FEATURE_55',
                 'FEATURE_61', 'FEATURE_67', 'FEATURE_68',
                 'FEATURE_71', 'FEATURE_72', 'FEATURE_73',
                 'FEATURE_78', 'FEATURE_79', 'FEATURE_81',
                 'FEATURE_84'] # strongly correlated with other features, to be dropped

# All numeric columns. col_default_num and col_log_std_num below are subsets that get
# their own treatment; everything else ends up in col_025_num.
col_num =  ['FEATURE_7', 'FEATURE_8', 'FEATURE_12',
            'FEATURE_21', 'FEATURE_23', 'FEATURE_24',
            'FEATURE_26', 'FEATURE_33', 'FEATURE_34',
            'FEATURE_35', 'FEATURE_36', 'FEATURE_38',
            'FEATURE_45', 'FEATURE_46', 'FEATURE_47',
            'FEATURE_48', 'FEATURE_50', 'FEATURE_51',
            'FEATURE_52', 'FEATURE_53', 'FEATURE_54',
            'FEATURE_56', 'FEATURE_57', 'FEATURE_58',
            'FEATURE_59', 'FEATURE_60', 'FEATURE_62',
            'FEATURE_63', 'FEATURE_64', 'FEATURE_65',
            'FEATURE_66', 'FEATURE_77', 'FEATURE_80',
            'FEATURE_82', 'FEATURE_83', 'FEATURE_85',
            'FEATURE_86', 'FEATURE_87', 'FEATURE_88',
            'FEATURE_89', 'FEATURE_90', 'FEATURE_91',
            'FEATURE_92', 'FEATURE_93', 'FEATURE_94',
            'FEATURE_95', 'FEATURE_96', 'FEATURE_97',
            'FEATURE_98', 'FEATURE_99', 'FEATURE_100',
            'FEATURE_101', 'FEATURE_102', 'FEATURE_103',
            'FEATURE_104', 'FEATURE_105', 'FEATURE_106',
            'FEATURE_107', 'FEATURE_108', 'FEATURE_109',
            'FEATURE_112', 'FEATURE_113', 'FEATURE_127',
            'FEATURE_128', 'FEATURE_129', 'FEATURE_130',
            'FEATURE_135', 'FEATURE_136', 'FEATURE_137',
            'FEATURE_138', 'FEATURE_143', 'FEATURE_147',
            'FEATURE_160', 'FEATURE_161', 'FEATURE_162',
            'FEATURE_177', 'FEATURE_181', 'FEATURE_183',
            'FEATURE_186', 'FEATURE_188', 'FEATURE_195',
            'FEATURE_217', 'FEATURE_223', 'FEATURE_225',
            'FEATURE_228', 'FEATURE_230', 'FEATURE_232',
            'FEATURE_233', 'FEATURE_235', 'FEATURE_236',
            'FEATURE_237', 'FEATURE_240', 'FEATURE_244',
            'FEATURE_252', 'FEATURE_253', 'FEATURE_0', 
            'FEATURE_13', 'FEATURE_22', 'FEATURE_25', 
            'FEATURE_39', 'FEATURE_40', 'FEATURE_41', 
            'FEATURE_42', 'FEATURE_43', 'FEATURE_44', 
            'FEATURE_76', 'FEATURE_110', 'FEATURE_111', 
            'FEATURE_114', 'FEATURE_115', 'FEATURE_116', 
            'FEATURE_119', 'FEATURE_120', 'FEATURE_121', 
            'FEATURE_122', 'FEATURE_124', 'FEATURE_125', 
            'FEATURE_126', 'FEATURE_132', 'FEATURE_133', 
            'FEATURE_134', 'FEATURE_139', 'FEATURE_142', 
            'FEATURE_145', 'FEATURE_148', 'FEATURE_153', 
            'FEATURE_158', 'FEATURE_166', 'FEATURE_169', 
            'FEATURE_171', 'FEATURE_173', 'FEATURE_174', 
            'FEATURE_185', 'FEATURE_196', 'FEATURE_197', 
            'FEATURE_215', 'FEATURE_216', 'FEATURE_221', 
            'FEATURE_222', 'FEATURE_226', 'FEATURE_231', 
            'FEATURE_234', 'FEATURE_238', 'FEATURE_239', 
            'FEATURE_242', 'FEATURE_243', 'FEATURE_245', 
            'FEATURE_246', 'FEATURE_247', 'FEATURE_248', 
            'FEATURE_250', 'FEATURE_251', 'FEATURE_141',
            'FEATURE_199', 'FEATURE_20', 'FEATURE_224'] # numeric variables

# Categorical columns; the preprocessing pipeline one-hot encodes them and compresses
# the result with TruncatedSVD.
col_cat = ['FEATURE_1', 'FEATURE_9', 'FEATURE_10',
           'FEATURE_14', 'FEATURE_27','FEATURE_229',
           'FEATURE_28', 'FEATURE_29', 'FEATURE_30',
           'FEATURE_31', 'FEATURE_32', 'FEATURE_69',
           'FEATURE_70', 'FEATURE_74', 'FEATURE_75',
           'FEATURE_123', 'FEATURE_131', 'FEATURE_259',
           'FEATURE_146', 'FEATURE_151', 'FEATURE_152',
           'FEATURE_154', 'FEATURE_156', 'FEATURE_157',
           'FEATURE_165', 'FEATURE_167', 'FEATURE_170',
           'FEATURE_172', 'FEATURE_175', 'FEATURE_176',
           'FEATURE_178', 'FEATURE_179', 'FEATURE_180',
           'FEATURE_198', 'FEATURE_200', 'FEATURE_258',
           'FEATURE_201', 'FEATURE_202', 'FEATURE_204',
           'FEATURE_207', 'FEATURE_208', 'FEATURE_209',
           'FEATURE_210', 'FEATURE_211', 'FEATURE_212',
           'FEATURE_213', 'FEATURE_214', 'FEATURE_218',
           'FEATURE_219', 'FEATURE_220', 'FEATURE_257'] # categorical variables


# Columns replaced by a 0/1 "value is present" indicator in the preprocessing pipeline.
col_new_bin = ['FEATURE_189', 'FEATURE_194', 'FEATURE_190', 
               'FEATURE_191', 'FEATURE_192', 'FEATURE_193', 
               'FEATURE_187'] # new binary features, where there are very many NaNs

# Numeric columns that are only median-imputed, with no value transformation.
col_default_num = ['FEATURE_108', 'FEATURE_98', 'FEATURE_82',
                   'FEATURE_25', 'FEATURE_42', 'FEATURE_39',
                   'FEATURE_43', 'FEATURE_40', 'FEATURE_44',
                   'FEATURE_41', 'FEATURE_119', 'FEATURE_251',
                   'FEATURE_120', 'FEATURE_139', 'FEATURE_142',
                   'FEATURE_145', 'FEATURE_148', 'FEATURE_141',
                   'FEATURE_153', 'FEATURE_158', 'FEATURE_171',
                   'FEATURE_185', 'FEATURE_197', 'FEATURE_231',
                   'FEATURE_234', 'FEATURE_242', 'FEATURE_250',
                   'FEATURE_243', 'FEATURE_245', 'FEATURE_246',
                   'FEATURE_247', 'FEATURE_248', 'FEATURE_20',
                   'FEATURE_199', 'FEATURE_224'] # numeric features that need no transformation

# Numeric columns transformed with log(x + std(x)) before median imputation.
col_log_std_num = ['FEATURE_135', 'FEATURE_127', 'FEATURE_107',
                   'FEATURE_97', 'FEATURE_87', 'FEATURE_77',
                   'FEATURE_22'] # numeric features that need a log transform

# Numeric columns that additionally get a "value >= column median" indicator
# (emitted with an '_extra' suffix by the numeric feature-engineering pipeline).
col_extra_num = ['FEATURE_87', 'FEATURE_77', 'FEATURE_244',
                 'FEATURE_233', 'FEATURE_232', 'FEATURE_217',
                 'FEATURE_177', 'FEATURE_89', 'FEATURE_88', 
                 'FEATURE_56', 'FEATURE_54', 'FEATURE_53', 
                 'FEATURE_52', 'FEATURE_51', 'FEATURE_38', 
                 'FEATURE_36', 'FEATURE_35', 'FEATURE_34', 
                 'FEATURE_33','FEATURE_12', 'FEATURE_8', 
                 'FEATURE_7', 'FEATURE_0', 'FEATURE_13',
                 'FEATURE_76', 'FEATURE_110', 'FEATURE_111',
                 'FEATURE_125', 'FEATURE_133', 'FEATURE_196',
                 'FEATURE_226'] # used to create additional features, based on their distributions

# Columns that the preprocessing pipeline selects and passes through unchanged.
# presumably already binary-valued, going by the list name — confirm against the data
col_bin = ['FEATURE_2', 'FEATURE_4', 'FEATURE_5',
           'FEATURE_6', 'FEATURE_11', 'FEATURE_15',
           'FEATURE_16', 'FEATURE_17', 'FEATURE_18',
           'FEATURE_19', 'FEATURE_140', 'FEATURE_159',
           'FEATURE_254', 'FEATURE_255']

# Every numeric column that is in neither col_default_num nor col_log_std_num,
# kept in col_num order.
col_025_num = [col for col in col_num if col not in col_default_num+col_log_std_num] 
# numeric features that are transformed with a fourth root (x ** 0.25)

# Pipeline: feature preprocessing

In [6]:
# Number of TruncatedSVD components kept from the one-hot encoded categorical block.
n_components = 35 # SVD

# Cluster counts for the three KMeans branches of the clustering pipeline.
n_clusters_1 = 3 # Clustering
n_clusters_2 = 5
n_clusters_3 = 10

# k passed to SelectKBest; 'all' keeps every feature.
n_features = 'all' # SelectKBest

# Sorted unique values seen in ANY categorical column of train or test, with NaN
# replaced by the sentinel 9999. np.union1d flattens its inputs, so this is a single
# level set shared by every categorical column, not one set per column.
levels = np.union1d(train_data[col_cat].fillna(9999), test_data[col_cat].fillna(9999))

In [7]:
class SelectColumnsTransfomer(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the listed columns of a DataFrame.

    Parameters
    ----------
    columns : list of column labels or None, default None
        Labels to select. ``None`` is treated as an empty selection, which
        matches the behaviour of the former ``[]`` default.
    """

    def __init__(self, columns=None):
        # A mutable default list would be one object shared by every instance
        # created without arguments, so None is used as the sentinel instead.
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; returns self so the transformer can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to ``self.columns``."""
        selected = [] if self.columns is None else self.columns
        return X[selected].copy()
    

class DataFrameFunctionTransformer(BaseEstimator, TransformerMixin):
    """Apply a column-wise function to a DataFrame, or use it to learn fill values.

    Parameters
    ----------
    func : callable
        Called once per column through ``DataFrame.apply``.
    impute : bool, default False
        * ``False`` - ``transform`` returns ``X.apply(func)``. The function is
          evaluated on whatever frame is passed to ``transform``, so any
          statistic computed inside ``func`` (for example ``x.std()``) is
          recomputed on that frame rather than remembered from ``fit``.
        * ``True`` - ``fit`` stores ``X.apply(func)`` (one value per column)
          in ``self.series`` and ``transform`` fills missing values with it.
    """

    def __init__(self, func, impute = False):
        self.func = func
        self.impute = impute
        # Explicit dtype: a bare pd.Series() emits a deprecation warning on
        # pandas 1.x and defaults to object dtype on pandas 2.x. float64 is
        # what older pandas produced implicitly.
        self.series = pd.Series(dtype='float64')

    def fit(self, X, y=None, **fitparams):
        """Learn per-column fill values when ``impute`` is set; otherwise a no-op."""
        if self.impute:
            self.series = pd.DataFrame(X).apply(self.func).copy()
        return self

    def transform(self, X, **transformparams):
        """Return a new frame, either NaN-filled (impute) or with ``func`` applied."""
        frame = pd.DataFrame(X)
        if self.impute:
            # fillna with a Series fills each column using the value under its label.
            return frame.fillna(self.series).copy()
        return frame.apply(self.func).copy()
    
    
class DataFrameFeatureUnion(BaseEstimator, TransformerMixin):
    """Fit several transformers on the same input and concatenate their outputs column-wise.

    Parameters
    ----------
    list_of_transformers : list of estimators
        Each one is cloned and fitted independently in ``fit``; the originals
        are left unfitted. Every transformer must return a DataFrame.

    Notes
    -----
    ``pd.concat(..., axis=1)`` aligns rows on the index. Branches that rebuild
    their frame from a NumPy array come back with a default integer index, so
    the input is expected to carry a default integer index as well.
    """

    def __init__(self, list_of_transformers):
        self.list_of_transformers = list_of_transformers

    def fit(self, X, y=None, **fitparams):
        """Clone and fit every transformer on ``X``, forwarding ``y`` and fit params."""
        # y was previously hard-coded to None here, which silently hid the
        # target from any sub-transformer that needs it.
        self.fitted_transformers_ = [
            clone(transformer).fit(X, y, **fitparams)
            for transformer in self.list_of_transformers
        ]
        return self

    def transform(self, X, **transformparamn):
        """Transform ``X`` with each fitted clone and join the results side by side."""
        pieces = [transformer.transform(X)
                  for transformer in self.fitted_transformers_]
        return pd.concat(pieces, axis=1).copy()
    

class DummiesTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode a frame with ``pd.get_dummies``; ``fit`` learns nothing."""

    def fit(self, X, y=None, **fitparams):
        # Stateless: the encoding is derived from the frame given to transform().
        return self

    def transform(self, X, **transformparams):
        encoded = pd.get_dummies(X)
        return encoded.copy()


class DataFrameSVDTransformer(BaseEstimator, TransformerMixin):
    """Wrap a 2-D array in a DataFrame whose columns are named COMPONENT_0, COMPONENT_1, ..."""

    def fit(self, X, y=None, **fitparams):
        # Stateless: column names depend only on the width of the array given to transform().
        return self

    def transform(self, X, **transformparams):
        n_columns = X.shape[1]
        names = ['COMPONENT_' + str(position) for position in range(n_columns)]
        # No index is passed, so the frame gets a default integer index.
        return pd.DataFrame(data=X, columns=names)
    
class DataFrameClustTransformer(BaseEstimator, TransformerMixin):
    """Wrap clustering output in a DataFrame with descriptive column names.

    With more than one column the names are ``CLUSTER_<width>_Dist_<i>``;
    otherwise the single column is named ``CLUSTER_<number of distinct values>``.
    """

    def fit(self, X, y=None, **fitparams):
        # Stateless: names are derived from the array given to transform().
        return self

    def transform(self, X, **transformparams):
        width = X.shape[1]
        if width > 1:
            prefix = 'CLUSTER_' + str(width) + '_Dist_'
            names = [prefix + str(position) for position in range(width)]
        else:
            names = ['CLUSTER_' + str(len(np.unique(X)))]

        return pd.DataFrame(data=X, columns=names)
    
    
class ColumnsNameTransformer(BaseEstimator, TransformerMixin):
    """Return a copy of the frame with '_extra' appended to every column name."""

    def fit(self, X, y=None, **fitparams):
        # Stateless.
        return self

    def transform(self, X, **transformparams):
        suffixed_names = []
        for name in X.columns:
            suffixed_names.append(name + '_extra')

        renamed = X.copy()
        renamed.columns = suffixed_names
        return renamed
    
    
class DropAllZeroTrainColumnsTransformer(BaseEstimator, TransformerMixin):
    """Drop the columns that were entirely zero in the frame seen by ``fit``."""

    def fit(self, X, y=None, **fitparams):
        # Boolean per-column mask: True where every value equals 0.
        all_zero_mask = (X == 0).all()
        self.cols_ = X.columns[all_zero_mask]
        return self

    def transform(self, X, **transformparams):
        remaining = X.drop(self.cols_, axis=1)
        return remaining.copy()

In [8]:
# Branches of the preprocessing union. Each branch selects one column group and
# returns a DataFrame; the outputs are later concatenated column-wise.
preprocessing_pipeline = [ 
    
                            # binary columns: selected and passed through unchanged
                            make_pipeline(  
                                SelectColumnsTransfomer(col_bin)),

                            # sparse columns: replaced by a 0/1 indicator (0 = NaN, 1 = value present)
                            make_pipeline(  
                                SelectColumnsTransfomer(col_new_bin),
                                DataFrameFunctionTransformer(func = lambda x: np.where(x.isnull(), 0, 1))), 

                            # numeric, untransformed: NaN filled with the median learned in fit
                            make_pipeline(  
                                SelectColumnsTransfomer(col_default_num),
                                DataFrameFunctionTransformer(lambda x: x.median(), impute=True)), 

                            # numeric, log(x + std): then NaN filled with the median learned in fit
                            # NOTE(review): the log step runs at transform time, so x.std() is recomputed
                            # on the frame being transformed (test rows use the test std, not the train std).
                            make_pipeline(  
                                SelectColumnsTransfomer(col_log_std_num),
                                DataFrameFunctionTransformer(func = lambda x: np.log(x + x.std())),
                                DataFrameFunctionTransformer(lambda x: x.median(), impute=True)),

                            # numeric, fourth root: then NaN filled with the median learned in fit
                            make_pipeline(  
                                SelectColumnsTransfomer(col_025_num),
                                DataFrameFunctionTransformer(func = lambda x: x**0.25),
                                DataFrameFunctionTransformer(lambda x: x.median(), impute=True)),

                            # categorical: fill NaN with the 9999 sentinel, cast to int, cast to a
                            # categorical with the shared `levels`, one-hot encode, drop dummy columns
                            # that are all zero in the fitted data, reduce with TruncatedSVD and name
                            # the result COMPONENT_<i>.
                            # NOTE(review): astype('category', categories=...) is the older pandas call
                            # form; it may not be accepted by newer pandas releases - confirm before upgrading.
                            make_pipeline(  
                                SelectColumnsTransfomer(col_cat),
                                DataFrameFunctionTransformer(lambda x: 9999, impute=True),
                                DataFrameFunctionTransformer(lambda x: x.astype(int)), 
                                DataFrameFunctionTransformer(lambda x: x.astype('category', categories=levels)),
                                DummiesTransformer(),
                                DropAllZeroTrainColumnsTransformer(),
                                TruncatedSVD(n_components=n_components, random_state=RND_STATE),
                                DataFrameSVDTransformer())
                             
                          ]

In [9]:
# Branches of the numeric feature-engineering union; the input is the preprocessing output.
feature_engineering_numeric_pipeline = [ 
    
                                    # pass through all preprocessed columns (SVD components, numeric, indicator, binary)
                                    make_pipeline(
                                        SelectColumnsTransfomer(['COMPONENT_'+str(i) for i in range(n_components)] +
                                                                 col_num + col_new_bin + col_bin)),
                                    # extra indicators: 1 where the value is >= the column median, else 0;
                                    # emitted under '<column>_extra'.
                                    # NOTE(review): the median is computed at transform time on the frame being
                                    # transformed, so test rows are thresholded on the test median.
                                    make_pipeline(  
                                        SelectColumnsTransfomer(col_extra_num),
                                        DataFrameFunctionTransformer(func = lambda x: np.where(x >= x.median(), 1, 0)),
                                        ColumnsNameTransformer())
    
                                       ]

In [10]:
# Columns used by every branch of the clustering union: SVD components, the
# '_extra' median indicators, and the numeric / indicator / binary columns.
clustering_input_columns = (['COMPONENT_'+str(i) for i in range(n_components)] +
                            [col + '_extra' for col in col_extra_num] +
                            col_num + col_new_bin + col_bin)


def _kmeans_branch(n_clusters):
    """Build one clustering branch for the given number of clusters.

    The branch selects the clustering input columns, runs KMeans as an
    intermediate pipeline step and wraps the result in a named DataFrame.
    """
    return make_pipeline(
        # Each branch gets its own copy of the column list.
        SelectColumnsTransfomer(list(clustering_input_columns)),
        KMeans(random_state=RND_STATE, n_clusters=n_clusters,
               precompute_distances = True, n_jobs=-1),
        DataFrameClustTransformer())


# First branch passes all input columns through; the remaining branches add
# KMeans-derived columns for 3, 5 and 10 clusters. This replaces three
# copy-pasted pipeline definitions and builds the same list of pipelines.
feature_engineering_clustering_pipeline = (
    [make_pipeline(SelectColumnsTransfomer(list(clustering_input_columns)))] +
    [_kmeans_branch(n_clusters)
     for n_clusters in (n_clusters_1, n_clusters_2, n_clusters_3)]
)

In [11]:
# Wrap each list of branches in a union that concatenates the branch outputs column-wise.
preprocessing = DataFrameFeatureUnion(preprocessing_pipeline)
feature_engineering_numeric = DataFrameFeatureUnion(feature_engineering_numeric_pipeline)
# The clustering union is currently disabled:
#feature_engineering_clustering = DataFrameFeatureUnion(feature_engineering_clustering_pipeline)

In [12]:
# Full feature pipeline: preprocessing -> numeric feature engineering -> SelectKBest.
# The clustering stage is commented out, so no cluster features are produced.
data_transformation = make_pipeline(preprocessing, 
                                    feature_engineering_numeric,
                                    #feature_engineering_clustering,
                                    SelectKBest(f_classif, k=n_features))

In [13]:
# Labels come from the TARGET column of the training table.
target = train_data.TARGET
# Fit the feature pipeline on the training rows, then reuse the fitted pipeline on test.
# Leading columns are skipped by position (2 for train, 1 for test).
# NOTE(review): presumably these are ID + TARGET in train and ID in test - confirm against the CSVs.
train_data_transformed = data_transformation.fit_transform(train_data.iloc[:, 2:], target)
test_data_transformed = data_transformation.transform(test_data.iloc[:, 1:])

In [14]:
# Per-class training weight; class 2 rows count the most, class 0 rows the least.
CLASS_WEIGHTS = {0: 0.06, 1: 0.23, 2: 0.71}

# One weight per training row, in the same order as `target`.
# A dict lookup raises KeyError on an unexpected label. The previous if/elif chain
# skipped such rows silently, leaving the list shorter than `target` and every
# later weight attached to the wrong row.
sample_weights = [CLASS_WEIGHTS[label] for label in target]

In [15]:
# Gradient-boosted tree classifier. n_estimators is an upper bound on the number of
# boosting rounds: the fit call below also passes early_stopping_rounds.
# Row and column subsampling (subsample / colsample_bytree) are enabled.
xgb_model = xgb.XGBClassifier(silent=1, 
                              colsample_bytree=0.8,
                              gamma=0.3, 
                              max_depth=8, 
                              min_child_weight=2.0,
                              n_estimators=5300, 
                              subsample=0.7, 
                              learning_rate=0.066,
                              n_jobs=-1)

In [16]:
# Evaluation set handed to XGBoost for metric tracking and early stopping.
# NOTE(review): this is the training data itself, so the monitored 'merror' / 'mlogloss'
# are training metrics and early stopping on them does not reflect generalisation;
# consider a held-out validation split.
eval_set = [(train_data_transformed, target)]

# Train with the per-row class weights; stop if the monitored metric has not improved
# for 40 consecutive rounds.
xgb_model.fit(train_data_transformed, target, 
              sample_weight=sample_weights,
              eval_metric=['merror', 'mlogloss'], 
              eval_set=eval_set, 
              verbose=False, 
              early_stopping_rounds=40)

In [17]:
# Hard class predictions for the transformed test rows.
y_pred = xgb_model.predict(test_data_transformed)

In [18]:
# Submission frame: test IDs paired with the predicted class (rows are in test_data order).
contest_answer = test_data["ID"].to_frame()
contest_answer["TARGET"] = y_pred

In [19]:
# Write the class predictions to disk without the DataFrame index.
contest_answer.to_csv('contest_answer.csv', index=False)

In [20]:
# Per-class probabilities for the test rows; consumed by get_segments below.
y_pred_proba = xgb_model.predict_proba(test_data_transformed)

In [21]:
def get_segments(y_pred_proba, ids=None, n_class1=500, n_class2=200):
    """Pick the most confidently predicted rows of class 1 and class 2.

    Each row is assigned the class with the highest probability (the first one
    on ties). Within a class, rows are ranked by that probability in descending
    order; rows with equal probability keep their original relative order.

    Parameters
    ----------
    y_pred_proba : array-like of shape (n_rows, n_classes)
        Per-class probabilities, one row per entry of ``ids``.
    ids : pandas.Series or sequence, optional
        Row identifiers aligned with ``y_pred_proba``. Defaults to the
        notebook-level ``test_data.ID``.
    n_class1 : int, default 500
        Maximum number of class-1 rows to return.
    n_class2 : int, default 200
        Maximum number of class-2 rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` and ``TARGET``: the selected class-1 rows followed by
        the selected class-2 rows, indexed by their original positions in ``ids``.
    """
    if ids is None:
        # Backward-compatible default: use the notebook's global test table.
        ids = test_data.ID
    ids = pd.Series(ids)

    proba = np.asarray(y_pred_proba, dtype=float)
    if len(proba) == 0:
        # argmax/max over axis 1 are undefined for an empty input.
        best_class = np.empty(0, dtype=int)
        best_proba = np.empty(0, dtype=float)
    else:
        best_class = proba.argmax(axis=1)  # first maximum wins on ties
        best_proba = proba.max(axis=1)

    def _top_rows(label, limit):
        # Positions of rows predicted as `label`, most confident first.
        members = np.flatnonzero(best_class == label)
        # mergesort is stable, so equal probabilities keep their original order.
        ranking = np.argsort(-best_proba[members], kind='mergesort')
        chosen = ids.iloc[members[ranking][:limit]].to_frame(name="ID")
        chosen["TARGET"] = label
        return chosen

    return pd.concat([_top_rows(1, n_class1), _top_rows(2, n_class2)])

In [22]:
# Build the segment submission: top-ranked class-1 and class-2 test rows.
contest_segments = get_segments(y_pred_proba)

In [23]:
# Write the segment selection to disk without the DataFrame index.
contest_segments.to_csv('contest_segments.csv', index=False)