In [1]:
import os
import gc
import logging
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, accuracy_score, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import clone

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import cluster

In [2]:
%matplotlib inline

In [3]:
start_time = time.time()

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Every column whose dtype is one of int16/int32/int64/float16/float32/float64
    is re-cast to the narrowest dtype of the same kind whose representable
    range contains the column's minimum and maximum (bounds inclusive).
    Integer columns try int8 -> int16 -> int32 -> int64; float columns try
    float16 -> float32 and otherwise end up as float64. Columns of any other
    dtype are left untouched.

    The float downcast only checks the value *range*: narrowing to
    float16/float32 keeps the magnitude but drops precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / (1024 ** 2)
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; a column with no comparable
            # min/max (e.g. zero rows) matches nothing and keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback when neither narrower type covers the
            # range (this also catches NaN/inf extremes, whose comparisons fail).
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)
    end_mem = df.memory_usage().sum() / (1024 ** 2)
    if verbose:
        # Avoid dividing by zero if the frame reported no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [271]:
def set_classes(true_class_size, predictions_df, target_column_name):
    """Turn scores into 0/1 labels by rank.

    The frame is sorted ascending by ``target_column_name``; the first
    ``true_class_size`` rows of that ordering are overwritten with 0 and all
    remaining rows with 1. The assignment covers every column of the frame,
    not only the target column. The result is returned in index order and the
    input frame itself is not modified.

    NOTE(review): the parameter name suggests the size of the positive class,
    yet the rows with the smallest scores are the ones set to 0 - confirm the
    intended meaning against the caller.
    """
    ranked = predictions_df.sort_values(by=[target_column_name])
    lowest_rows = slice(None, true_class_size)
    remaining_rows = slice(true_class_size, None)
    ranked[lowest_rows] = 0
    ranked[remaining_rows] = 1
    return ranked.sort_index()

In [5]:
train_df = pd.read_csv('../input/train.csv')

In [6]:
test_df = pd.read_csv('../input/test.csv')

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB


In [8]:
train_df.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679914,-1.627622,10.715192,6.796529,11.078333,-5.065317,5.408949,16.54585,0.284162,...,3.23444,7.438408,1.927839,3.331774,17.993784,-0.142088,2.303335,8.908158,15.87072,-3.326537
std,0.300653,3.040051,4.050044,2.640894,2.043319,1.62315,7.863267,0.866607,3.418076,3.332634,...,4.559922,3.023272,1.478423,3.99203,3.135162,1.429372,5.454369,0.921625,3.010945,10.438015
min,0.0,0.4084,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,0.0,8.45385,-4.740025,8.722475,5.254075,9.883175,-11.20035,4.7677,13.9438,-2.3178,...,-0.058825,5.1574,0.889775,0.5846,15.6298,-1.1707,-1.946925,8.2528,13.8297,-11.208475
50%,0.0,10.52475,-1.60805,10.58,6.825,11.10825,-4.83315,5.3851,16.4568,0.3937,...,3.2036,7.34775,1.9013,3.39635,17.95795,-0.1727,2.4089,8.8882,15.93405,-2.81955
75%,0.0,12.7582,1.358625,12.5167,8.3241,12.261125,0.9248,6.003,19.1029,2.9379,...,6.4062,9.512525,2.9495,6.2058,20.396525,0.8296,6.556725,9.5933,18.064725,4.8368
max,1.0,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.4477,27.6918,10.1513,...,18.4409,16.7165,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


In [9]:
train_df.shape

(200000, 202)

In [10]:
train_df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [11]:
train_df_id_droped = train_df[train_df.columns.drop('ID_code')]

In [12]:
train_df['var_0'].unique().shape

(94672,)

In [13]:
#train_df.apply(pd.unique, axis=0)

In [14]:
uniques_dict = {column_name: train_df_id_droped[column_name].unique() for column_name in train_df_id_droped.columns.drop('target').tolist()}

In [15]:
uniques_dict_counts = {column_name: uniques.shape[0] for column_name, uniques in uniques_dict.items()}

In [16]:
uniques_counts_series = pd.Series(uniques_dict_counts)

In [17]:
uniques_counts_series[:5]

var_0     94672
var_1    108932
var_2     86555
var_3     74597
var_4     63515
dtype: int64

In [18]:
uniques_counts_series.unique().shape

(200,)

In [19]:
uniques_counts_series.max()

169968

In [20]:
uniques_counts_series.min()

451

In [21]:
'''
fig = plt.figure(figsize=(24, 18))
ax = fig.add_subplot(111)
ax.bar(np.arange(200), uniques_counts_series.values.astype(np.int64))
#ax.bar(uniques_counts_series)
ax.set_title('Features uniques values num')
plt.show()
'''

"\nfig = plt.figure(figsize=(24, 18))\nax = fig.add_subplot(111)\nax.bar(np.arange(200), uniques_counts_series.values.astype(np.int64))\n#ax.bar(uniques_counts_series)\nax.set_title('Features uniques values num')\nplt.show()\n"

In [22]:
all_cells_execution_time = time.time() - start_time
print("all cells execution time: {} min".format(all_cells_execution_time / 60))

all cells execution time: 0.33239564498265584 min


In [23]:
train_df_id_droped.shape

(200000, 201)

In [24]:
'''
plt.figure(figsize=(24, 18))
plt.title("Distributon of unqie values per column in the train dataset")
#sns.distplot(train_df_id_droped[train_df_id_droped.columns.drop('target').tolist()].unique(), color='green', kde=True, bins=200, label="train")
sns.distplot(uniques_counts_series.values.astype(np.int64), color='green', kde=True, bins=200, label="train")
plt.legend()
plt.show()
'''

'\nplt.figure(figsize=(24, 18))\nplt.title("Distributon of unqie values per column in the train dataset")\n#sns.distplot(train_df_id_droped[train_df_id_droped.columns.drop(\'target\').tolist()].unique(), color=\'green\', kde=True, bins=200, label="train")\nsns.distplot(uniques_counts_series.values.astype(np.int64), color=\'green\', kde=True, bins=200, label="train")\nplt.legend()\nplt.show()\n'

In [25]:
'''
features = train_df.columns.values[2:202]
plt.figure(figsize=(24, 18))
plt.title("Distribution of mean values per column in the train and test set")
sns.distplot(train_df[features].mean(axis=0), color="magenta", kde=True,bins=120, label='train')
#sns.distplot(test_df[features].mean(axis=0),color="darkblue", kde=True,bins=120, label='test')
plt.legend()
plt.show()
'''

'\nfeatures = train_df.columns.values[2:202]\nplt.figure(figsize=(24, 18))\nplt.title("Distribution of mean values per column in the train and test set")\nsns.distplot(train_df[features].mean(axis=0), color="magenta", kde=True,bins=120, label=\'train\')\n#sns.distplot(test_df[features].mean(axis=0),color="darkblue", kde=True,bins=120, label=\'test\')\nplt.legend()\nplt.show()\n'

In [26]:
'''
plt.figure(figsize=(24, 18))
features = train_df.columns.values[2:202]
#plt.title("Distribution of mean values per row in the train and test set")
plt.title("Distribution of mean values per row in the train set")
sns.distplot(train_df[features].mean(axis=1), color="blue", kde=True, bins=120, label='train')
#sns.distplot(test_df[features].mean(axis=1),color="blue", kde=True,bins=120, label='test')
plt.legend()
plt.show()
'''

'\nplt.figure(figsize=(24, 18))\nfeatures = train_df.columns.values[2:202]\n#plt.title("Distribution of mean values per row in the train and test set")\nplt.title("Distribution of mean values per row in the train set")\nsns.distplot(train_df[features].mean(axis=1), color="blue", kde=True, bins=120, label=\'train\')\n#sns.distplot(test_df[features].mean(axis=1),color="blue", kde=True,bins=120, label=\'test\')\nplt.legend()\nplt.show()\n'

In [27]:
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
target = train_df['target']

In [28]:
# LightGBM parameter dictionary. It is handed to the train(...) helper as its
# `param` argument, which forwards it unchanged to lgb.train.
# NOTE(review): 'boost_from_average' is given as the string 'false' rather
# than the boolean False - presumably LightGBM parses it; confirm.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': 1
}

In [29]:
'''
folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=44000)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, early_stopping_rounds=3000)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df['Feature'] = features
    fold_importance_df['importance'] = clf.feature_importance()
    fold_importance_df['fold'] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
'''

'\nfolds = StratifiedKFold(n_splits=10, shuffle=False, random_state=44000)\noof = np.zeros(len(train_df))\npredictions = np.zeros(len(test_df))\nfeature_importance_df = pd.DataFrame()\nfor fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):\n    print("Fold {}".format(fold_))\n    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])\n    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])\n    num_round = 1000000\n    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, early_stopping_rounds=3000)\n    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)\n    fold_importance_df = pd.DataFrame()\n    fold_importance_df[\'Feature\'] = features\n    fold_importance_df[\'importance\'] = clf.feature_importance()\n    fold_importance_df[\'fold\'] = fold_ + 1\n    feature_importance_df = pd.concat([feature_import

In [30]:
'''
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:150].index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

plt.figure(figsize=(14,28))
sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')
'''

'\ncols = (feature_importance_df[["Feature", "importance"]]\n        .groupby("Feature")\n        .mean()\n        .sort_values(by="importance", ascending=False)[:150].index)\nbest_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]\n\nplt.figure(figsize=(14,28))\nsns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))\nplt.title(\'Features importance (averaged/folds)\')\nplt.tight_layout()\nplt.savefig(\'FI.png\')\n'

In [31]:
min_max_scaler = MinMaxScaler()

In [32]:
train_df_rows_count = train_df.shape[0]

In [33]:
train_df_rows_count

200000

In [34]:
uniques_count_more_1_2 = uniques_counts_series[uniques_counts_series > train_df_rows_count / 2]

In [35]:
uniques_count_more_1_2.shape

(110,)

In [36]:
# NOTE: despite the "more_1_4" in its name, this selection applies only the
# upper bound (fewer unique values than half the row count); the lower bound
# of a quarter of the row count is not checked here.
uniques_count_more_1_4_less_1_2 = uniques_counts_series[uniques_counts_series < train_df_rows_count / 2]

In [37]:
#uniques_count_less_1_2_more_1_4 = uniques_counts_series[
#    ((uniques_counts_series < train_df_rows_count / 2).bool() and (uniques_counts_series > train_df_rows_count / 4).bool()).bool()
#]

In [38]:
uniques_count_less_1_2_more_1_4 = uniques_count_more_1_4_less_1_2[uniques_count_more_1_4_less_1_2 > train_df_rows_count / 4]

In [39]:
uniques_count_more_1_4_less_1_2.shape

(90,)

In [40]:
uniques_count_less_1_4 = uniques_counts_series[uniques_counts_series < train_df_rows_count / 4]

In [41]:
uniques_count_less_1_4.shape

(39,)

In [42]:
uniques_count_less_1_4.head()

var_6     38599
var_9     49417
var_12     9561
var_15    19810
var_23    24913
dtype: int64

In [43]:
#uniques_count_less_1_4.index.tolist()

In [44]:
def train(train_df, test_df, target, features, param, num_round=1000000):
    """Fit LightGBM with 10-fold stratified CV and average the test predictions.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain every column listed in ``features``.
    test_df : pandas.DataFrame
        Rows to score; must contain every column listed in ``features``.
    target : pandas.Series
        Labels, positionally aligned with ``train_df`` (indexed via ``.iloc``).
    features : list of str
        Columns fed to the model.
    param : dict
        Parameters forwarded unchanged to ``lgb.train``.
    num_round : int, default 1000000
        Upper bound on boosting rounds per fold; training stops earlier once
        the validation metric has not improved for 3000 rounds.

    Returns
    -------
    tuple
        ``(oof, predictions, feature_importance_df, clf)``: ``oof`` holds the
        out-of-fold prediction for every training row, ``predictions`` is the
        mean of the per-fold test predictions, ``feature_importance_df``
        stacks the per-fold importances (columns ``Feature``, ``importance``,
        ``fold``) and ``clf`` is the booster trained on the last fold.
    """
    start_time = time.time()
    # shuffle=False already makes the split deterministic, so no random_state
    # is passed: recent scikit-learn releases raise a ValueError when
    # random_state is set without shuffling, older ones ignored it.
    folds = StratifiedKFold(n_splits=10, shuffle=False)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    fold_importances = []
    clf = None
    # Only the labels drive the stratified split; a zero placeholder of the
    # right length stands in for X instead of materialising train_df.values.
    split_placeholder = np.zeros(len(train_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(split_placeholder, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of the lgb.train API this notebook was written against;
        # newer LightGBM releases may expect callbacks instead - confirm
        # against the installed version.
        clf = lgb.train(
            param,
            trn_data,
            num_round,
            valid_sets=[trn_data, val_data],
            verbose_eval=1000,
            early_stopping_rounds=3000
        )
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
        fold_importances.append(pd.DataFrame({
            'Feature': features,
            'importance': clf.feature_importance(),
            'fold': fold_ + 1,
        }))
        # Each fold contributes an equal share to the averaged test prediction.
        predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    # Stack the per-fold frames once instead of re-concatenating every iteration.
    feature_importance_df = pd.concat(fold_importances, axis=0)
    print("Total run time {} min:".format((time.time() - start_time) / 60))
    print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
    return oof, predictions, feature_importance_df, clf

In [97]:
train_results_more_1_2 = train(train_df, test_df, target, uniques_count_more_1_2.index.tolist(), param)

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.836449	valid_1's auc: 0.819362
[2000]	training's auc: 0.845447	valid_1's auc: 0.823533
[3000]	training's auc: 0.852066	valid_1's auc: 0.82583
[4000]	training's auc: 0.858253	valid_1's auc: 0.826207
[5000]	training's auc: 0.864232	valid_1's auc: 0.826657
[6000]	training's auc: 0.869822	valid_1's auc: 0.826773
[7000]	training's auc: 0.875256	valid_1's auc: 0.826797
[8000]	training's auc: 0.880766	valid_1's auc: 0.826868
[9000]	training's auc: 0.886046	valid_1's auc: 0.826551
[10000]	training's auc: 0.89129	valid_1's auc: 0.826084
Early stopping, best iteration is:
[7789]	training's auc: 0.879598	valid_1's auc: 0.826957
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.837076	valid_1's auc: 0.81285
[2000]	training's auc: 0.845801	valid_1's auc: 0.817226
[3000]	training's auc: 0.852524	valid_1's auc: 0.819153
[4000]	training's auc: 0.858646	valid_1's

In [126]:
train_results_less_1_2_more_1_4 = train(train_df, test_df, target, uniques_count_less_1_2_more_1_4.index.tolist(), param)

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.772379	valid_1's auc: 0.741305
[2000]	training's auc: 0.778907	valid_1's auc: 0.742285
[3000]	training's auc: 0.78477	valid_1's auc: 0.742419
[4000]	training's auc: 0.790565	valid_1's auc: 0.742499
[5000]	training's auc: 0.796234	valid_1's auc: 0.742592
Early stopping, best iteration is:
[2500]	training's auc: 0.782039	valid_1's auc: 0.742856
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.771593	valid_1's auc: 0.748627
[2000]	training's auc: 0.778487	valid_1's auc: 0.749908
[3000]	training's auc: 0.784376	valid_1's auc: 0.749326
[4000]	training's auc: 0.790141	valid_1's auc: 0.74907
[5000]	training's auc: 0.795899	valid_1's auc: 0.749057
Early stopping, best iteration is:
[2227]	training's auc: 0.779966	valid_1's auc: 0.750254
Fold 2
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.772997	valid_1's auc: 

In [161]:
train_results_less_1_4 = train(train_df, test_df, target, uniques_count_less_1_4.index.tolist(), param)

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.744883	valid_1's auc: 0.710591
[2000]	training's auc: 0.750561	valid_1's auc: 0.712807
[3000]	training's auc: 0.753888	valid_1's auc: 0.713001
[4000]	training's auc: 0.75647	valid_1's auc: 0.712625
Early stopping, best iteration is:
[1837]	training's auc: 0.750068	valid_1's auc: 0.713563
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.744471	valid_1's auc: 0.712185
[2000]	training's auc: 0.749998	valid_1's auc: 0.715383
[3000]	training's auc: 0.75323	valid_1's auc: 0.716157
[4000]	training's auc: 0.755847	valid_1's auc: 0.715947
[5000]	training's auc: 0.758416	valid_1's auc: 0.715308
[6000]	training's auc: 0.760556	valid_1's auc: 0.714514
Early stopping, best iteration is:
[3082]	training's auc: 0.753392	valid_1's auc: 0.71638
Fold 2
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.744833	valid_1's auc: 0

In [48]:
#oof_more_1_2, predictions_more_1_2, importance_df_more_1_2  = train_results_more_1_2

In [49]:
#predictions_more_1_2.shape

In [50]:
#predictions_more_1_2[:10]

In [51]:
#predictions_more_1_2.max()

In [52]:
#predictions_more_1_2.min()

In [53]:
#df = pd.DataFrame({'var0': [1, 2], 'var1': [3, 4], 'var2': [5, 6], 'var3': [7, 8]})

In [54]:
#df

In [55]:
polinomial_features_maker = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [56]:
#new_features = polinomial_features_maker.fit_transform(df)

In [57]:
#new_features

In [58]:
#new_features.shape

In [59]:
#polinomial_features_maker.get_feature_names(['var0', 'var1', 'var2', 'var3'])

In [60]:
def normalise_feature_name(feature_name):
    """Return ``feature_name`` with '^' and whitespace separators replaced by '_'.

    Names such as ``'var_0^2'`` become ``'var_0_2'`` and ``'var_0 var_1'``
    becomes ``'var_0_var_1'``. Both rewrites are applied, so a name that mixes
    the two (``'var_0^2 var_1'``) is fully normalised to ``'var_0_2_var_1'``.
    Names containing neither character are returned unchanged.

    Parameters
    ----------
    feature_name : str
        Feature name to normalise.

    Returns
    -------
    str
        The normalised name.
    """
    normalised = feature_name
    if ' ' in normalised:
        # split() with no argument collapses runs of whitespace and trims the ends.
        normalised = '_'.join(normalised.split())
    return normalised.replace('^', '_')

In [61]:
#normalised_feature_names = [normalise_feature_name(feature_name) for feature_name in polinomial_features_maker.get_feature_names(['var0', 'var1', 'var2', 'var3'])]

In [62]:
#normalised_feature_names

In [63]:
target_values = train_df['target'].values

In [64]:
'''
train_values, holdout_test_values, train_target_values, holdout_test_target_values = train_test_split(
    #scaled_train_values,
    train_df[train_df.columns.drop(['ID_code', 'target'])].values,
    target_values,
    test_size=0.2,
    random_state=0
)
'''

"\ntrain_values, holdout_test_values, train_target_values, holdout_test_target_values = train_test_split(\n    #scaled_train_values,\n    train_df[train_df.columns.drop(['ID_code', 'target'])].values,\n    target_values,\n    test_size=0.2,\n    random_state=0\n)\n"

In [65]:
#feature_names = train_df.columns.drop(['ID_code', 'target']).tolist()

In [66]:
#polinomial_train_values = polinomial_features_maker.fit_transform(train_values)

In [67]:
#polinomial_holdout_test_values = polinomial_features_maker.fit_transform(holdout_test_values)

In [68]:
#polinomial_features_names = [normalise_feature_name(feature_name) for feature_name in polinomial_features_maker.get_feature_names(feature_names)]

In [69]:
train_df_uniques_count_more_1_2 = train_df[uniques_count_more_1_2.index].astype(np.float32)

In [70]:
# NOTE(review): uniques_count_more_1_4_less_1_2 is built with only the
# `< train_df_rows_count / 2` filter, so this frame also contains the columns
# selected into uniques_count_less_1_4. TODO confirm whether
# uniques_count_less_1_2_more_1_4 was the intended selection.
train_df_uniques_count_more_1_4_less_1_2 = train_df[uniques_count_more_1_4_less_1_2.index].astype(np.float32)

In [71]:
train_df_uniques_count_less_1_4 = train_df[uniques_count_less_1_4.index].astype(np.float32)

In [72]:
'target' in train_df_uniques_count_less_1_4.columns.tolist()

False

In [73]:
polinomial_values_uniques_count_more_1_2 = polinomial_features_maker.fit_transform(train_df_uniques_count_more_1_2).astype(np.float32)

In [74]:
#del polinomial_values_uniques_count_more_1_2

In [75]:
#gc.collect()

In [76]:
polinomial_feature_names_uniques_count_more_1_2 = [
    normalise_feature_name(feature_name) for feature_name in polinomial_features_maker.get_feature_names(train_df_uniques_count_more_1_2.columns.tolist())
]

In [77]:
len(polinomial_feature_names_uniques_count_more_1_2)

6215

In [78]:
train_polinomial_values_ucm_1_2, holdout_test_polinomial_values_ucm_1_2, train_target_values_ucm_1_2, holdout_test_target_values_ucm_1_2 = train_test_split(
    #scaled_train_values,
    polinomial_values_uniques_count_more_1_2,
    target_values,
    test_size=0.2,
    random_state=0
)

In [79]:
'''
train_polinomial_df_ucm_1_2 = reduce_mem_usage(pd.DataFrame(
    data=train_polinomial_values_ucm_1_2,
    columns=polinomial_feature_names_uniques_count_more_1_2
))
'''
train_polinomial_df_ucm_1_2 = pd.DataFrame(
    data=train_polinomial_values_ucm_1_2,
    columns=polinomial_feature_names_uniques_count_more_1_2,
    dtype=np.float32
)

In [80]:
del train_polinomial_values_ucm_1_2
gc.collect()

18

In [81]:
train_target_df_ucm_1_2 = pd.DataFrame(data=train_target_values_ucm_1_2, columns=['target'], dtype=np.float32)

In [82]:
#del train_target_values_ucm_1_2
#gc.collect()

In [83]:
train_target_df_ucm_1_2.shape

(160000, 1)

In [84]:
train_target_df_ucm_1_2.values?

In [85]:
train_target_series_ucm_1_2 = pd.Series(train_target_values_ucm_1_2, dtype=np.float32)

In [86]:
del train_target_values_ucm_1_2
gc.collect()

0

In [87]:
train_target_series_ucm_1_2.shape

(160000,)

In [88]:
train_target_series_ucm_1_2.head()

0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
dtype: float32

In [89]:
train_polinomial_df_ucm_1_2.shape

(160000, 6215)

In [90]:
train_polinomial_df_ucm_1_2.head()

Unnamed: 0,var_1,var_5,var_7,var_10,var_11,var_13,var_17,var_18,var_19,var_20,...,var_190_2,var_190_var_193,var_190_var_196,var_190_var_199,var_193_2,var_193_var_196,var_193_var_199,var_196_2,var_196_var_199,var_199_2
0,-1.4003,-15.148,23.001101,-12.8277,-11.9705,0.9585,-13.7352,8.9064,0.698,2.9975,...,97.253128,103.938377,63.965946,-21.822956,111.083176,68.363007,-23.32308,42.072086,-14.353533,4.896926
1,-2.3055,-3.9362,20.1087,-2.1613,2.0213,12.1363,-11.3936,2.4146,12.2082,18.9734,...,0.18054,-1.965333,-0.716424,7.05249,21.394325,7.798887,-76.772385,2.842933,-27.985888,275.493591
2,2.0901,-19.5462,16.896999,-9.586,-2.1832,8.4016,-3.2487,11.4263,13.9954,15.6798,...,1.587348,8.172215,-1.094097,13.587013,42.073387,-5.63279,69.950638,0.754119,-9.364999,116.298965
3,1.8913,-14.9898,21.2463,7.0288,2.4824,-0.057,-5.4731,7.6422,12.2217,4.5632,...,138.525833,105.14344,-12.135738,70.626442,79.805641,-9.211229,53.606655,1.063167,-6.187322,36.0084
4,2.1761,-17.0776,16.811001,4.6831,-2.1157,16.591,-8.8183,23.848,23.709801,15.3342,...,16.612961,36.816788,20.045683,-83.176071,81.591469,44.42421,-184.330536,24.187706,-100.362679,416.437469


In [91]:
# Cast the test inputs to float32 before expanding them, mirroring how the
# training columns are cast before their fit_transform: train and test
# features are then derived from identically rounded inputs.
# (Assumes PolynomialFeatures keeps the input dtype, which would also keep
# the intermediate array at float32 size - confirm for the installed version.)
test_values_ucm_1_2 = polinomial_features_maker.fit_transform(
    test_df[uniques_count_more_1_2.index].astype(np.float32)
).astype(np.float32)

In [92]:
test_polinomial_df_ucm_1_2 = pd.DataFrame(
    data=test_values_ucm_1_2,
    columns=polinomial_feature_names_uniques_count_more_1_2,
    dtype=np.float32
)

In [93]:
del test_values_ucm_1_2
gc.collect()

7

In [94]:
del polinomial_values_uniques_count_more_1_2
gc.collect()

0

In [95]:
train_results_polinomial_ucm_1_2 = train(
    train_polinomial_df_ucm_1_2,
    test_polinomial_df_ucm_1_2,
    #train_target_df_ucm_1_2,
    train_target_series_ucm_1_2,
    train_polinomial_df_ucm_1_2.columns.tolist(),
    param
)

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.838762	valid_1's auc: 0.802518
[2000]	training's auc: 0.864215	valid_1's auc: 0.811186
[3000]	training's auc: 0.882027	valid_1's auc: 0.812682
[4000]	training's auc: 0.898256	valid_1's auc: 0.812993
[5000]	training's auc: 0.913146	valid_1's auc: 0.812344
[6000]	training's auc: 0.926696	valid_1's auc: 0.812159
Early stopping, best iteration is:
[3874]	training's auc: 0.896196	valid_1's auc: 0.813274
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.838786	valid_1's auc: 0.802206
[2000]	training's auc: 0.863654	valid_1's auc: 0.812419
[3000]	training's auc: 0.881227	valid_1's auc: 0.815243
[4000]	training's auc: 0.897331	valid_1's auc: 0.816398
[5000]	training's auc: 0.912293	valid_1's auc: 0.816428
[6000]	training's auc: 0.925757	valid_1's auc: 0.816328
[7000]	training's auc: 0.938058	valid_1's auc: 0.81598
Early stopping, best iteration is:
[4563

In [100]:
oof, predictions, feature_importance_df, clf = train_results_polinomial_ucm_1_2

In [103]:
#type(predictions)
predictions_df = pd.DataFrame(data=predictions, columns=['target'])

In [104]:
predictions_df.to_csv('predictions_ucm_1_2.csv', index=False)

In [107]:
train_polinomial_df_ucm_1_2.to_csv('train_polinomial_data_ucm_1_2.csv', index=False)

In [108]:
test_polinomial_df_ucm_1_2.to_csv('test_polinomial_data_ucm_1_2.csv', index=False)

In [109]:
train_target_series_ucm_1_2.to_csv('train_target_ucm_1_2.csv', index=False)

In [110]:
del train_polinomial_df_ucm_1_2
del test_polinomial_df_ucm_1_2
del train_target_series_ucm_1_2
gc.collect()

346

In [105]:
clf.save_model('lgbm_ucf_1_2.txt')

In [305]:
clf.predict?

In [111]:
polinomial_values_uniques_count_more_1_4_less_1_2 = polinomial_features_maker.fit_transform(train_df_uniques_count_more_1_4_less_1_2).astype(np.float32)

In [112]:
len(polinomial_values_uniques_count_more_1_4_less_1_2)

200000

In [113]:
polinomial_feature_names_uniques_count_more_1_4_less_1_2 = [normalise_feature_name(feature_name) for feature_name in polinomial_features_maker.get_feature_names(train_df_uniques_count_more_1_4_less_1_2.columns.tolist())]

In [114]:
train_polinomial_values_ucm_1_4_1_2, holdout_test_polinomial_values_ucm_1_4_1_2, train_target_values_ucm_1_4_1_2, holdout_test_target_values_ucm_1_4_1_2 = train_test_split(
    #scaled_train_values,
    polinomial_values_uniques_count_more_1_4_less_1_2,
    target_values,
    test_size=0.2,
    random_state=0
)

In [115]:
train_polinomial_df_ucm_1_4_1_2 = pd.DataFrame(
    data=train_polinomial_values_ucm_1_4_1_2,
    columns=polinomial_feature_names_uniques_count_more_1_4_less_1_2,
    dtype=np.float32
)

In [116]:
train_target_series_ucm_1_4_1_2 = pd.Series(train_target_values_ucm_1_4_1_2)

In [118]:
train_target_series_ucm_1_4_1_2.shape

(160000,)

In [120]:
train_target_series_ucm_1_4_1_2.head()

0    0
1    1
2    0
3    0
4    0
dtype: int64

In [121]:
# Cast the test inputs to float32 before expanding them, mirroring how the
# training columns are cast before their fit_transform: train and test
# features are then derived from identically rounded inputs.
# (Assumes PolynomialFeatures keeps the input dtype - confirm for the
# installed version.)
test_polinomial_values_ucm_1_4_1_2 = polinomial_features_maker.fit_transform(
    test_df[uniques_count_more_1_4_less_1_2.index].astype(np.float32)
).astype(np.float32)

In [122]:
test_polinomial_df_ucm_1_4_1_2 = pd.DataFrame(
    data=test_polinomial_values_ucm_1_4_1_2,
    columns=polinomial_feature_names_uniques_count_more_1_4_less_1_2,
    dtype=np.float32
)

In [123]:
del polinomial_values_uniques_count_more_1_4_less_1_2
del train_polinomial_values_ucm_1_4_1_2
del test_polinomial_values_ucm_1_4_1_2
del train_target_values_ucm_1_4_1_2
gc.collect()

108

In [124]:
train_results_polinomial_ucm_1_4_1_2 = train(
    train_polinomial_df_ucm_1_4_1_2,
    test_polinomial_df_ucm_1_4_1_2,
    train_target_series_ucm_1_4_1_2,
    train_polinomial_df_ucm_1_4_1_2.columns.tolist(),
    param
)

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.83246	valid_1's auc: 0.801038
[2000]	training's auc: 0.854414	valid_1's auc: 0.805477
[3000]	training's auc: 0.871523	valid_1's auc: 0.806123
[4000]	training's auc: 0.887375	valid_1's auc: 0.805903
[5000]	training's auc: 0.902057	valid_1's auc: 0.806144
Early stopping, best iteration is:
[2842]	training's auc: 0.868851	valid_1's auc: 0.80622
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.831354	valid_1's auc: 0.807533
[2000]	training's auc: 0.853586	valid_1's auc: 0.814413
[3000]	training's auc: 0.871036	valid_1's auc: 0.815046
[4000]	training's auc: 0.887196	valid_1's auc: 0.814766
[5000]	training's auc: 0.90206	valid_1's auc: 0.814434
[6000]	training's auc: 0.915315	valid_1's auc: 0.814192
Early stopping, best iteration is:
[3584]	training's auc: 0.880639	valid_1's auc: 0.815199
Fold 2
Training until validation scores don't improve for 3000 

In [127]:
oof_ucm_1_4_1_2, predictions_ucm_1_4_1_2, feature_importance_df_ucm_1_4_1_2, clf_ucm_1_4_1_2 = train_results_polinomial_ucm_1_4_1_2

In [129]:
#type(oof_ucm_1_4_1_2)
predictions_df_ucm_1_4_1_2 = pd.DataFrame(data=predictions_ucm_1_4_1_2, columns=['target'])

In [130]:
predictions_df_ucm_1_4_1_2.to_csv('predictions_ucm_1_4_1_2.csv', index=False)

In [131]:
train_polinomial_df_ucm_1_4_1_2.to_csv('train_polinomial_data_ucm_1_4_1_2.csv', index=False)

In [132]:
test_polinomial_df_ucm_1_4_1_2.to_csv('test_polinomial_data_ucm_1_4_1_2.csv', index=False)

In [134]:
train_target_series_ucm_1_4_1_2.to_csv('train_target_ucm_1_4_1_2.csv', index=False)

In [136]:
del train_polinomial_df_ucm_1_4_1_2
del test_polinomial_df_ucm_1_4_1_2
del train_target_series_ucm_1_4_1_2
gc.collect()

168

In [135]:
clf_ucm_1_4_1_2.save_model('lgbm_ucm_1_4_1_2.txt')

In [137]:
polinomial_values_uniques_count_less_1_4 = polinomial_features_maker.fit_transform(train_df_uniques_count_less_1_4).astype(np.float32)

In [142]:
len(polinomial_values_uniques_count_less_1_4)
polinomial_values_uniques_count_less_1_4.shape

(200000, 819)

In [145]:
polinomial_feature_names_uniques_count_less_1_4 = [normalise_feature_name(feature_name) for feature_name in polinomial_features_maker.get_feature_names(train_df_uniques_count_less_1_4.columns.tolist())]

In [143]:
train_polinomial_values_ucm_1_4, holdout_test_polinomial_values_ucm_1_4, train_target_values_ucm_1_4, holdout_test_target_values_ucm_1_4 = train_test_split(
    #scaled_train_values,
    polinomial_values_uniques_count_less_1_4,
    target_values,
    test_size=0.2,
    random_state=0
)

In [146]:
train_polinomial_df_ucm_1_4 = pd.DataFrame(
    data=train_polinomial_values_ucm_1_4,
    columns=polinomial_feature_names_uniques_count_less_1_4,
    dtype=np.float32
)

In [153]:
train_target_series_ucm_1_4 = pd.Series(train_target_values_ucm_1_4)

In [154]:
train_target_series_ucm_1_4.shape

(160000,)

In [156]:
# Cast the test inputs to float32 before expanding them, mirroring how the
# training columns are cast before their fit_transform: train and test
# features are then derived from identically rounded inputs.
# (Assumes PolynomialFeatures keeps the input dtype - confirm for the
# installed version.)
test_polinomial_values_ucm_1_4 = polinomial_features_maker.fit_transform(
    test_df[uniques_count_less_1_4.index].astype(np.float32)
).astype(np.float32)

In [152]:
# Wrap the expanded test array (test_polinomial_values_ucm_1_4) in a frame
# that carries the same column names as the training frame. The name used
# here before, test_values_ucm_1_4, is not assigned in any visible cell and
# fails with a NameError on a fresh kernel.
test_polinomial_df_ucm_1_4 = pd.DataFrame(
    data=test_polinomial_values_ucm_1_4,
    columns=polinomial_feature_names_uniques_count_less_1_4,
    dtype=np.float32
)

In [159]:
#del polinomial_values_uniques_count_less_1_4
#del train_polinomial_values_ucm_1_4
# Unbind the raw test matrix and the raw training targets, then run a
# garbage-collection pass (its return value is the cell output).
del (
    test_polinomial_values_ucm_1_4,
    train_target_values_ucm_1_4,
)
gc.collect()

505

In [160]:
# Run the training routine on the ucm_1_4 frames, using every column of the
# training frame as a feature and the shared `param` settings.
train_results_polinomial_ucm_1_4 = train(
    train_polinomial_df_ucm_1_4,
    test_polinomial_df_ucm_1_4,
    train_target_series_ucm_1_4,
    list(train_polinomial_df_ucm_1_4.columns),
    param,
)

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.750146	valid_1's auc: 0.706936
[2000]	training's auc: 0.773966	valid_1's auc: 0.707595
[3000]	training's auc: 0.796843	valid_1's auc: 0.706173
[4000]	training's auc: 0.817903	valid_1's auc: 0.705203
Early stopping, best iteration is:
[1548]	training's auc: 0.763514	valid_1's auc: 0.70773
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.748817	valid_1's auc: 0.719931
[2000]	training's auc: 0.773467	valid_1's auc: 0.720096
[3000]	training's auc: 0.7964	valid_1's auc: 0.718983
[4000]	training's auc: 0.816975	valid_1's auc: 0.718124
Early stopping, best iteration is:
[1220]	training's auc: 0.754455	valid_1's auc: 0.721226
Fold 2
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.749849	valid_1's auc: 0.706014
[2000]	training's auc: 0.773877	valid_1's auc: 0.708997
[3000]	training's auc: 0.796516	valid_1's auc: 0

In [162]:
# Split the four-element training result into individually named objects.
(
    oof_ucm_1_4,
    predictions_ucm_1_4,
    feature_importance_df_ucm_1_4,
    clf_ucm_1_4,
) = train_results_polinomial_ucm_1_4

In [163]:
# Put the ucm_1_4 predictions into a one-column frame named 'target'.
predictions_df_ucm_1_4 = pd.DataFrame(
    predictions_ucm_1_4,
    columns=['target'],
)

In [164]:
# Write the ucm_1_4 prediction frame to CSV, omitting the index column.
predictions_df_ucm_1_4.to_csv('predictions_ucm_1_4.csv', index=False)

In [165]:
# Write the ucm_1_4 training feature frame to CSV, omitting the index column.
train_polinomial_df_ucm_1_4.to_csv('train_polinomial_data_ucm_1_4.csv', index=False)

In [166]:
# Write the ucm_1_4 test feature frame to CSV, omitting the index column.
test_polinomial_df_ucm_1_4.to_csv('test_polinomial_data_ucm_1_4.csv', index=False)

In [167]:
# Write the ucm_1_4 training target series to CSV, omitting the index.
# NOTE(review): whether a header row is written for a Series depends on the pandas version — confirm.
train_target_series_ucm_1_4.to_csv('train_target_ucm_1_4.csv', index=False)

In [168]:
# Unbind the three ucm_1_4 objects that were just written to disk, then
# run a garbage-collection pass (its return value is the cell output).
del (
    train_polinomial_df_ucm_1_4,
    test_polinomial_df_ucm_1_4,
    train_target_series_ucm_1_4,
)
gc.collect()

276

In [169]:
# Save the ucm_1_4 model to a text file via its save_model method.
clf_ucm_1_4.save_model('lgbm_ucm_1_4.txt')

In [309]:
#print(predictions_df.head())
#print(predictions_df.shape)
#print(predictions_df_ucm_1_4_1_2.head())
#print(predictions_df_ucm_1_4_1_2.shape)
#print(predictions_df_ucm_1_4.head())
#print(predictions_df_ucm_1_4.shape)
#predictions_df.add?

In [201]:
def simply_blend(prediction_dataframes, weights, target_column_name='target'):
    """Blend several prediction frames into one weighted-average frame.

    Rows are combined by position, not by index label, so frames whose
    indexes differ (e.g. after sorting or filtering) no longer produce NaN
    through pandas index alignment.

    Parameters
    ----------
    prediction_dataframes : sequence of pandas.DataFrame
        Frames that each contain ``target_column_name`` and have the same
        number of rows.
    weights : sequence of numbers
        One weight per frame; the weighted sum is divided by ``sum(weights)``.
    target_column_name : str, default 'target'
        Column to read from every frame and to write in the result.

    Returns
    -------
    pandas.DataFrame
        One-column frame with a default integer index holding the weighted mean.

    Raises
    ------
    ValueError
        If no frames are given, the number of weights differs from the number
        of frames, the frames differ in row count, or the weights sum to zero.
    """
    prediction_dataframes = list(prediction_dataframes)
    weights = list(weights)

    if not prediction_dataframes:
        raise ValueError('prediction_dataframes must contain at least one frame')
    if len(prediction_dataframes) != len(weights):
        # zip() would silently drop the surplus frames or weights.
        raise ValueError('prediction_dataframes and weights must have the same length')

    total_weight = sum(weights)
    if total_weight == 0:
        raise ValueError('weights must not sum to zero')

    rows_count = prediction_dataframes[0].shape[0]
    blended_values = np.zeros(rows_count)
    for prediction_df, weight in zip(prediction_dataframes, weights):
        # .values strips the index so the addition is purely positional.
        values = prediction_df[target_column_name].values
        if values.shape[0] != rows_count:
            raise ValueError('all prediction frames must have the same number of rows')
        blended_values = blended_values + weight * values

    return pd.DataFrame(data=blended_values / total_weight, columns=[target_column_name])

In [244]:
# Equal-weight blend of the three prediction frames.
blended_polinomial_prediction = simply_blend(
    prediction_dataframes=[
        predictions_df,
        predictions_df_ucm_1_4_1_2,
        predictions_df_ucm_1_4,
    ],
    weights=[1, 1, 1],
)

In [209]:
# Show the shape of the blended frame. A cell only displays its last
# expression, so the former leading `.head()` call was evaluated and discarded.
blended_polinomial_prediction.shape

(200000, 1)

In [205]:
#blended_polinomial_prediction.to_csv('simply_blended_polinom_lgb_submission.csv', index=False)

In [207]:
#classes_ratio = train_df[train_df['target'] == 1].shape[0] / train_df[train_df['target'] == 0].shape[0]

In [234]:
#threshold, submission_predicts = detect_threshold(classes_ratio, 0.01, blended_polinomial_prediction)

In [211]:
#blended_polinomial_prediction.sort_index?

In [219]:
#sorted_blended_polinomial_prediction = blended_polinomial_prediction.sort_values(by=['target'])

In [222]:
#sorted_blended_polinomial_prediction.reindex_axis?

In [310]:
#sorted_blended_polinomial_prediction.head()

In [311]:
#sorted_blended_polinomial_prediction.tail()

In [218]:
#blended_polinomial_prediction.sort_index()

In [312]:
#sorted_blended_polinomial_prediction.reset_index(inplace=True)

In [313]:
#sorted_blended_polinomial_prediction.head()

In [314]:
#sorted_blended_polinomial_prediction = sorted_blended_polinomial_prediction[sorted_blended_polinomial_prediction.columns.drop(['index'])]

In [315]:
#sorted_blended_polinomial_prediction

In [316]:
#classes_ratio

In [230]:
#above_treshold_size = sorted_blended_polinomial_prediction.shape[0] * classes_ratio

In [317]:
#above_treshold_size

In [318]:
#below_treshold_size = sorted_blended_polinomial_prediction.shape[0] - int(above_treshold_size)

In [319]:
#below_treshold_size

In [320]:
#sorted_blended_polinomial_prediction = blended_polinomial_prediction.sort_values(by=['target'])

In [321]:
#blended_polinomial_prediction.head()

In [322]:
#train_df[train_df['target'] == 0].shape[0]

In [323]:
#submission_df = set_classes(train_df[train_df['target'] == 0].shape[0], blended_polinomial_prediction, 'target')

In [324]:
#submission_df

In [325]:
#submission_df.tail()

In [326]:
#submission_df[submission_df['target'] == 0].shape

In [327]:
#submission_df.shape

In [328]:
#200000 - 179902

In [329]:
#train_df[train_df['target'] == 0].shape[0]

In [330]:
#train_df.shape

In [331]:
#submission_df.shape

In [332]:
#sorted_blended_polinomial_prediction.head()

In [333]:
#sorted_blended_polinomial_prediction.head()

In [334]:
#sorted_blended_polinomial_prediction.loc[:train_df[train_df['target'] == 0].shape[0], 'target'].shape

In [335]:
#sorted_blended_polinomial_prediction[:train_df[train_df['target'] == 0].shape[0]] = 0

In [336]:
#sorted_blended_polinomial_prediction.head()

In [337]:
#sorted_blended_polinomial_prediction[train_df[train_df['target'] == 0].shape[0]:] = 1

In [338]:
#sorted_blended_polinomial_prediction.tail()

In [339]:
#sorted_blended_polinomial_prediction.sort_index()

In [340]:
#submission_df.to_csv('submission_lgbm_blended_1.csv', index=False)

In [341]:
#submission_df.shape

In [342]:
#submission_df.info()

In [343]:
#blended_polinomial_prediction.info()

In [344]:
#submission_df

In [345]:
#df = pd.read_csv('submission_mlp_0.csv')

In [346]:
#df

In [290]:
# Pull the test identifiers out of the test frame as a plain array.
ID_code = test_df.loc[:, 'ID_code'].values

In [306]:
# Pair every test identifier with its blended score, stored as float32.
submission_df = pd.DataFrame(
    {
        'ID_code': ID_code,
        'target': blended_polinomial_prediction['target'].values.astype(np.float32),
    }
)

In [347]:
#submission_df

In [308]:
# Write the blended-score submission frame to CSV, omitting the index column.
submission_df.to_csv('simply_blended_polinom_lgb_submission.csv', index=False)

In [348]:
#submission_df

In [349]:
#blended_df_right_classes_values = set_classes(train_df[train_df['target'] == 0].shape[0], blended_polinomial_prediction, 'target')

In [350]:
#blended_df_right_classes_values

In [301]:
# Rebuild submission_df from blended_df_right_classes_values instead of the blended scores.
# NOTE(review): the only visible assignment of blended_df_right_classes_values is in a
# commented-out cell, so on a fresh kernel this line would raise NameError — confirm
# whether this cell is stale and should be disabled like its neighbours.
submission_df = pd.DataFrame({'ID_code': ID_code, 'target': blended_df_right_classes_values['target'].values.astype('float32')})

In [302]:
# Display the submission frame built in the previous cell.
submission_df

Unnamed: 0,ID_code,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,0.0
4,test_4,0.0
5,test_5,0.0
6,test_6,0.0
7,test_7,0.0
8,test_8,0.0
9,test_9,0.0


In [303]:
# Write the class-label variant of the submission to its own file. It previously
# used the same file name as the blended-score submission written a few cells
# earlier, so a top-to-bottom run would have overwritten that file.
submission_df.to_csv('simply_blended_polinom_lgb_submission_classes.csv', index=False)