In [None]:
import os
import gc
import logging
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, accuracy_score, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import clone

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import cluster

In [None]:
%matplotlib inline

In [None]:
# Wall-clock reference point; a later cell subtracts it from time.time() to report elapsed minutes.
start_time = time.time()

In [None]:
# Training table, read from the relative ../input directory.
train_df = pd.read_csv('../input/train.csv')

In [None]:
# Test table, read from the same directory.
test_df = pd.read_csv('../input/test.csv')

In [None]:
# Column dtypes, non-null counts and memory footprint.
train_df.info()

In [None]:
# Summary statistics for the numeric columns.
train_df.describe()

In [None]:
# (rows, columns)
train_df.shape

In [None]:
# First five rows.
train_df.head()

In [None]:
train_df_id_droped = train_df[train_df.columns.drop('ID_code')]

In [None]:
train_df['var_0'].unique().shape

In [None]:
#train_df.apply(pd.unique, axis=0)

In [None]:
uniques_dict = {column_name: train_df_id_droped[column_name].unique() for column_name in train_df_id_droped.columns.drop('target').tolist()}

In [None]:
uniques_dict_counts = {column_name: uniques.shape[0] for column_name, uniques in uniques_dict.items()}

In [None]:
uniques_counts_series = pd.Series(uniques_dict_counts)

In [None]:
uniques_counts_series[:5]

In [None]:
uniques_counts_series.unique().shape

In [None]:
uniques_counts_series.max()

In [None]:
uniques_counts_series.min()

In [None]:
# Bar chart of the number of distinct values per feature, in column order.
fig, ax = plt.subplots(figsize=(24, 18))
# Take the x positions from the series itself rather than assuming exactly
# 200 features; a mismatch between x and height lengths makes ax.bar raise.
ax.bar(
    np.arange(uniques_counts_series.size),
    uniques_counts_series.values.astype(np.int64),
)
ax.set_title('Features uniques values num')
ax.set_xlabel('Feature position')
ax.set_ylabel('Number of unique values')
plt.show()

In [None]:
# Minutes elapsed since start_time was set near the top of the notebook.
# Despite the variable name, this only covers the cells executed before this one.
all_cells_execution_time = time.time() - start_time
print("all cells execution time: {} min".format(all_cells_execution_time / 60))

In [None]:
# (rows, columns) of the frame with ID_code removed.
train_df_id_droped.shape

In [None]:
# Histogram + KDE of the per-feature distinct-value counts.
plt.figure(figsize=(24, 18))
plt.title("Distribution of unique values per column in the train dataset")
sns.distplot(uniques_counts_series.values.astype(np.int64), color='green', kde=True, bins=200, label="train")
plt.legend()
plt.show()

In [None]:
# Select the feature columns by name (everything except the identifier and the
# label) instead of relying on them sitting at positions 2..201.
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
plt.figure(figsize=(24, 18))
# Only the train set is drawn (the test curve below is disabled), so the title says so.
plt.title("Distribution of mean values per column in the train set")
sns.distplot(train_df[features].mean(axis=0), color="magenta", kde=True, bins=120, label='train')
#sns.distplot(test_df[features].mean(axis=0),color="darkblue", kde=True,bins=120, label='test')
plt.legend()
plt.show()

In [None]:
# Histogram + KDE of each row's mean over the feature columns (train only).
mean_ax = plt.figure(figsize=(24, 18)).gca()
features = train_df.columns.values[2:202]
#plt.title("Distribution of mean values per row in the train and test set")
mean_ax.set_title("Distribution of mean values per row in the train set")
sns.distplot(
    train_df[features].mean(axis=1),
    color="blue",
    kde=True,
    bins=120,
    label='train',
    ax=mean_ax,
)
#sns.distplot(test_df[features].mean(axis=1),color="blue", kde=True,bins=120, label='test')
mean_ax.legend()
plt.show()

In [None]:
# Label column, and the list of every other column except the row identifier.
target = train_df['target']
features = [name for name in train_df.columns if name not in {'ID_code', 'target'}]

In [None]:
# LightGBM parameter dictionary; the disabled CV cell in this notebook passes it to lgb.train.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [None]:
# Disabled: the code below is wrapped in a bare string literal, so nothing in it runs.
# It is an inline 10-fold LightGBM CV loop; the same steps form the body of the
# train() function defined later in this notebook.
'''
folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=44000)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, early_stopping_rounds=3000)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df['Feature'] = features
    fold_importance_df['importance'] = clf.feature_importance()
    fold_importance_df['fold'] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
'''

In [None]:
# Disabled (bare string literal): bar plot of the 150 features with the highest
# mean importance across folds, saved to FI.png. It needs feature_importance_df,
# which only the disabled loop above or train() produces.
'''
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:150].index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

plt.figure(figsize=(14,28))
sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')
'''

In [None]:
# Scaler instance with default settings.
# NOTE(review): no use of it is visible in this part of the notebook.
min_max_scaler = MinMaxScaler()

In [None]:
train_df_rows_count = train_df.shape[0]

In [None]:
train_df_rows_count

In [None]:
uniques_count_more_1_2 = uniques_counts_series[uniques_counts_series > train_df_rows_count / 2]

In [None]:
uniques_count_more_1_2.shape

In [None]:
uniques_count_less_1_2_more_1_4 = uniques_counts_series[uniques_counts_series < train_df_rows_count / 2]

In [None]:
#uniques_count_less_1_2_more_1_4 = uniques_counts_series[
#    ((uniques_counts_series < train_df_rows_count / 2).bool() and (uniques_counts_series > train_df_rows_count / 4).bool()).bool()
#]

In [None]:
uniques_count_less_1_2_more_1_4 = uniques_count_less_1_2_more_1_4[uniques_count_less_1_2_more_1_4 > train_df_rows_count / 4]

In [None]:
uniques_count_less_1_2_more_1_4.shape

In [None]:
uniques_count_less_1_4 = uniques_counts_series[uniques_counts_series < train_df_rows_count / 4]

In [None]:
uniques_count_less_1_4.shape

In [None]:
uniques_count_less_1_4.head()

In [None]:
uniques_count_less_1_4.index.tolist()

In [None]:
def train(train_df, test_df, features, param, num_round=1000000, labels=None):
    """Run 10-fold stratified LightGBM cross-validation and average test predictions.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain every column listed in ``features``.
    test_df : pandas.DataFrame
        Rows to predict; must contain every column listed in ``features``.
    features : list of str
        Names of the columns fed to the model.
    param : dict
        Parameters forwarded unchanged to ``lgb.train``.
    num_round : int, optional
        Upper bound on boosting rounds per fold. Early stopping (3000 rounds)
        is also passed to ``lgb.train``.
    labels : array-like, optional
        Binary labels aligned row-for-row with ``train_df``. When omitted, the
        ``'target'`` column of ``train_df`` is used if present; otherwise the
        notebook-level ``target`` variable is used, as before.

    Returns
    -------
    tuple
        ``(oof, predictions, feature_importance_df)``: out-of-fold predictions
        for ``train_df``, fold-averaged predictions for ``test_df``, and a
        frame with columns ``Feature``, ``importance`` and ``fold`` (one row
        per feature per fold).
    """
    start_time = time.time()

    if labels is None:
        # Prefer the labels that travel with the frame being trained on, so
        # the function does not depend on which notebook cells ran before it.
        labels = train_df['target'] if 'target' in train_df.columns else target
    labels = np.asarray(labels)

    # shuffle=False produces deterministic contiguous folds; random_state is
    # ignored in that mode and newer scikit-learn rejects the combination.
    folds = StratifiedKFold(n_splits=10, shuffle=False)

    n_rows = len(train_df)
    oof = np.zeros(n_rows)
    predictions = np.zeros(len(test_df))
    fold_importances = []

    # Slice the feature columns once instead of once per fold.
    train_features = train_df[features]
    test_features = test_df[features]

    # split() only needs the number of samples from X, so a placeholder avoids
    # converting the whole frame to an object array.
    splits = folds.split(np.zeros(n_rows), labels)
    for fold_, (trn_idx, val_idx) in enumerate(splits):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_features.iloc[trn_idx], label=labels[trn_idx])
        val_data = lgb.Dataset(train_features.iloc[val_idx], label=labels[val_idx])
        clf = lgb.train(
            param,
            trn_data,
            num_round,
            valid_sets=[trn_data, val_data],
            verbose_eval=1000,
            early_stopping_rounds=3000
        )
        oof[val_idx] = clf.predict(train_features.iloc[val_idx], num_iteration=clf.best_iteration)
        fold_importances.append(
            pd.DataFrame(
                {
                    'Feature': features,
                    'importance': clf.feature_importance(),
                    'fold': fold_ + 1,
                },
                columns=['Feature', 'importance', 'fold'],
            )
        )
        predictions += clf.predict(test_features, num_iteration=clf.best_iteration) / folds.n_splits

    # Concatenate once at the end rather than growing a frame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    print("Total run time {} min:".format((time.time() - start_time) / 60))
    print("CV score: {:<8.5f}".format(roc_auc_score(labels, oof)))
    return oof, predictions, feature_importance_df

In [None]:
#train_results_more_1_2 = train(train_df, test_df, uniques_count_more_1_2.index.tolist(), param)

In [None]:
#train_results_less_1_2_more_1_4 = train(train_df, test_df, uniques_count_less_1_2_more_1_4.index.tolist(), param)

In [None]:
#train_results_less_1_4 = train(train_df, test_df, uniques_count_less_1_4.index.tolist(), param)

In [None]:
#oof_more_1_2, predictions_more_1_2, importance_df_more_1_2  = train_results_more_1_2

In [None]:
#predictions_more_1_2.shape

In [None]:
#predictions_more_1_2[:10]

In [None]:
#predictions_more_1_2.max()

In [None]:
#predictions_more_1_2.min()

In [None]:
#df = pd.DataFrame({'var0': [1, 2], 'var1': [3, 4], 'var2': [5, 6], 'var3': [7, 8]})

In [None]:
#df

In [None]:
# Degree-2 polynomial expander: interaction_only=False keeps the pure powers as
# well as the cross terms, and include_bias=False leaves out the constant column.
polinomial_features_maker = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [None]:
#new_features = polinomial_features_maker.fit_transform(df)

In [None]:
#new_features

In [None]:
#new_features.shape

In [None]:
#polinomial_features_maker.get_feature_names(['var0', 'var1', 'var2', 'var3'])

In [None]:
def normalise_feature_name(feature_name):
    """Turn a generated polynomial feature name into an underscore-separated one.

    Every ``^`` becomes ``_`` and runs of whitespace are collapsed into a single
    ``_``. Both rules are applied to the same name, so a term that contains a
    power *and* a product (e.g. ``'var0^2 var1'``) is fully normalised.

    Parameters
    ----------
    feature_name : str
        Name such as ``'var0'``, ``'var0^2'`` or ``'var0 var1'``.

    Returns
    -------
    str
        The name with ``^`` and spaces replaced; names containing neither are
        returned unchanged.
    """
    normalised = feature_name.replace('^', '_')
    if ' ' in normalised:
        # split() with no argument also strips and collapses repeated whitespace.
        normalised = '_'.join(normalised.split())
    return normalised

In [None]:
#normalised_feature_names = [normalise_feature_name(feature_name) for feature_name in polinomial_features_maker.get_feature_names(['var0', 'var1', 'var2', 'var3'])]

In [None]:
#normalised_feature_names

In [None]:
# Label column taken out of the frame via .values.
target_values = train_df['target'].values

In [None]:
# Disabled (bare string literal): 80/20 hold-out split of the raw feature
# values and labels with a fixed random_state of 0.
'''
train_values, holdout_test_values, train_target_values, holdout_test_target_values = train_test_split(
    #scaled_train_values,
    train_df[train_df.columns.drop(['ID_code', 'target'])].values,
    target_values,
    test_size=0.2,
    random_state=0
)
'''

In [None]:
#feature_names = train_df.columns.drop(['ID_code', 'target']).tolist()

In [None]:
#polinomial_train_values = polinomial_features_maker.fit_transform(train_values)

In [None]:
#polinomial_holdout_test_values = polinomial_features_maker.fit_transform(holdout_test_values)

In [None]:
#polinomial_features_names = [normalise_feature_name(feature_name) for feature_name in polinomial_features_maker.get_feature_names(feature_names)]

In [None]:
train_df_uniques_count_more_1_2 = train_df[uniques_count_more_1_2.index]

In [None]:
train_df_uniques_count_less_1_2_more_1_4 = train_df[uniques_count_less_1_2_more_1_4.index]

In [None]:
train_df_uniques_count_less_1_4 = train_df[uniques_count_less_1_4.index]

In [None]:
'target' in train_df_uniques_count_less_1_4.columns.tolist()

In [None]:
polinomial_values_uniques_count_more_1_2 = polinomial_features_maker.fit_transform(train_df_uniques_count_more_1_2)

In [None]:
#del polinomial_values_uniques_count_more_1_2

In [None]:
#gc.collect()