In [1]:
import os
import time
import numpy as np
import pandas as pd
import math
from scipy import stats
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#pandas settings
pd.set_option('max_colwidth',250)
pd.set_option('max_columns',250)
pd.set_option('max_rows',500)

## Extra Libraries

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn import svm
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [3]:
def truncate(n, decimals=0):
    multiplier = 10 ** decimals
    return math.floor(n * multiplier) / multiplier

## Load Data

In [4]:
train = pd.read_csv('../input/duth-dbirlab2-1/train.csv')
test = pd.read_csv('../input/duth-dbirlab2-1/test.csv')

## Παρακάτω φαίνονται καποιες απο τις δοκιμές που καναμε σε κελιά με σχόλια και έχουμε αφήσει χωρίς σχόλια τα κελιά τα οποία σειριακά μας δίνουν το καλύτερο σκόρ που πετύχαμε στο Public Leaderboard.

## Feature Engineering - Current Intensity/Current Direction 

In [5]:
variables = ['meridionalCurrent','zonalCurrent']
names = []
direction = []
dir_name = []
for var in variables:
    names.append(train.filter(regex='^'+ var ,axis=1).filter(regex='mean$',axis=1).columns.tolist())
for i in range(0,len(names[0])):
    var_name = 'currentIntensity' + names[0][i].split('Current')[1]
    dir_name.append('currentDirection' + names[0][i].split('Current')[1])
    
    train[var_name] = (train[names[0][i]]**2 + train[names[1][i]]**2)**(1/2)
    test[var_name] = (test[names[0][i]]**2 + test[names[1][i]]**2)**(1/2)
    
    for df in [train,test]:
        temp = []
        for j in df.index:
            a = df[names[0][i]][j]
            b = df[names[1][i]][j] 
            if a > 0 :
                if b > 0 :
                    s = 'NE'
                elif b < 0 :
                    s = 'NW'
                else:
                    s = 'N'
            elif a < 0 :
                if b > 0 :
                    s = 'SE'
                elif b < 0 :
                    s = 'SW'
                else:
                    s = 'S'
            else:
                if b > 0 :
                    s = 'E'
                elif b < 0 :
                    s = 'W'
                else:
                    s = 'None'
            temp.append(s)
        direction.append(temp)

        df.drop([names[0][i]], axis=1, inplace=True)
        df.drop([names[1][i]], axis=1, inplace=True)

        
for i in range(0,len(direction)):
    if (i % 2) == 0:
        train[dir_name[0]] = direction[i]
    else:
        test[dir_name[0]] = direction[i]
        dir_name.pop(0)

An attempt to utilise the given variables in a better way. This script creates two new variables, Current Intensity and Current Direction , for every pair of MeridionalCurrent & ZonalCurrent mean. This method delivers a good CV mean but it doesn't improve our best run. 

## Feature Engineering - Bathymetry

In [6]:
train['depth_range_0400'] = train['bathymetry'].apply(lambda x: 1 if x>-400 else 0)
test['depth_range_0400'] = test['bathymetry'].apply(lambda x: 1 if x>-400 else 0)

Adding a depth_range_0400 binary column that has value 1 if depth between 0-400 and 0 otherwise

## Encode Categorical

In [7]:
# df is just the train dataframe and then the test dataframe...
for df in [train,test]:
    for c in df.drop(['obs_id'],axis=1):
        if (df[c].dtype=='object'):
            lbl = LabelEncoder() 
            lbl.fit(list(df[c].values))
            df[c] = lbl.transform(list(df[c].values))

## Train Models Parameters

In [8]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0] # or len(train) train samples length
ntest = test.shape[0] # or len(test) test samples length
SEED = 11 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
folds = KFold(n_splits= NFOLDS, random_state=SEED, shuffle=True)

## Handling Outliers

Approach 1: Replacing outliers with mean value, does not seem to work well

In [9]:
# for col in train.columns:
#     temp = train[col]
#     mean = temp.mean()
#     q1 = train[col].quantile(.15)
#     q3 = train[col].quantile(.85)
#     train.loc[temp<q1,col] = mean
#     train.loc[temp>q3,col] = mean

Approach 2: Capping the values

In [10]:
# for col in train.columns:
#     temp = train[col]
#     q1 = train[col].quantile(.03)
#     q3 = train[col].quantile(.97)
#     train.loc[temp<q1,col] = q1
#     train.loc[temp>q3,col] = q3

Approach 3: Replacing value with mode of distribution

In [11]:
# for col in train.columns:
#     temp = train[col]
#     q1 = train[col].quantile(.03)
#     q3 = train[col].quantile(.97)
#     train.loc[temp<q1,col] = temp.mode()
#     train.loc[temp>q3,col] = temp.mode()

Approach 4: A combination of caping extreme outliers and imputing the rest based on correlated variables.

In [12]:
variables = ['secchiDiskDepth','temperatureSurface','dissolvedOxygenSurface','euphoticDepth','chlorophyll']

for index in range(0, len(variables)): #maybe better done with index due to list popping appending
    mainv = variables[0] + '_mean'
    z = np.abs(stats.zscore(train[mainv]))
    if len(np.where(z>3)[0]) == 0:
        temp = variables.pop(0)
        variables.append(temp)
        continue
    train= train[z<3]
    Q1 = train[mainv].quantile(0.25)
    Q3 = train[mainv].quantile(0.75)
    IQR = Q3 - Q1
    out = ((train[mainv]< (Q1 - 1.5 * IQR)) | (train[mainv] > (Q3 + 1.5 * IQR)))
    if len(np.where(out == True)[0]) == 0:
        temp = variables.pop(0)
        variables.append(temp)
        continue
    train = train.assign(outlier = list(out))

    names = []
    for var in variables:
        names.append(train.filter(regex='^'+ var ,axis=1).filter(regex='mean$',axis=1).columns.tolist())

    #names[0].remove('chlorophyll_detrend_mean') maybe remove it generally from dataset

    for i in range(0,len(names[0])):
        pt = train[train['outlier']==True ][['obs_id',names[1][i],names[2][i],names[3][i],names[4][i]]]
        s = [names[1][i],names[2][i],names[3][i],names[4][i]]
        new_mean = []
        no = []
        for id in pt['obs_id']: 
            all1 = 0.0
            tot = 4
            for name in s:
                if name.startswith('euphotic'):
                    limit = truncate(pt[name][id],2)
                    if limit<0:
                        sample = train[(train[name]>=limit)&(train[name]<=round(limit+0.01,2))&(train['outlier']==False) ][names[0][i]]
                    else:
                        sample = train[(train[name]>=limit)&(train[name]<=round(limit+0.01,2))&(train['outlier']==False) ][names[0][i]]
                    if sample.empty:
                        tot = tot - 1
                        continue
                    sm = sample.sum()
                    c = sample.count()
                    all1 = all1 + sm/c
                    continue

                limit = math.floor(pt[name][id])
                sample = train[(train[name]>=limit)&(train[name]<=limit+1)&(train['outlier']==False)][names[0][i]]
                if sample.empty:
                        tot = tot - 1
                        continue
                sm = sample.sum()
                c = sample.count()
                all1 = all1 + sm/c
            if(tot != 0):
                all1 = np.around(all1/tot,decimals=6)
                new_mean.append(all1)
                no.append(id)
        tot = 0
        for num in no:
            train.loc[[num],[names[0][i]]] = new_mean[tot]
            tot+=1
    
    temp = variables.pop(0)
    variables.append(temp)

The algorithm executes the following steps:
* Find the values that are considered as outliers and remove the extreme ones
* For the remaining outliers check the values of the correlated variables
* Set a range based on this values and find corresponding values for the 'main' variable
* Replace the outlier value with the mean of the corresponding values
  


In [13]:
variables = ['bathymetry','salinitySurface_mean','secchiDiskDepth_mean','temperatureSurface_mean','chlorophyll_mean']

z = np.abs(stats.zscore(train[variables[0]]))
train= train[z<3]
Q1 = train[variables[0]].quantile(0.25)
Q3 = train[variables[0]].quantile(0.75)
IQR = Q3 - Q1
out = ((train[variables[0]]< (Q1 - 1.5 * IQR)) | (train[variables[0]] > (Q3 + 1.5 * IQR)))
train = train.assign(outlier = list(out))
pt = train[train['outlier']==True ][['obs_id',variables[1],variables[2],variables[3],variables[4]]]
no = []
new_mean = []

for id in pt['obs_id']:
    all1 = 0.0
    tot = 4
    for i in range(1,len(variables)):     

        if variables[i].startswith('chlorophyll'):
            limit = truncate(pt[variables[i]][id],2)
            sample = train[(train[variables[i]]>=limit)&(train[variables[i]]<=round(limit+0.01,2))&(train['outlier']==False) ][variables[0]]
            if sample.empty:
                tot = tot - 1
                continue
            sm = sample.sum()
            c = sample.count()
            all1 = all1 + sm/c
            continue
        elif variables[i].startswith('salinity'):
            limit = math.floor(pt[variables[i]][id])
            sample = train[(train[variables[i]]>=limit)&(train[variables[i]]<=limit+0.5)&(train['outlier']==False)][variables[0]]
            if sample.empty:
                tot = tot - 1
                continue
            sm = sample.sum()
            c = sample.count()
            all1 = all1 + sm/c
            continue

        limit = math.floor(pt[variables[i]][id])
        sample = train[(train[variables[i]]>=limit)&(train[variables[i]]<=limit+1)&(train['outlier']==False)][variables[0]]
        if sample.empty:
            tot = tot - 1
            continue
        sm = sample.sum()
        c = sample.count()
        all1 = all1 + sm/c
    if(tot != 0):
        all1 = np.around(all1/tot,decimals=6)
        new_mean.append(all1)
        no.append(id)
tot = 0
for num in no:
    train.loc[[num],[variables[0]]] = new_mean[tot]
    tot+=1

# Extracting data to a numpy vector

In [14]:
cols_to_exclude = ['obs_id','Overall Probability','outlier']
df_train_columns = [c for c in train.columns if c not in cols_to_exclude]

y_train = train['Overall Probability'].ravel() #ravel coverts a series to a numpy array
x_train = train[df_train_columns].values # converts a dataframe to a numpy array
x_test = test[df_train_columns].values

# Log transform

Checking the features that have low variance and transforming
their values with log in order to spread the distribution

In [15]:
# from sklearn.feature_selection import VarianceThreshold

# selector = VarianceThreshold(threshold=(1))
# selector.fit(x_train)

# mask = np.array(selector.get_support(),dtype='bool')

# x_train_log = np.copy(x_train)
# x_test_log = np.copy(x_test)
# x_train_log[:,mask] = np.log10(x_train_log[:,mask]-np.amin(x_train_log[:,mask],axis = 0)+1)
# x_test_log[:,mask] = np.log10(x_test_log[:,mask]-np.amin(x_test_log[:,mask],axis = 0)+1)

# Normalization

We use mainly tree based algorithms so no need to normalize the data

In [16]:
# from sklearn.preprocessing import minmax_scale

# x_train_norm = minmax_scale(x_train)
# x_test_norm = minmax_scale(x_test)

# Standardization

In [17]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler = scaler.fit(x_train)
# x_train_std = scaler.transform(x_train)
# x_test_std = scaler.transform(x_test)

# Dimensionality reduction

## Gaussian Random Projection

bad results

In [18]:
# from sklearn.random_projection import GaussianRandomProjection

# transformer = GaussianRandomProjection(random_state=0,n_components = 500)
# transformer.fit(x_train)
# x_train_gaus = transformer.transform(x_train)
# x_test_gaus = transformer.transform(x_test)

## FeatureAgglomeration 

can be used

In [19]:
# from sklearn.cluster import FeatureAgglomeration 

# selector = FeatureAgglomeration(n_clusters=5371)
# selector.fit(x_train)

# x_train_clus = selector.transform(x_train)
# x_test_clus = selector.transform(x_test)

# locally linear embedding

bad

In [20]:
# from sklearn.manifold import LocallyLinearEmbedding

# selector = LocallyLinearEmbedding(n_neighbors=50, n_components=500,random_state = 0)
# selector.fit(x_train)
# x_train_linemb = selector.transform(x_train)
# x_test_linemb = selector.transform(x_test)

## PCA

In [21]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components = 'mle', svd_solver='full')
# pca.fit(x_train)
# x_train_pca = pca.transform(x_train)
# x_test_pca = pca.transform(x_test)

# Feature selection

## RFE, Recursive Feature Elimination
### good but takes time

In [22]:
# from sklearn.feature_selection import RFE
# from timeit import default_timer
# start = default_timer()

# estimator = LinearRegression()
# selector = RFE(estimator,n_features_to_select=2000)
# selector = selector.fit(x_train_std,y_train)
# mask = np.array(selector.support_,dtype=bool)
# df_train_columns_new = np.copy(df_train_columns)
# df_train_columns_new = np.expand_dims(df_train_columns_new, axis=0)
# df_train_columns_new = df_train_columns_new[:,mask]
# df_train_columns_new = np.squeeze(df_train_columns_new)


# feature_sel_time = default_timer() - start

# # can be done with transform as well
# x_train_selected = x_train[:,mask]
# x_test_selected = x_test[:,mask]

## RFECV 
#### RFE using cross validation

if the previous was taking time this will be endless

keeping it cause it may be used for further feature selection when already having reduced the features

In [23]:
# from sklearn.feature_selection import RFECV
# estimator = RandomForestRegressor(min_weight_fraction_leaf=0.01,n_jobs=-2,random_state=0,max_depth=15, n_estimators=100)            
# selector = RFECV(estimator)
# selector = selector.fit(x_train,y_train)
# mask = np.array(selector.support_,dtype=bool)
# df_train_columns_new = np.copy(df_train_columns)
# df_train_columns_new = np.expand_dims(df_train_columns_new, axis=0)
# df_train_columns_new = df_train_columns_new[:,mask]
# df_train_columns_new = np.squeeze(df_train_columns_new)
# x_train_rfecv = x_train[:,mask]
# x_test_rfecv = x_test[:,mask]

## Select From Model

fast and good

In [24]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel


selector = ExtraTreesRegressor(min_weight_fraction_leaf=0.001,n_jobs=-2,random_state=0,max_depth=50, n_estimators=300)
selector = selector.fit(x_train, y_train)
# selector.feature_importances_  

selector = SelectFromModel(selector, prefit=True)
# X_new = model.transform(X)
# X_new.shape      
x_train_sel_model = selector.transform(x_train)
x_test_sel_model = selector.transform(x_test)

print('total columns {}'.format(np.shape(x_train_sel_model)[1]))

# uncomment if you want to use lgb with columns significance 

# mask = np.array(selector.get_support(),dtype='bool')
# df_train_columns_new = np.copy(df_train_columns)
# df_train_columns_new = np.expand_dims(df_train_columns_new, axis=0)
# df_train_columns_new = df_train_columns[:,mask]
# df_train_columns_new = np.squeeze(df_train_columns_new)

total columns 662


## Variance threshold 
#### removing low variance variables

In [25]:
# from sklearn.feature_selection import VarianceThreshold

# selector = VarianceThreshold(threshold=(.5))
# selector.fit(x_train)

# x_train_rem_var = selector.transform(x_train)
# x_test_rem_var = selector.transform(x_test)

# print('total columns {}'.format(np.shape(x_train_rem_var)[1]))

# # Reducing the labels in case it is used later on lgb
# mask = np.array(selector.get_support(),dtype='bool')
# df_train_columns = np.copy(df_train_columns)
# df_train_columns_new = np.expand_dims(df_train_columns_new, axis=0)
# df_train_columns_new = df_train_columns_new[:,mask]
# df_train_columns_new = np.squeeze(df_train_columns_new)

## Combine variance threshold with select from model technique

In [26]:
# from timeit import default_timer
# from sklearn.ensemble import ExtraTreesRegressor
# from sklearn.feature_selection import SelectFromModel
# from sklearn.feature_selection import VarianceThreshold

# start = default_timer()

# selector = VarianceThreshold(threshold=(.5))
# selector.fit(x_train)

# x_train_rem_var = selector.transform(x_train)
# x_test_rem_var = selector.transform(x_test)

# print('total columns after variance threshold {}'.format(np.shape(x_train_rem_var)[1]))

# mask = np.array(selector.get_support(),dtype='bool')
# df_train_columns_new = np.copy(df_train_columns)
# df_train_columns_new = np.expand_dims(df_train_columns_new, axis=0)
# df_train_columns_new = df_train_columns_new[:,mask]
# # df_train_columns_new = np.squeeze(df_train_columns_new)


# start = default_timer()


# selector = ExtraTreesRegressor(min_weight_fraction_leaf=0.001,n_jobs=-2,random_state=0,max_depth=50, n_estimators=300)
# selector = selector.fit(x_train_rem_var, y_train)
# # selector.feature_importances_  

# selector = SelectFromModel(selector, prefit=True) 
# x_train_sel_model = selector.transform(x_train_rem_var)
# x_test_sel_model = selector.transform(x_test_rem_var)

# feature_sel_time = default_timer() - start
# print('feature selection time: {}'.format(feature_sel_time))
# print('total columns after select from model {}'.format(np.shape(x_train_sel_model)[1]))

# mask = np.array(selector.get_support(),dtype='bool')
# # df_train_columns_new = np.expand_dims_new(df_train_columns, axis=0)
# df_train_columns_new = df_train_columns_new[:,mask]
# df_train_columns_new = np.squeeze(df_train_columns_new)

In [27]:
def train_model(X_train, X_test, Y_train, folds=5, model_type='lgb',plot_feature_importance=True,df_train_columns=df_train_columns):

    oof = np.zeros(ntrain)
    prediction = np.zeros(ntest)
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train,Y_train)):
        print('Fold', fold_n+1, 'started at', time.ctime())
        x_train, x_valid = X_train[train_index], X_train[valid_index]
        y_train, y_valid = Y_train[train_index], Y_train[valid_index]
        
        if model_type == 'adaboost':
            model = AdaBoostRegressor(random_state=0, n_estimators=100)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test) 
        
        if model_type == 'mlp':
            model = MLPRegressor(hidden_layer_sizes=(700,300,200,10), activation='relu', solver='adam',
                                 alpha=1e-4, batch_size='auto', learning_rate='constant', 
                                 learning_rate_init=1e-5, power_t=0.5, max_iter=200, shuffle=True,
                                 random_state=None, tol=0.0001, verbose=False, warm_start=False, 
                                 momentum=0.9, nesterovs_momentum=True, early_stopping=False,
                                 validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                                 n_iter_no_change=10, max_fun=15000)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test) 
        
        if model_type == 'svm':
            model = svm.SVR(kernel='rbf')
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test) 
        
        if model_type == 'lasso':
            model = Lasso(alpha=0.1)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test) 
            
        if model_type == 'KNN':
            model = KNeighborsRegressor(n_neighbors = 150)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test) 
        
        if model_type == 'ridge':
            model = Ridge(alpha=.5)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test)    
        
        if model_type == 'linear':
            model = LinearRegression()
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test) 
             
        if model_type == 'rf': #basic
            model = RandomForestRegressor(min_weight_fraction_leaf=0.01,n_jobs=-2,random_state=0,max_depth=15, n_estimators=100)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test)               
        
        if model_type == 'extra_trees': #good
            model = ExtraTreesRegressor(min_weight_fraction_leaf=0.01,n_jobs=-2,random_state=0,max_depth=20, n_estimators=100)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test)  
        
        if model_type == 'dec_trees': ##sucks
            model = DecisionTreeRegressor(random_state=0,max_depth=20)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test)  
        
        if model_type == 'lgb':
            lgb_params = {   
                         'num_leaves': 100,
                         'extra_trees': True,
                         'min_data_in_leaf': 10,
                         'min_sum_hessian_in_leaf': 11,
                         'objective': 'regression',
                         'max_depth': -1,
                         'learning_rate': 0.001,
                         'boosting': "gbdt",
                         'feature_fraction': 0.8,
                         'feature_fraction_seed': 9,
                         'max_bin ': 200,
                         "bagging_freq": 5,
                         "bagging_fraction": 0.8,
                         "bagging_seed": 9,
                         'metric': 'rmse',
                         'lambda_l1': 0.1,
                         'verbosity': -1,
                         'min_child_weight': 5.34,
                         'reg_alpha': 1.130,
                         'reg_lambda': 0.360,
                         'subsample': 0.8,
                         }
            
            
            model = lgb.LGBMRegressor(**lgb_params, n_estimators = 20000, n_jobs = -1)
            model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], eval_metric='rmse',verbose=10000, early_stopping_rounds=100)
            
            y_pred_valid = model.predict(x_valid)
            y_pred_valid = np.clip(y_pred_valid, a_min=0, a_max=1)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            y_pred = np.clip(y_pred, a_min=0, a_max=1)
            
            # feature importance
            fold_importance = pd.DataFrame()
            
            fold_importance["feature"] = train[df_train_columns].columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)            

        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_squared_error(y_valid, y_pred_valid) ** 0.5)
        prediction += y_pred          
        
    if (model_type == 'lgb' and plot_feature_importance==True):

        cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
            by="importance", ascending=False)[:50].index

        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12));
        sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
        plt.title('LGB Features (avg over folds)')

    prediction /= NFOLDS        
    print('CV mean score: {0:.5f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    

    
    return oof, prediction

In [28]:
# for trials that require to take the best columns back 
def train_lgb_cv(X_train, X_test, Y_train, folds=5, model_type='lgb'):

    oof = np.zeros(ntrain)
    prediction = np.zeros(ntest)
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train,Y_train)):
        print('Fold', fold_n+1, 'started at', time.ctime())
        x_train, x_valid = X_train[train_index], X_train[valid_index]
        y_train, y_valid = Y_train[train_index], Y_train[valid_index]
        
        lgb_params = {   
                     'num_leaves': 100,
                     'extra_trees': True,
                     'min_data_in_leaf': 10,
                     'min_sum_hessian_in_leaf': 11,
                     'objective': 'regression',
                     'max_depth': -1,
                     'learning_rate': 0.001,
                     'boosting': "gbdt",
                     'feature_fraction': 0.8,
                     'feature_fraction_seed': 9,
                     'max_bin ': 200,
                     "bagging_freq": 5,
                     "bagging_fraction": 0.8,
                     "bagging_seed": 9,
                     'metric': 'rmse',
                     'lambda_l1': 0.1,
                     'verbosity': -1,
                     'min_child_weight': 5.34,
                     'reg_alpha': 1.130,
                     'reg_lambda': 0.360,
                     'subsample': 0.8,
                     }


        model = lgb.LGBMRegressor(**lgb_params, n_estimators = 20000, n_jobs = -1)
        model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], eval_metric='rmse',verbose=10000, early_stopping_rounds=100)

        y_pred_valid = model.predict(x_valid)
        y_pred_valid = np.clip(y_pred_valid, a_min=0, a_max=1)
        y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
        y_pred = np.clip(y_pred, a_min=0, a_max=1)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_squared_error(y_valid, y_pred_valid) ** 0.5)
        prediction += y_pred          
        
    prediction /= NFOLDS        
    print('CV mean score: {0:.5f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    

    
    return oof, prediction,

general run

In [29]:
# oof, prediction ,best_features= train_model(X_train=x_train, X_test=x_test, Y_train=y_train, 
#                               folds=folds, model_type='lgb', plot_feature_importance=True, df_train_columns=df_train_columns)

# selected_features = best_features.feature.values
# x_train_new = train[selected_features].values 
# x_test_new = test[selected_features].values

In [30]:
# oof, prediction = train_model(X_train=x_train_selected, X_test=x_test_selected, Y_train=y_train, 
#                               folds=folds, model_type='lgb', plot_feature_importance=True,df_train_columns=df_train_columns_new)

lgb alone

In [31]:
oof, prediction = train_lgb_cv(X_train=x_train_sel_model, X_test=x_test_sel_model, Y_train=y_train
                              ,folds=folds)

Fold 1 started at Mon May 18 20:20:33 2020
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6510]	training's rmse: 0.0322634	valid_1's rmse: 0.164398
Fold 2 started at Mon May 18 20:21:31 2020
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3789]	training's rmse: 0.057435	valid_1's rmse: 0.159054
Fold 3 started at Mon May 18 20:22:05 2020
Training until validation scores don't improve for 100 rounds
[10000]	training's rmse: 0.0214917	valid_1's rmse: 0.162511
[20000]	training's rmse: 0.00905078	valid_1's rmse: 0.159556
Did not meet early stopping. Best iteration is:
[20000]	training's rmse: 0.00905078	valid_1's rmse: 0.159556
Fold 4 started at Mon May 18 20:24:23 2020
Training until validation scores don't improve for 100 rounds
[10000]	training's rmse: 0.0174303	valid_1's rmse: 0.190683
Early stopping, best iteration is:
[10270]	training's rmse: 0.016822	valid_1's rmse: 0.190624
Fold 5 st

## Create Submission File

In [32]:
sample_submission = pd.read_csv('../input/duth-dbirlab2-1/sample_submission.csv')
sub_df = pd.DataFrame({"obs_id":sample_submission["obs_id"].values})
sub_df["Overall Probability"] = prediction
sub_df["Overall Probability"] = sub_df["Overall Probability"].apply(lambda x: 1 if x>1 else 0 if x<0 else x)
sub_df.to_csv("submission.csv", index=False)