<h1><center>Task 1</center></h1>

In [1]:
#Importing libraries

#Data processing
import pandas as pd
import numpy as np
from numpy.random import seed
import re
import datetime as datetime
from datetime import date
import numbers

#Data visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
from matplotlib.patches import Rectangle
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
from mpl_toolkits.mplot3d import Axes3D

#Data modeling
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingRegressor, VotingClassifier
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA, FastICA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.xgbod import XGBOD
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from skfeature.function.similarity_based import fisher_score
from sklearn.metrics import r2_score
from scipy.spatial.distance import mahalanobis
from scipy.stats import chi2
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l1, l2
import tensorflow
import math
from scikeras.wrappers import KerasRegressor
from sklearn.svm import SVR

#Other
import warnings
pio.renderers.default = "notebook"

In [2]:
#Suppressing warnings
warnings.simplefilter(action = "ignore")

In [3]:
#Setting random seed
# `seed` is numpy.random.seed (imported in the first cell), so this seeds
# NumPy's global RNG only; no other library is seeded in this cell.
seed(11)

In [4]:
#Importing data
xtrain_df = pd.read_csv("X_train.csv", index_col = 0)
ytrain_df = pd.read_csv("y_train.csv", index_col = 0)
xtest_df = pd.read_csv("X_test.csv", index_col = 0)

In [5]:
#Set feature columns
feature_columns = xtrain_df.columns.to_list()

In [6]:
#Set params
# Single configuration dict passed to the preprocessing functions below.
# Each stage has a "*_method" selector; the trailing comment on a selector
# lists the values the corresponding function branches on. Keys are prefixed
# by stage: mv_* (missing values), o_* (outliers), fs_* (feature selection).
# Model hyper-parameter grids are added to this dict in later cells.
params = {
    # Passed to train_test_split as random_state.
    "random_state": 11,
    # Missing values
    "missing_values_imputation_method": "knn", #"median","mean","knn","mice"
    "mv_knn_nneighbors": 5,
    "mv_knn_weights": "uniform", #"uniform","distance"
    "mv_mice_max_iter": 5,
    "mv_mice_initial_strategy": 'median', #"mean","median","most_frequent","constant"
    "mv_mice_n_nearest_features": 5,
    # Outliers
    "outlier_removal_method": "svm", #"iqr","if","md","svm","pca"
    # Multiplier applied to the IQR when building the lower/upper fences.
    "o_iqr_threshold":10,
    "o_if_contamination":0.04,
    "o_if_n_estimators":100,
    "o_if_max_samples": 0.9,
    "o_if_max_features": 0.9,
    # Probability handed to chi2.ppf to derive the Mahalanobis cut-off.
    "o_md_chisquarethreshold":0.9,
    "o_svm_kernel": "rbf", #"linear","poly","rbf","sigmoid","precomputed"
    "o_svm_degree": 4,
    "o_svm_nu": 0.025,
    "o_svm_gamma": "scale", #"scale","auto"
    "o_pca_n_components": 10,
    # Percentile of the reconstruction error above which a row is an outlier.
    "o_pca_percentilereconstructionerror": 95,
    # Feature selection
    "feature_selection_method": "lasso", #"mi","variancethreshold","lasso","rf","rfe","pca"
    "fs_mi_threshold": 0.05,
    "fs_vt_threshold": 1.0,
    "fs_lasso_alpha": 0.375,
    "fs_rf_n_estimators": 50,
    "fs_rf_importance_threshold": 0.001,
    "fs_rfe_nfeatures": 300,
    "fs_pca_n_components": 250,
    # Scaling
    "scaling_method": "std", #"maxabs","std","minmax","robust"
    #Models
}

### Columns

In [7]:
def drop_constants(X_train, X_test, feature_columns):
    """Remove columns that hold a single distinct value in the training data.

    Constant columns are detected on ``X_train`` only (``nunique() == 1``,
    which ignores NaN) and the same columns are then dropped from both
    frames so that train and test keep identical feature sets.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame with the same columns as ``X_train``.
        feature_columns: List of feature names to keep in sync.

    Returns:
        Tuple ``(X_train, X_test, feature_columns)`` with the constant
        columns removed. The inputs themselves are not modified; new frames
        and a new list are returned.
    """
    # Use the argument, not a notebook-level global, so the function works
    # on whatever frame the caller passes in.
    constant_mask = X_train.nunique() == 1
    constants = constant_mask[constant_mask].index.to_list()

    X_train = X_train.drop(columns=constants)
    X_test = X_test.drop(columns=constants)

    dropped = set(constants)
    feature_columns = [name for name in feature_columns if name not in dropped]

    return X_train, X_test, feature_columns

In [8]:
xtrain_df, xtest_df, feature_columns = drop_constants(xtrain_df, xtest_df, feature_columns)

### Missing values

In [9]:
def missing_value_imputation(X_train, X_test, params):
    """Fill missing values in the train and test features.

    The strategy is chosen by ``params['missing_values_imputation_method']``:

    * ``'median'`` / ``'mean'``: per-column statistic computed on ``X_train``
      and used to fill gaps in both frames.
    * ``'knn'``: ``KNNImputer`` fitted on ``X_train``
      (``mv_knn_nneighbors``, ``mv_knn_weights``).
    * ``'mice'``: ``IterativeImputer`` fitted on ``X_train``
      (``mv_mice_max_iter``, ``mv_mice_initial_strategy``,
      ``mv_mice_n_nearest_features``).

    Args:
        X_train: Training feature DataFrame (numeric, NaN marks missing).
        X_test: Test feature DataFrame with the same columns.
        params: Configuration dict holding the keys listed above.

    Returns:
        Tuple ``(X_train, X_test)`` of new DataFrames with the original
        index and columns and the gaps filled.

    Raises:
        ValueError: If the configured method is not one of the four above.
    """
    method = params['missing_values_imputation_method']

    if method in ('median', 'mean'):
        train_values = np.asarray(X_train, dtype=float)
        test_values = np.asarray(X_test, dtype=float)
        statistic = np.nanmedian if method == 'median' else np.nanmean
        # The fill value comes from the training data only, so the test set
        # is treated exactly like the train set (as in the knn/mice branches).
        fill_values = statistic(train_values, axis=0)
        X_train_imp = np.where(np.isnan(train_values), fill_values, train_values)
        X_test_imp = np.where(np.isnan(test_values), fill_values, test_values)

    elif method in ('knn', 'mice'):
        if method == 'knn':
            imputer = KNNImputer(
                n_neighbors=params["mv_knn_nneighbors"],
                weights=params["mv_knn_weights"],
            )
        else:
            imputer = IterativeImputer(
                max_iter=params["mv_mice_max_iter"],
                initial_strategy=params["mv_mice_initial_strategy"],
                n_nearest_features=params["mv_mice_n_nearest_features"],
            )
        # The imputer output already carries the observed values unchanged,
        # so it can be used directly.
        X_train_imp = imputer.fit_transform(X_train)
        X_test_imp = imputer.transform(X_test)

    else:
        raise ValueError(
            f"Unknown missing_values_imputation_method {method!r}; "
            "expected 'median', 'mean', 'knn' or 'mice'"
        )

    X_train = pd.DataFrame(X_train_imp, index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(X_test_imp, index = X_test.index, columns = X_test.columns)

    return X_train, X_test

In [10]:
xtrain_df, xtest_df = missing_value_imputation(xtrain_df, xtest_df, params)

### Outliers

In [11]:
def outlier_removal(X_train, y_train, params):
    """Drop training rows flagged as outliers by the configured detector.

    The detector is chosen by ``params['outlier_removal_method']``:

    * ``'iqr'``: a row is an outlier if any feature lies outside
      ``[q1 - k*iqr, q3 + k*iqr]`` with ``k = o_iqr_threshold``.
    * ``'if'``: pyod ``IForest`` (``o_if_*`` settings); rows predicted as 1.
    * ``'md'``: Mahalanobis distance to the feature mean, compared with
      ``sqrt(chi2.ppf(o_md_chisquarethreshold, df=n_features))``.
    * ``'svm'``: ``OneClassSVM`` (``o_svm_*`` settings); rows predicted as -1.
    * ``'pca'``: mean squared PCA reconstruction error above the
      ``o_pca_percentilereconstructionerror`` percentile.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training targets, row-aligned with ``X_train``.
        params: Configuration dict holding the keys listed above.

    Returns:
        Tuple ``(X_train_cleaned, y_train_cleaned, n_outliers)``: copies of
        the inputs without the flagged rows, and the number of rows removed.

    Raises:
        ValueError: If the configured method is not one of the five above.
    """
    method = params['outlier_removal_method']

    # Every branch works on the X_train argument (not a notebook global), so
    # the mask always has one entry per row of the frame being filtered.
    if method == 'iqr':
        q1 = X_train.quantile(0.25)
        q3 = X_train.quantile(0.75)
        fence_width = params["o_iqr_threshold"] * (q3 - q1)
        beyond_fence = (X_train < (q1 - fence_width)) | (X_train > (q3 + fence_width))
        outlier_mask = beyond_fence.any(axis=1)

    elif method == 'if':
        model = IForest(
            contamination=params["o_if_contamination"],
            n_estimators=params["o_if_n_estimators"],
            max_samples=params["o_if_max_samples"],
            max_features=params["o_if_max_features"],
        )
        model.fit(X_train)
        outlier_mask = model.predict(X_train) == 1

    elif method == 'md':
        values = np.asarray(X_train, dtype=float)
        centered = values - np.mean(values, axis=0)
        inv_cov_matrix = np.linalg.inv(np.cov(values, rowvar=False))
        # Row-wise quadratic form diff @ inv_cov @ diff.T, computed for all rows at once.
        m_distances = np.sqrt(np.sum((centered @ inv_cov_matrix) * centered, axis=1))
        threshold = np.sqrt(chi2.ppf(params["o_md_chisquarethreshold"], df=X_train.shape[1]))
        outlier_mask = m_distances > threshold

    elif method == 'svm':
        values = np.asarray(X_train)
        ocsvm = OneClassSVM(
            kernel=params["o_svm_kernel"],
            nu=params["o_svm_nu"],
            gamma=params["o_svm_gamma"],
            degree=params["o_svm_degree"],
        )
        ocsvm.fit(values)
        outlier_mask = ocsvm.predict(values) == -1

    elif method == 'pca':
        values = np.asarray(X_train)
        pca = PCA(n_components=params["o_pca_n_components"])
        reconstructed = pca.inverse_transform(pca.fit_transform(values))
        reconstruction_error = np.mean((values - reconstructed) ** 2, axis=1)
        threshold = np.percentile(reconstruction_error, params["o_pca_percentilereconstructionerror"])
        outlier_mask = reconstruction_error > threshold

    else:
        raise ValueError(
            f"Unknown outlier_removal_method {method!r}; "
            "expected 'iqr', 'if', 'md', 'svm' or 'pca'"
        )

    # Normalise to a positional boolean array so X and y are filtered identically.
    outlier_mask = np.asarray(outlier_mask, dtype=bool)
    keep = ~outlier_mask

    X_train_cleaned = X_train.loc[keep].copy()
    y_train_cleaned = y_train.loc[keep].copy()

    n_outliers = int(outlier_mask.sum())

    return X_train_cleaned, y_train_cleaned, n_outliers

In [12]:
xtrain_df, ytrain_df, n_outliers = outlier_removal(xtrain_df, ytrain_df, params)

In [13]:
# NOTE(review): this call is identical to the one in the previous cell, so the
# detector runs a second time on the already-filtered data and n_outliers is
# overwritten with the second pass's count. Confirm a second pass is intended.
xtrain_df, ytrain_df, n_outliers = outlier_removal(xtrain_df, ytrain_df, params)

### Deskewing

In [14]:
def deskew(X_train, X_test):
    """Apply a per-column Yeo-Johnson power transform to reduce skew.

    For each column of ``X_train`` a ``PowerTransformer`` is fitted on the
    training values and then applied to the matching test column. A column
    whose transform raises is left untouched in both frames and a warning is
    issued, so one failing feature does not abort the whole step.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame with the same columns.

    Returns:
        Tuple ``(X_train, X_test)`` of transformed copies; the frames passed
        in are not modified.
    """
    # Work on copies so the caller's frames (possibly slices of other frames)
    # are never written to.
    X_train = X_train.copy()
    X_test = X_test.copy()

    for col in X_train.columns:
        try:
            pt = PowerTransformer(method="yeo-johnson")
            deskewed_col_train = pt.fit_transform(np.array(X_train[col]).reshape(-1, 1))
            deskewed_col_test = pt.transform(np.array(X_test[col]).reshape(-1, 1))
        except Exception as exc:
            # Best effort: keep the column as-is, but do not swallow
            # KeyboardInterrupt/SystemExit and do not hide the reason.
            warnings.warn(f"deskew: column {col!r} left untransformed ({exc!r})")
            continue
        # Assign only after both transforms succeeded so train and test stay consistent.
        X_train[col] = deskewed_col_train
        X_test[col] = deskewed_col_test

    return X_train, X_test

In [15]:
xtrain_df, xtest_df = deskew(xtrain_df, xtest_df)

### Feature Selection

In [16]:
def feature_selection(X_train, y_train, X_test, params):
    """Reduce the feature set with the method named in ``params``.

    ``params['feature_selection_method']`` selects one of:

    * ``'mi'``: keep features whose mutual information with the target
      exceeds ``fs_mi_threshold`` (returned in descending MI order).
    * ``'variancethreshold'``: ``VarianceThreshold(fs_vt_threshold)``.
    * ``'lasso'``: fit ``Lasso(alpha=fs_lasso_alpha)`` on standardised
      features and keep those with a non-zero coefficient.
    * ``'rf'``: keep features whose random-forest importance exceeds
      ``fs_rf_importance_threshold`` (``fs_rf_n_estimators`` trees).
    * ``'rfe'``: recursive feature elimination with ``LinearRegression``
      down to ``fs_rfe_nfeatures`` features.
    * ``'pca'``: replace the features by ``fs_pca_n_components`` RBF
      ``KernelPCA`` components of the standardised data.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training target (single-column DataFrame, Series or array).
        X_test: Test feature DataFrame with the same columns as ``X_train``.
        params: Configuration dict holding the keys listed above.

    Returns:
        Tuple ``(X_train, X_test, n_features)``. For ``'pca'`` the frames
        hold columns ``PC1..PCk`` and ``n_features`` is
        ``params['fs_pca_n_components']``; otherwise they are the input
        frames restricted to the selected columns.

    Raises:
        ValueError: If the configured method is not one of the six above.
    """
    method = params['feature_selection_method']
    supported = ('mi', 'variancethreshold', 'lasso', 'rf', 'rfe', 'pca')
    if method not in supported:
        raise ValueError(
            f"Unknown feature_selection_method {method!r}; expected one of {supported}"
        )

    # The estimators below expect a 1-D target; flatten a single-column target.
    y = np.asarray(y_train)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    if method == 'pca':
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        pca = KernelPCA(n_components=params["fs_pca_n_components"],kernel = 'rbf')
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_test_pca = pca.transform(X_test_scaled)
        pca_components_train_df = pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(X_train_pca.shape[1])], index = X_train.index)
        pca_components_test_df = pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(X_test_pca.shape[1])], index = X_test.index)
        return pca_components_train_df, pca_components_test_df, params["fs_pca_n_components"]

    if method == 'mi':
        mi = mutual_info_regression(X_train, y)
        mi_df = pd.DataFrame({'Feature': X_train.columns, 'Mutual Information': mi})
        mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
        selected_features = mi_df[mi_df['Mutual Information'] > params["fs_mi_threshold"]]['Feature']

    elif method == 'variancethreshold':
        selector = VarianceThreshold(threshold = params["fs_vt_threshold"])
        selector.fit(X_train)
        selected_features = X_train.columns[selector.get_support()]

    elif method == 'lasso':
        # Only this branch (and 'pca') needs standardised inputs.
        X_train_scaled = StandardScaler().fit_transform(X_train)
        lasso = Lasso(alpha=params["fs_lasso_alpha"])
        lasso.fit(X_train_scaled, y)
        selected_features = X_train.columns[lasso.coef_ != 0]

    elif method == 'rf':
        rf = RandomForestRegressor(n_estimators = params["fs_rf_n_estimators"])
        rf.fit(X_train, y)
        feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
        threshold = params["fs_rf_importance_threshold"]
        selected_features = feature_importance_df[feature_importance_df['Importance'] > threshold]['Feature']

    else:  # method == 'rfe'
        rfe = RFE(estimator=LinearRegression(), n_features_to_select=params["fs_rfe_nfeatures"])
        rfe.fit(X_train, y)
        selected_features = X_train.columns[rfe.support_]

    selected_features = list(selected_features)
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
    n_features = len(selected_features)

    return X_train,X_test,n_features

In [17]:
xtrain_df, xtest_df, n_features = feature_selection(xtrain_df, ytrain_df, xtest_df, params)

### Scaling

In [18]:
def scaling(X_train, X_test, params):
    """Scale train and test features with the scaler named in ``params``.

    ``params['scaling_method']`` selects ``MaxAbsScaler`` (``'maxabs'``),
    ``StandardScaler`` (``'std'``), ``MinMaxScaler`` (``'minmax'``) or
    ``RobustScaler`` (``'robust'``). The scaler is fitted on ``X_train``
    only and then applied to both frames.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame with the same columns.
        params: Configuration dict holding ``'scaling_method'``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)`` of DataFrames that keep the
        original index and column labels.

    Raises:
        ValueError: If the configured method is not one of the four above.
    """
    method = params['scaling_method']

    if method == 'maxabs':
        scaler = MaxAbsScaler()
    elif method == 'std':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'robust':
        scaler = RobustScaler()
    else:
        # Fail with a clear message instead of an UnboundLocalError on `scaler`.
        raise ValueError(
            f"Unknown scaling_method {method!r}; "
            "expected 'maxabs', 'std', 'minmax' or 'robust'"
        )

    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    X_train_scaled =  pd.DataFrame(data = X_train_scaled, columns = X_train.columns, index = X_train.index)
    X_test_scaled =  pd.DataFrame(data = X_test_scaled, columns = X_test.columns, index = X_test.index)

    return X_train_scaled, X_test_scaled

In [19]:
xtrain_df, xtest_df = scaling(xtrain_df, xtest_df, params)

### Regression models

In [20]:
X_train, X_vali, y_train, y_vali = train_test_split(xtrain_df, ytrain_df, test_size = 0.2, random_state = params["random_state"])

#### Linear regression

In [21]:
def linear_regression(X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Fit an ordinary least-squares model and score it on the validation split.

    Args:
        X_rest, y_rest: Accepted for a uniform model-function signature; not used.
        X_train, y_train: Data the model is fitted on.
        X_vali, y_vali: Held-out data used to compute R^2.
        params: Accepted for a uniform signature; not used.

    Returns:
        Tuple ``(model, r2)``: the fitted ``LinearRegression`` and its R^2 on
        ``(X_vali, y_vali)``.
    """
    fitted = LinearRegression()
    fitted.fit(X_train, y_train)

    validation_r2 = r2_score(y_vali, fitted.predict(X_vali))
    return fitted, validation_r2

In [22]:
lir_model, lir_r2 = linear_regression(xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

In [23]:
lir_r2

0.5714146192473868

#### Lasso regression

In [24]:
params["lasso_params"] = {
    "alpha":[0.03,0.035,0.0375,0.04,0.0425,0.045,0.05]
}

In [25]:
def lasso_regression(X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Grid-search a Lasso regressor over ``params["lasso_params"]``.

    The search uses 2-fold cross-validation with R^2 scoring and is run on
    ``(X_rest, y_rest)``.

    Args:
        X_rest, y_rest: Data the grid search is fitted on.
        X_train, y_train, X_vali, y_vali: Accepted for a uniform
            model-function signature; not used.
        params: Configuration dict holding the ``"lasso_params"`` grid.

    Returns:
        Tuple ``(search, best_score, best_params)`` taken from the fitted
        ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=linear_model.Lasso(),
        param_grid=params["lasso_params"],
        scoring='r2',
        cv=2,
    )
    search.fit(X_rest, y_rest)

    return search, search.best_score_, search.best_params_

In [26]:
lar_model, lar_r2, lar_params = lasso_regression(xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

In [27]:
lar_r2

0.44242924895954117

#### XGBoost

In [28]:
# Hyper-parameter grid searched by xgboost(); each key maps to the list of
# candidate values (only "subsample" has more than one candidate here).
# NOTE(review): XGBoost's parameter reference lists `eta` as an alias of
# `learning_rate`, and `lambda`/`alpha` as the native names of
# `reg_lambda`/`reg_alpha`. Both `learning_rate` (0.03) and `eta` (0) are set
# below — confirm which one the estimator actually uses.
params["xgboost_params"] = {
    "n_estimators":[2200],
    "learning_rate": [0.03],
    "eta": [0],
    "gamma": [0],
    "max_depth": [4],
    "subsample": [0.6,0.7,0.8],
    "colsample_bytree": [0.6],
    "lambda": [0.7],
    "alpha": [0.4]
}

In [29]:
def xgboost(X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Grid-search an ``xgb.XGBRegressor`` over ``params["xgboost_params"]``.

    The search uses 2-fold cross-validation with R^2 scoring and is run on
    ``(X_rest, y_rest)``.

    Args:
        X_rest, y_rest: Data the grid search is fitted on.
        X_train, y_train, X_vali, y_vali: Accepted for a uniform
            model-function signature; not used.
        params: Configuration dict holding the ``"xgboost_params"`` grid.

    Returns:
        Tuple ``(search, best_score, best_params)`` taken from the fitted
        ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(),
        param_grid=params["xgboost_params"],
        scoring='r2',
        cv=2,
    )
    search.fit(X_rest, y_rest)

    return search, search.best_score_, search.best_params_

In [30]:
xgb_model, xgb_r2, xgb_params = xgboost(xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

In [31]:
xgb_r2

0.5265296697616577

#### Random forest

In [32]:
params["randomforest_params"] = {
    "n_estimators": [600],
    "criterion": ["squared_error"], #“squared_error”, “absolute_error”, “friedman_mse”, “poisson”
    "max_depth": [11],
    "min_samples_split": [2],
    "max_features": [0.5]
}

In [33]:
def randomforest(X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Grid-search a ``RandomForestRegressor`` over ``params["randomforest_params"]``.

    The search uses 2-fold cross-validation with R^2 scoring and is run on
    ``(X_rest, y_rest)``.

    Args:
        X_rest, y_rest: Data the grid search is fitted on.
        X_train, y_train, X_vali, y_vali: Accepted for a uniform
            model-function signature; not used.
        params: Configuration dict holding the ``"randomforest_params"`` grid.

    Returns:
        Tuple ``(search, best_score, best_params)`` taken from the fitted
        ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=params["randomforest_params"],
        scoring='r2',
        cv=2,
    )
    search.fit(X_rest, y_rest)

    return search, search.best_score_, search.best_params_

In [34]:
rf_model, rf_r2, rf_params = randomforest(xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

In [35]:
rf_r2

0.5002284472066874

#### LightGBM

In [36]:
params["lgbm_params"] = {
    "n_estimators": [400],
    "learning_rate": [0.01],
    "max_depth": [10],
    "subsample": [0.5],
    "colsample_bytree": [0.7],
    "reg_lambda": [0],
    "reg_alpha": [1]
}

In [37]:
def lgbm(X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Grid-search an ``LGBMRegressor`` over ``params["lgbm_params"]``.

    The search uses 2-fold cross-validation with R^2 scoring and is run on
    ``(X_rest, y_rest)``.

    Args:
        X_rest, y_rest: Data the grid search is fitted on.
        X_train, y_train, X_vali, y_vali: Accepted for a uniform
            model-function signature; not used.
        params: Configuration dict holding the ``"lgbm_params"`` grid.

    Returns:
        Tuple ``(search, best_score, best_params)`` taken from the fitted
        ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=LGBMRegressor(),
        param_grid=params["lgbm_params"],
        scoring='r2',
        cv=2,
    )
    search.fit(X_rest, y_rest)

    return search, search.best_score_, search.best_params_

In [38]:
lgbm_model, lgbm_r2, lgbm_params = lgbm(xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000469 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11253
[LightGBM] [Info] Number of data points in the train set: 576, number of used features: 60
[LightGBM] [Info] Start training from score 69.673611






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11275
[LightGBM] [Info] Number of data points in the train set: 577, number of used features: 60
[LightGBM] [Info] Start training from score 70.509532






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15000
[LightGBM] [Info] Number of data points in the train set: 1153, number of used features: 60
[LightGBM] [Info] Start training from score 70.091934


In [39]:
lgbm_r2

0.5229763816739856

#### SVM

In [40]:
# Hyper-parameter grid searched by svm_regression(); each key maps to the
# list of candidate values (a single candidate per key here).
# NOTE(review): scikit-learn documents SVR's `degree` as applying to the
# 'poly' kernel only, so with kernel "rbf" it presumably has no effect — confirm.
params["svm_params"] = {
    "degree": [1],
    "C": [20],
    "epsilon": [2],
    "kernel": ["rbf"]
}

In [41]:
def svm_regression(X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Grid-search an ``SVR`` over ``params["svm_params"]``.

    The search uses 2-fold cross-validation with R^2 scoring and is run on
    ``(X_rest, y_rest)``.

    Args:
        X_rest, y_rest: Data the grid search is fitted on.
        X_train, y_train, X_vali, y_vali: Accepted for a uniform
            model-function signature; not used.
        params: Configuration dict holding the ``"svm_params"`` grid.

    Returns:
        Tuple ``(search, best_score, best_params)`` taken from the fitted
        ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=SVR(),
        param_grid=params["svm_params"],
        scoring='r2',
        cv=2,
    )
    search.fit(X_rest, y_rest)

    return search, search.best_score_, search.best_params_

In [42]:
svm_model, svm_r2, svm_params = svm_regression(xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

In [43]:
svm_r2

0.5079278014860735

#### Neural network

In [44]:
input_dim = xtrain_df.shape[1]

In [45]:
def build_nn_model():
    """Build and compile a small fully connected regression network.

    The network has one hidden ``Dense(32, relu)`` layer and a single-unit
    ``relu`` output, compiled with the ``rmsprop`` optimizer and MSE loss.
    The input width is read from the notebook-level ``input_dim`` variable.

    Returns:
        The compiled Keras ``Sequential`` model (not yet fitted).
    """
    keras_layers = tensorflow.keras.layers
    hidden = keras_layers.Dense(32, activation='relu', input_shape=(input_dim,))
    output = keras_layers.Dense(1, activation='relu')

    network = tensorflow.keras.Sequential([hidden, output])
    network.compile(optimizer='rmsprop', loss='mse')

    return network

In [46]:
def nn(X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Train the network from ``build_nn_model`` and score it on validation data.

    The model is fitted for 2500 epochs on ``(X_train, y_train)`` with
    ``(X_vali, y_vali)`` passed as Keras validation data.

    Args:
        X_rest, y_rest: Accepted for a uniform model-function signature; not used.
        X_train, y_train: Data the network is fitted on.
        X_vali, y_vali: Held-out data used during fitting and for R^2.
        params: Accepted for a uniform signature; not used.

    Returns:
        Tuple ``(model, r2)``: the fitted Keras model and its R^2 on
        ``(X_vali, y_vali)``.
    """
    network = build_nn_model()
    network.fit(
        x=X_train,
        y=y_train,
        validation_data=(X_vali, y_vali),
        epochs=2500,
    )

    validation_r2 = r2_score(y_vali, network.predict(X_vali))
    return network, validation_r2

In [47]:
#nn_model, nn_r2 = nn(xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

In [48]:
#nn_model_sl = KerasRegressor(model=build_nn_model,epochs = 2500)

In [49]:
#nn_r2

#### Voting ensemble

In [50]:
def ensemble(lir_model, lar_model, lgbm_model, xgb_model, rf_model, svm_model, nn_model, X_rest, y_rest, X_train, y_train, X_vali, y_vali, params):
    """Fit a ``VotingRegressor`` over selected base models and score it.

    Only the Lasso, LightGBM, XGBoost and SVM models take part in the vote;
    ``lir_model``, ``rf_model`` and ``nn_model`` are accepted but currently
    left out.

    Args:
        lir_model, lar_model, lgbm_model, xgb_model, rf_model, svm_model,
            nn_model: Base estimators produced by the model cells above.
        X_rest, y_rest: Accepted for a uniform signature; not used.
        X_train, y_train: Data the voting ensemble is fitted on.
        X_vali, y_vali: Held-out data used to compute R^2.
        params: Accepted for a uniform signature; not used.

    Returns:
        Tuple ``(voting, r2)``: the fitted ``VotingRegressor`` and its R^2 on
        ``(X_vali, y_vali)``.
    """
    members = [
        ("lar_model", lar_model),
        ("lgbm_model", lgbm_model),
        ("xgb_model", xgb_model),
        ("svm_model", svm_model),
    ]
    voter = VotingRegressor(estimators=members)
    voter.fit(X_train, y_train)

    validation_r2 = r2_score(y_vali, voter.predict(X_vali))
    return voter, validation_r2

In [51]:
# Placeholder for ensemble()'s nn_model argument: the cell that would build
# the scikeras-wrapped network is commented out above.
nn_model_sl = 0

In [52]:
voting_model, voting_r2 = ensemble(lir_model, lar_model, lgbm_model, xgb_model, rf_model, svm_model, nn_model_sl, xtrain_df, ytrain_df, X_train, y_train, X_vali, y_vali, params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9032
[LightGBM] [Info] Number of data points in the train set: 461, number of used features: 60
[LightGBM] [Info] Start training from score 69.969631




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9028
[LightGBM] [Info] Number of data points in the train set: 461, number of used features: 60
[LightGBM] [Info] Start training from score 70.214751






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14966
[LightGBM] [Info] Number of data points in the train set: 922, number of used features: 60
[LightGBM] [Info] Start training from score 70.092191


In [53]:
voting_r2

0.657879276148283

In [54]:
# Same voting configuration as ensemble(), fitted here on the full training
# data (xtrain_df, ytrain_df) rather than on the X_train/y_train split.
voting = VotingRegressor(estimators = [
        #("lir_model", lir_model),
        ("lar_model", lar_model),
        ("lgbm_model", lgbm_model),
        ("xgb_model", xgb_model),
        #("rf_model", rf_model),
        ("svm_model", svm_model),
        #("nn_model", nn_model)
    ])
    
voting.fit(xtrain_df, ytrain_df)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11253
[LightGBM] [Info] Number of data points in the train set: 576, number of used features: 60
[LightGBM] [Info] Start training from score 69.673611






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11275
[LightGBM] [Info] Number of data points in the train set: 577, number of used features: 60
[LightGBM] [Info] Start training from score 70.509532






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15000
[LightGBM] [Info] Number of data points in the train set: 1153, number of used features: 60
[LightGBM] [Info] Start training from score 70.091934


In [55]:
# Predict with `voting`, the ensemble fitted on the full training data in the
# previous cell. `voting_model` was fitted on the training split only, and
# using it here would leave the full-data fit unused.
y_pred = voting.predict(xtest_df)
# Single-column submission frame named "y", indexed like the test features.
sub = pd.DataFrame(data=y_pred, index=xtest_df.index, columns=["y"])
sub.to_csv("submission.csv")