In [1]:
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from eli5.sklearn import PermutationImportance
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor,Pool
import matplotlib.patches as patch
from scipy.stats import kurtosis
import matplotlib.pyplot as plt
from sklearn.svm import NuSVR
from scipy.stats import skew
from scipy.stats import norm
from scipy import linalg
from sklearn import tree
from sklearn import svm
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import graphviz
import warnings
import random
import eli5
import shap 
import time
import glob
import sys
import os

Using TensorFlow backend.


In [2]:
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

In [3]:
%matplotlib inline
%precision 4
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
pd.set_option("display.precision", 15)

In [4]:
train = pd.read_csv("uncorrelated.csv")
y= pd.read_csv("y.csv")

## Scaling and partition

In [5]:
scaler = StandardScaler()
scaler.fit(train)
X_scaled = scaler.transform(train)

In [6]:
X_scaled = pd.DataFrame(X_scaled)

In [42]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, shuffle=True)

# Model- SVM

In [8]:
svm = NuSVR()
svm_fit = svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_val)

In [9]:
score = mean_absolute_error(y_val, y_pred_svm)
print(f'Score: {score:0.3f}')

Score: 2.092


# Model- RF

In [10]:
rf = RandomForestRegressor(max_depth=8, random_state=0,n_estimators=500)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
Yhat_tr=rf.predict(X_train)
Yhat_te=rf.predict(X_val)

In [12]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(Yhat_tr, y_train)

1.0387380197391751

In [13]:
mean_absolute_error(Yhat_te, y_val)

1.2672952497565928

# Catboost

In [14]:
#train_pool = Pool(X_train,y_train)
cat_model = CatBoostRegressor(
                               iterations=2000,# change 25 to 3000 to get best performance 
                               learning_rate=0.03,
                               eval_metric='MAE',
                              )
cat_model.fit(X_train, y_train,silent=True)
y_pred_cat = cat_model.predict(X_val)

In [15]:
score = mean_absolute_error(y_val, y_pred_cat)
print(f'Score: {score:0.3f}')

Score: 0.871


# LGBM

In [16]:
d_train = lgb.Dataset(X_train, label= y_train)

In [17]:
params = {'objective' : "regression", 
               'boosting':"gbdt",
               'metric':"mae",
               'boost_from_average':"false",
               'num_threads':8,
               'learning_rate' : 0.001,
               'num_leaves' : 52,
               'max_depth':-1,
               'tree_learner' : "serial",
               'feature_fraction' : 0.85,
               'bagging_freq' : 1,
               'bagging_fraction' : 0.85,
               'min_data_in_leaf' : 10,
               'min_sum_hessian_in_leaf' : 10.0,
               'verbosity' : -1}

In [18]:
clf= lgb.train(params, d_train, 2000)

In [19]:
y_pred = clf.predict(X_val)

In [20]:
score = mean_absolute_error(y_val, y_pred)
print(f'Score: {score:0.3f}')

Score: 1.323


# XGBOOST

In [21]:
data_dmatrix = xgb.DMatrix(data=X_scaled,label=y)

In [22]:
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

In [23]:
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=5,
                    num_boost_round=500,early_stopping_rounds=10,metrics="mae", as_pandas=True, seed=123)



In [24]:
cv_results.tail()

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
356,0.2199808,0.002227207704728,0.8941406,0.030106649967075
357,0.2195726,0.002204011215942,0.8941142,0.030057308817657
358,0.2191318,0.002203423917452,0.8941078,0.030043193594556
359,0.218671,0.002283505287929,0.8939428,0.03001420888446
360,0.2182764,0.002264973871814,0.8938092,0.030048841544392


In [25]:
print((cv_results["test-mae-mean"]).tail(1))

360    0.8938092
Name: test-mae-mean, dtype: float64


# Neural Network

In [26]:
NN1=MLPClassifier(hidden_layer_sizes=(4),activation='relu',solver='sgd',alpha=0.0000,
                  batch_size=15,learning_rate='adaptive',
                  learning_rate_init=0.0003,max_iter=25,verbose=True)

In [41]:
NN1.fit(X_train, y_train)

ValueError: Unknown label type: (array([ 2.0725,  3.2289,  1.3524, ...,  1.5876, 10.3171,  6.673 ]),)

# PCA

In [27]:
pca = PCA(n_components=0.9)
train_pca = pca.fit_transform(X_train)
test_pca = pca.transform(X_val)

In [28]:
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.4246 0.0747 0.0651 0.0511 0.0357 0.0341 0.0277 0.0219 0.0173 0.0152
 0.0144 0.0127 0.0096 0.0088 0.0081 0.0077 0.0063 0.0061 0.0058 0.0055
 0.005  0.0045 0.0043 0.0041 0.0038 0.0035 0.0033 0.0031 0.0031 0.003
 0.0029 0.0029 0.0028 0.0027]


In [29]:
train_pca=pd.DataFrame(train_pca)
test_pca=pd.DataFrame(test_pca)

# PCA- XGBOOST

In [30]:
pca_dmatrix = xgb.DMatrix(data=train_pca,label=y_train)

In [31]:
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

In [32]:
pca_cv_results = xgb.cv(dtrain=pca_dmatrix, params=params, nfold=5,
                    num_boost_round=500,early_stopping_rounds=10,metrics="mae", as_pandas=True, seed=123)



In [33]:
pca_cv_results.tail()

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
44,1.7787424,0.013372433564613,2.2294094,0.05550429654216
45,1.7712142,0.013009417564211,2.228397,0.055432932200994
46,1.7624768,0.01434230788123,2.2293342,0.054923347656165
47,1.7543718,0.014565775522093,2.2283788,0.05450839633451
48,1.7464498,0.015568272998634,2.2274438,0.05459226571008


# PCA-Catboost

In [34]:
cat_model.fit(train_pca, y_train,silent=True)
y_pred_cat = cat_model.predict(test_pca)

In [35]:
score = mean_absolute_error(y_val, y_pred_cat)
print(f'Score: {score:0.3f}')

Score: 3.076
