In [55]:
# importing initial necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


%matplotlib inline
df = pd.read_csv(r'C:\Users\Newton\Documents\Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [56]:
X = df.drop(columns = ['stab','stabf']) # set of features
X.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [57]:
y = df['stabf'] # setting the target variable
y.head()

0    unstable
1      stable
2    unstable
3    unstable
4    unstable
Name: stabf, dtype: object

In [58]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)
y_train.value_counts() # checking the distribution on y_train

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [59]:
from sklearn.preprocessing import StandardScaler # using standard scaler to transform the training set and test set
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_train_scaled

array([[ 0.36732671, -0.98604156,  0.65044706, ...,  0.33985949,
         0.58556788,  0.49223946],
       [-0.06465869,  0.08943734,  1.03507899, ..., -1.5584875 ,
         1.42964862, -1.44352101],
       [-1.46785   ,  1.29841758, -0.50253617, ...,  1.45153362,
        -1.04574277,  0.49248925],
       ...,
       [ 0.65760851, -0.72275633, -1.4058879 , ...,  0.29310048,
        -1.55058661,  0.81034412],
       [-0.05931596, -1.26053241, -1.01047147, ..., -0.38825455,
        -0.72678059,  1.66791568],
       [-1.47321368,  0.63843757,  0.25012249, ..., -1.17410957,
         1.179282  ,  0.78362657]])

In [60]:
from sklearn.ensemble import RandomForestClassifier # using scikit learn to train a random forest classifier
rclf=RandomForestClassifier(random_state=1)
rclf.fit(x_train_scaled,y_train)
rclf_pred=rclf.predict(x_test_scaled)
rclf_pred

array(['unstable', 'unstable', 'stable', ..., 'stable', 'stable',
       'unstable'], dtype=object)

In [61]:
from sklearn.ensemble import ExtraTreesClassifier # using scikit learn to train an extra trees classifier
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}
xtree = ExtraTreesClassifier(random_state = 1)
rndsmodel = RandomizedSearchCV(xtree, hyperparameter_grid, random_state=1, verbose = True)
rndsmodel.fit(x_train_scaled,y_train)
xtree_pred = rndsmodel.predict(x_test_scaled)
xtree_pred

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.8min finished


array(['unstable', 'unstable', 'stable', ..., 'stable', 'unstable',
       'unstable'], dtype=object)

In [62]:
xgb = XGBClassifier(max_depth = 3, learning_rate = 0.1, random_state = 1) # using xgboost to train
xgb.fit(x_train_scaled,y_train)
xgb_pred = xgb.predict(x_test_scaled)
xgb_pred

array(['unstable', 'unstable', 'stable', ..., 'stable', 'unstable',
       'unstable'], dtype=object)

In [63]:
lgbm = LGBMClassifier(random_state = 1) # using lightgbm to train
lgbm.fit(x_train_scaled,y_train)
lgbm_pred = lgbm.predict(x_test_scaled)
lgbm_pred

array(['unstable', 'unstable', 'stable', ..., 'stable', 'unstable',
       'unstable'], dtype=object)

In [64]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix # from random forest
accuracy = accuracy_score(y_true=y_test, y_pred=rclf_pred)
accuracy

0.929

In [65]:
accuracy = accuracy_score(y_true=y_test, y_pred=xgb_pred) # from xgboost
accuracy

0.9195

In [66]:
accuracy = accuracy_score(y_true=y_test, y_pred=lgbm_pred) # from LGBM
accuracy

0.9375

In [67]:
print(rndsmodel.best_params_)

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}
