# Import Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### Load data

In [2]:
df = pd.read_csv('Data_for_UCI_named.csv' )

In [3]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [5]:
X = df.drop(columns='stabf')
y = df['stabf']

#### Split Data

In [52]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

#### Standardize the Data

In [53]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalised_x_train = scaler.fit_transform(x_train)

In [8]:
normalised_x_train.shape

(8000, 13)

In [9]:
y_train.shape

(8000,)

In [10]:
normalised_x_test = scaler.transform(x_test)

In [11]:
normalised_x_test.shape

(2000, 13)

#### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state =1)
rfc.fit(normalised_x_train, y_train)

RandomForestClassifier(random_state=1)

In [13]:
rfc_pred = rfc.predict(normalised_x_test)

#### Confusion Matrix, Classification Report and Accuracy

In [34]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test, rfc_pred))

[[ 712    0]
 [   1 1287]]


In [35]:
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

      stable       1.00      1.00      1.00       712
    unstable       1.00      1.00      1.00      1288

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [36]:
print(accuracy_score(y_test, rfc_pred))

0.9995


#### XGBoost

In [16]:
import xgboost as xgb

In [17]:
xgb = xgb.XGBClassifier(random_state=1)

In [18]:
xgb.fit(normalised_x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
pred_xgb =  xgb.predict(normalised_x_test)

In [20]:
print(classification_report(y_test, pred_xgb))

              precision    recall  f1-score   support

      stable       1.00      1.00      1.00       712
    unstable       1.00      1.00      1.00      1288

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [37]:
print(accuracy_score(y_test, pred_xgb))

0.9995


#### Extra Tree

In [21]:
from sklearn.ensemble import ExtraTreesClassifier

In [22]:
etc = ExtraTreesClassifier(random_state=1)

In [23]:
etc.fit(normalised_x_train, y_train)

ExtraTreesClassifier(random_state=1)

In [65]:
#feature importance
importance = etc.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.05934
Feature: 1, Score: 0.05799
Feature: 2, Score: 0.05481
Feature: 3, Score: 0.05871
Feature: 4, Score: 0.01688
Feature: 5, Score: 0.01697
Feature: 6, Score: 0.01717
Feature: 7, Score: 0.01680
Feature: 8, Score: 0.04099
Feature: 9, Score: 0.04513
Feature: 10, Score: 0.04559
Feature: 11, Score: 0.04369
Feature: 12, Score: 0.52594


In [24]:
pred_etc = etc.predict(normalised_x_test)

In [25]:
print(classification_report(y_test, pred_etc))

              precision    recall  f1-score   support

      stable       0.99      0.98      0.98       712
    unstable       0.99      1.00      0.99      1288

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [62]:
print(accuracy_score(y_test, pred_etc))

0.9885


#### Hyperparameters

In [26]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,'min_samples_leaf':min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features':max_features}

In [55]:
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(estimator = etc, cv=5, n_iter=10, param_distributions=hyperparameter_grid, scoring='accuracy', n_jobs=-1, verbose =1, random_state=1)

In [56]:
random_search = random_search.fit(normalised_x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   56.8s finished


In [57]:
random_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [30]:
print("\n The best score across ALL searched params:\n",
          random_search.best_score_)


 The best score across ALL searched params:
 1.0


In [58]:
best_etc = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, min_samples_leaf=8, max_features=None)
best_etc.fit(normalised_x_train, y_train)

ExtraTreesClassifier(max_features=None, min_samples_leaf=8, n_estimators=1000)

In [59]:
best_etc_pred = best_etc.predict(normalised_x_test)

In [60]:
print(classification_report(y_test, best_etc_pred))

              precision    recall  f1-score   support

      stable       1.00      1.00      1.00       712
    unstable       1.00      1.00      1.00      1288

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [61]:
print(accuracy_score(y_test, best_etc_pred))

0.9995
