In [1]:
# import pandas
import pandas as pd
# import numpy
import numpy as np
# import seaborn
import seaborn as sns
# import matplotlib
from matplotlib import pyplot as plt
# import train_test_split to split the dataset into training and testing set
from sklearn.model_selection import train_test_split
# import StandardScaler
from sklearn.preprocessing import StandardScaler
# import accuracy_score, confusion_matrix, classification_report & f1_score to perform cross validation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# import cross_val_score, RandomizedSearchCV, KFold & StratifiedKFold for measuring performance
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold, StratifiedKFold
# import RandomForestClassifier, ExtraTreesClassifier to create and select  Classifier model
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# import XGBClassifier for Gradient boost classifier
from xgboost import XGBClassifier
# import LigtGBM Classifier
from lightgbm import LGBMClassifier
#import warnings
#warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv('Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [10]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [11]:
df.shape

(10000, 14)

In [12]:
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [13]:
df.drop('stab', axis = 1, inplace = True)
df

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,stable
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,unstable


In [14]:
# create feature variables
X = df.drop('stabf', axis = 1)
# create target variable
y = df['stabf']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

In [16]:
# count unique values of stabf in y_train
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [49]:
# normalise train set to a common scale using the  standard scaler
normalized_X_train = pd.DataFrame(StandardScaler().fit_transform(X_train), columns = X_train.columns)


# normalise X_test to a common scale using the  standard scaler
normalized_X_test = pd.DataFrame(StandardScaler().fit_transform(X_test), columns = X_test.columns)

In [50]:
# function to train model
def train_model(model, X = normalized_X_train, y = y_train):
    return model.fit(X, y)    

In [26]:
# function for classification report and accuracy
def model_accuracy(model, X_train = normalized_X_train, y_train = y_train, X_test = normalized_X_test):
    model.fit(X_train, y_train)                                 
    y_pred = model.predict(X_test)                              
    print(f'Classification report for {model} is:')
    print(classification_report(y_test, y_pred, digits = 5))    
    print('')
    accuracy = round(accuracy_score(y_test, y_pred), 4)         
    print(f'Accuracy is {accuracy}')                            
    return

In [27]:
# random forest classifier
rfc = RandomForestClassifier(random_state=1)

In [28]:
model_accuracy(rfc)

Classification report for RandomForestClassifier(random_state=1) is:
              precision    recall  f1-score   support

      stable    0.91765   0.87640   0.89655       712
    unstable    0.93333   0.95652   0.94479      1288

    accuracy                        0.92800      2000
   macro avg    0.92549   0.91646   0.92067      2000
weighted avg    0.92775   0.92800   0.92761      2000


Accuracy is 0.928


In [54]:
lgb = LGBMClassifier(random_state=1)
#  extra trees classifier

In [55]:
model_accuracy(lgb)

Classification report for LGBMClassifier(random_state=1) is:
              precision    recall  f1-score   support

      stable    0.92576   0.89326   0.90922       712
    unstable    0.94212   0.96040   0.95117      1288

    accuracy                        0.93650      2000
   macro avg    0.93394   0.92683   0.93020      2000
weighted avg    0.93630   0.93650   0.93624      2000


Accuracy is 0.9365


In [56]:
# light gradient boosting model
xgb = XGBClassifier(max_depth=3, learning_rate=0.1, random_state=1)
model_accuracy(xgb)



Classification report for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None) is:
              precision    recall  f1-score   support

      stable    0.92438   0.84129   0.88088       712
    unstable    0.91642   0.96196   0.93864      1288

    accuracy                        0.91900      2000
   macro avg    0.92040   0.90162   0.90976      2000
weighted avg    0.91925   0.91900   0.91808      2000


Accuracy is 

In [44]:
#  extra trees classifier
extc = ExtraTreesClassifier(random_state=1)
model_accuracy(extc)

Classification report for ExtraTreesClassifier(random_state=1) is:
              precision    recall  f1-score   support

      stable    0.94063   0.84551   0.89053       712
    unstable    0.91912   0.97050   0.94411      1288

    accuracy                        0.92600      2000
   macro avg    0.92987   0.90800   0.91732      2000
weighted avg    0.92677   0.92600   0.92504      2000


Accuracy is 0.926


In [45]:
# define the confusion matrix
TP, FP, FN, TN = 255, 1380, 45, 20
pricission = TP/(TP + FP)
accuracy = TP/(TP + FN)
F1_score = round((2 * pricission * accuracy)/(pricission + accuracy), 4)
f'The F1 score of the classifier is {F1_score}'

'The F1 score of the classifier is 0.2636'

In [47]:
# hyperparameter
# the number of trees in the forest/number of boosting rounds
n_estimators = [50, 100, 300, 500, 1000]
# the minimum number of samples required to split an internal node
min_samples_split = [2, 3, 5, 7, 9]
# the minimum number of samples required to be at a leaf node
min_samples_leaf = [1, 2, 4, 6, 8]
# the number of features to consider when looking for the best split
max_features = ['auto', 'sqrt', 'log2', None] 
# grid hyperparameter
hyperparameter = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [51]:
radomized_cv = RandomizedSearchCV(extc, hyperparameter,cv = 5, n_iter = 10, 
                                  scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)
search_param = train_model(radomized_cv)
search_param.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [53]:
hypertuned_extc = ExtraTreesClassifier(**search_param.best_params_, random_state = 1)
hypertuned_extc.fit(normalized_X_train, y_train)
hypertund_y_pred = hypertuned_extc.predict(normalized_X_test)
accuracy = round(accuracy_score(y_test, hypertund_y_pred), 4)
print(f'Accuracy is {accuracy}')

Accuracy is 0.9285
