# Dataset Description
Predictive features:
'tau1' to 'tau4': the reaction time of each network participant, a real value within the range 0.5 to 10 ('tau1' corresponds to the supplier node, 'tau2' to 'tau4' to the consumer nodes);
'p1' to 'p4': nominal power produced (positive) or consumed (negative) by each network participant, a real value within the range -2.0 to -0.5 for consumers ('p2' to 'p4'). As the total power consumed equals the total power generated, p1 (supplier node) = - (p2 + p3 + p4);
'g1' to 'g4': price elasticity coefficient for each network participant, a real value within the range 0.05 to 1.00 ('g1' corresponds to the supplier node, 'g2' to 'g4' to the consumer nodes; 'g' stands for 'gamma');
Dependent variables:

'stab': the largest real part among the roots of the characteristic equation of the system's differential equation (if positive, the system is linearly unstable; if negative, it is linearly stable);
'stabf': a categorical (binary) label ('stable' or 'unstable').

In [1]:
#importing necessary modules
import pandas as pd
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix,classification_report 


In [2]:
# Read the grid-stability CSV straight from the UCI repository into a DataFrame
dataset = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv'
)
dataset

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,0.049860,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,stable
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,unstable


In [3]:
# Count the missing values in every column
dataset.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [4]:
# Drop 'stab': per the dataset description its sign decides the 'stabf' label
# (positive -> unstable, negative -> stable), so it must not be used as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
dataset = dataset.drop(columns=['stab'], errors='ignore')

In [5]:
# Turn the text labels in 'stabf' into integers.
# In the output shown below 'stable' is encoded as 0 and 'unstable' as 1.
bi = LabelEncoder().fit(dataset['stabf'])
dataset['stabf'] = bi.transform(dataset['stabf'])
dataset


Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,1
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,0
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,1
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,1
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,1
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,0
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,0
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,1


In [6]:
# Separate the target column from the features, then hold out 20% of the rows for testing
y = dataset['stabf']
x = dataset.drop(columns=['stabf'])
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [7]:
# Keep the feature names so they can be re-attached after scaling
x_train_col, x_test_col = x_train.columns, x_test.columns

In [8]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so no test-set information reaches the scaler
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
#putting scaled data in a dataframe and retaining column names
# Also reuse the row labels of y_train / y_test: without index= the new frames would
# get a fresh 0..n-1 index that no longer lines up with the targets, which silently
# breaks any later index-based join or comparison between features and labels.
x_train = pd.DataFrame(x_train, columns=x_train_col, index=y_train.index)
x_test = pd.DataFrame(x_test, columns=x_test_col, index=y_test.index)

In [10]:
# Fit a random forest on the scaled training split and predict the held-out rows
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=1).fit(x_train, y_train)
rf_pred = rf.predict(x_test)

In [11]:
# Random forest: share of test rows whose predicted label equals the true label
rf_accuracy = accuracy_score(y_test, rf_pred)
print('Accuracy: {}'.format(rf_accuracy))

Accuracy: 0.928


In [12]:
# Fit an extra-trees classifier on the scaled training split and predict the held-out rows
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=1).fit(x_train, y_train)
etc_pred = etc.predict(x_test)

In [13]:
# Extra trees: share of test rows whose predicted label equals the true label
etc_accuracy = accuracy_score(y_test, etc_pred)
print('Accuracy: {}'.format(etc_accuracy))

Accuracy: 0.926


In [14]:
# Fit an XGBoost gradient-boosting classifier and predict the held-out rows
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=1).fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)





In [15]:
# XGBoost: share of test rows whose predicted label equals the true label
xgb_accuracy = accuracy_score(y_test, xgb_pred)
print('Accuracy: {}'.format(xgb_accuracy))

Accuracy: 0.946


In [16]:
# Fit a LightGBM gradient-boosting classifier and predict the held-out rows
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(random_state=1).fit(x_train, y_train)
lgb_pred = lgb.predict(x_test)

In [17]:
# LightGBM: share of test rows whose predicted label equals the true label
lgb_accuracy = accuracy_score(y_test, lgb_pred)
print('Accuracy: {}'.format(lgb_accuracy))

Accuracy: 0.9365


In [18]:
#getting best hyperparameters from randomized search CV
#set up randomsearch with 5folds
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each extra-trees hyperparameter
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# The first slot used to be 'auto', which for classifiers was an alias of 'sqrt'; it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every candidate that
# draws it fail. It is replaced by 'sqrt' rather than dropped so the grid keeps its size
# and the candidates sampled with random_state=1 should stay the same as before
# (verify if exact reproduction of the earlier run matters).
max_features = ['sqrt', 'sqrt', 'log2', None]
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

# Evaluate 10 randomly drawn combinations with 5-fold cross-validation on accuracy,
# using all available cores; `etc` is the extra-trees classifier whose settings are searched
randomcv = RandomizedSearchCV(estimator=etc,
                              param_distributions=hyperparameter_grid,
                              cv=5,
                              n_iter=10,
                              scoring='accuracy',
                              n_jobs=-1,
                              verbose=1,
                              random_state=1)

In [19]:
# Run the randomized search on the training split, then show the winning combination
best = randomcv.fit(X=x_train, y=y_train)
best.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [20]:
#getting accuracy of the new etc model using best params
# Take the hyperparameters directly from the finished search instead of hand-copying
# them from its printed output, so this cell cannot drift out of sync with the search.
etc2 = ExtraTreesClassifier(random_state=1, **best.best_params_)
etc2.fit(x_train, y_train)
etc2_pred = etc2.predict(x_test)

# Share of test rows whose predicted label equals the true label
etc2_accuracy = accuracy_score(y_true=y_test, y_pred=etc2_pred)
print('Accuracy: {}'.format(etc2_accuracy))

Accuracy: 0.9285


In [21]:
# Tabulate the tuned extra-trees feature importances, ordered from smallest to largest
feature = x_train.columns
feature_imp = pd.DataFrame(data=etc2.feature_importances_, index=feature)
maxmin = feature_imp.sort_values(by=0)
maxmin

Unnamed: 0,0
p1,0.003683
p4,0.004962
p2,0.005337
p3,0.005429
g1,0.102562
g2,0.107578
g4,0.109541
g3,0.113063
tau3,0.13468
tau4,0.135417


In [22]:
# maxmin is a one-column DataFrame (column label 0). Calling idxmax/idxmin on the frame
# returns a Series, which prints as "0    tau2 / dtype: object"; selecting the column
# first yields just the feature name.

# most important feature
print('most important feature: {}'.format(maxmin[0].idxmax()))

# least important feature
print('least important feature: {}'.format(maxmin[0].idxmin()))

most important feature: 0    tau2
dtype: object
least important feature: 0    p1
dtype: object
