In [1]:
# Import the necessary libraries and modules
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Define the url ponting to the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv'

In [3]:
# Import the dataset
data_df = pd.read_csv(url)

In [4]:
# View the first 5 columns in the dataset
data_df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [5]:
# View the summary information about the imported dataset
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [6]:
# Checking for null values in the dataset
data_df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

The dataset has no null values

In [7]:
# Viewing the summary statistics of the dataset
data_df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [8]:
list(data_df.columns)

['tau1',
 'tau2',
 'tau3',
 'tau4',
 'p1',
 'p2',
 'p3',
 'p4',
 'g1',
 'g2',
 'g3',
 'g4',
 'stab',
 'stabf']

In [9]:
# Checking the value counts of the target variable
data_df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

There are some imbalance in the target variable

In [10]:
data_df.corr()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
tau1,1.0,0.015586,-0.00597,-0.017265,0.027183,-0.015485,-0.015924,-0.015807,0.010521,0.01535,-0.001279,0.005494,0.275761
tau2,0.015586,1.0,0.014273,-0.001965,-0.004769,0.006573,0.007673,-0.005963,-0.001742,0.015383,0.016508,-0.011764,0.290975
tau3,-0.00597,0.014273,1.0,0.004354,0.016953,-0.003134,-0.00878,-0.017531,-0.011605,0.007671,0.014702,-0.011497,0.2807
tau4,-0.017265,-0.001965,0.004354,1.0,-0.003173,0.010553,0.006169,-0.011211,-0.004149,0.008431,0.00326,-0.000491,0.278576
p1,0.027183,-0.004769,0.016953,-0.003173,1.0,-0.573157,-0.584554,-0.579239,0.000721,0.015405,0.001069,-0.015451,0.010278
p2,-0.015485,0.006573,-0.003134,0.010553,-0.573157,1.0,0.002388,-0.006844,0.015603,-0.018032,0.007555,0.019817,0.006255
p3,-0.015924,0.007673,-0.00878,0.006169,-0.584554,0.002388,1.0,0.012953,-0.003219,-0.011575,-0.005897,-0.010485,-0.003321
p4,-0.015807,-0.005963,-0.017531,-0.011211,-0.579239,-0.006844,0.012953,1.0,-0.013636,0.00285,-0.003515,0.017505,-0.020786
g1,0.010521,-0.001742,-0.011605,-0.004149,0.000721,0.015603,-0.003219,-0.013636,1.0,0.007559,-0.005836,0.012431,0.282774
g2,0.01535,0.015383,0.007671,0.008431,0.015405,-0.018032,-0.011575,0.00285,0.007559,1.0,-0.012809,-0.014909,0.293601


In [11]:
data_df[data_df['stab'] > 0]

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,0.049860,unstable
6,6.710166,3.765204,6.929314,8.818562,2.397419,-0.614590,-1.208826,-0.574004,0.177890,0.397977,0.402046,0.376630,0.005954,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,5.754191,3.032743,5.084803,4.633624,5.199250,-1.717030,-1.713212,-1.769009,0.157284,0.975921,0.511555,0.696591,0.050212,unstable
9994,2.042954,8.514335,8.173809,5.466635,3.783797,-1.639912,-0.662469,-1.481417,0.154129,0.944486,0.053225,0.499109,0.026311,unstable
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,unstable


In [12]:
data_df[data_df['stab'] < 0]

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,-0.005957,stable
5,6.999209,9.109247,3.784066,4.267788,4.429669,-1.857139,-0.670397,-1.902133,0.261793,0.077930,0.542884,0.469931,-0.017385,stable
8,4.689852,4.007747,1.478573,3.733787,4.041300,-1.410344,-1.238204,-1.392751,0.269708,0.250364,0.164941,0.482439,-0.038677,stable
10,5.930110,6.730873,6.245138,0.533288,2.327092,-0.702501,-1.116920,-0.507671,0.239816,0.563110,0.164461,0.753701,-0.028411,stable
12,1.616787,2.939228,0.819791,4.191804,3.752282,-1.484885,-1.280581,-0.986816,0.899698,0.866546,0.303921,0.077610,-0.048617,stable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9980,4.141337,1.112868,8.400888,1.869624,3.671563,-0.699052,-1.513896,-1.458615,0.081811,0.094963,0.662959,0.169682,-0.032192,stable
9983,1.482214,3.121273,5.220920,1.609288,4.394557,-1.924542,-0.881996,-1.588019,0.097309,0.096599,0.383334,0.988509,-0.054541,stable
9984,2.501787,3.087194,7.493896,2.177944,3.817471,-0.958986,-1.044281,-1.814205,0.503049,0.107808,0.534358,0.357840,-0.036061,stable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,stable


In [13]:
# Dropping stab column
data_df = data_df.drop (columns=['stab'])

In [14]:
data_df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [15]:
# Separating the dataset into target and features

target_df = data_df['stabf']

features_df = data_df.drop(columns=['stabf'])

In [16]:
target_df.head()

0    unstable
1      stable
2    unstable
3    unstable
4    unstable
Name: stabf, dtype: object

In [17]:
features_df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [18]:
# Splitting the target and features into train and test sets
x_train, x_test, y_train, y_test = train_test_split(features_df, target_df, test_size=0.2, random_state=1)

In [19]:
x_train.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,6.255995,2.542401,7.024714,9.476518,3.529888,-1.224881,-0.688228,-1.61678,0.568221,0.618403,0.685739,0.660088
5140,5.070581,5.490253,8.075688,0.761075,4.220888,-1.280596,-1.902185,-1.038107,0.443515,0.097244,0.916955,0.129254
2568,1.220072,8.804028,3.874283,8.433949,3.614027,-1.039236,-0.953566,-1.621224,0.908353,0.923594,0.238881,0.660156
3671,7.498402,6.697603,8.798626,2.126236,3.134585,-1.581906,-0.589386,-0.963293,0.260826,0.899003,0.964752,0.600598
7427,7.074006,1.337511,6.100756,7.759156,2.526922,-0.92254,-0.6326,-0.971782,0.98458,0.716082,0.836928,0.165162


In [20]:
x_test.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
9953,6.877876,4.11382,9.356768,8.299753,4.056779,-1.89747,-1.590581,-0.568728,0.276567,0.845536,0.11244,0.822562
3850,5.802841,6.271371,4.73154,3.819867,3.579569,-1.70948,-1.067511,-0.802579,0.077527,0.416478,0.912846,0.861306
4962,2.286998,4.385142,2.830232,5.29388,3.035814,-1.202764,-0.902011,-0.931039,0.924216,0.130186,0.703887,0.063811
3886,5.01992,2.209962,6.26608,0.578901,4.322584,-1.960207,-1.074561,-1.287815,0.54691,0.065992,0.427349,0.814648
5437,7.646145,9.187896,5.484219,9.934313,3.634226,-1.254541,-1.335366,-1.044319,0.561528,0.121611,0.787318,0.300314


In [21]:
y_train.head()

2694    unstable
5140    unstable
2568    unstable
3671    unstable
7427    unstable
Name: stabf, dtype: object

In [22]:
y_test.head()

9953    unstable
3850    unstable
4962      stable
3886      stable
5437    unstable
Name: stabf, dtype: object

In [23]:
# Defining a function for scaling the sets using StandardScaler
def scaler(df):
  scaler = StandardScaler()
  scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
  return scaled_df

In [24]:
# Applying the function on the x_train set
x_train_scaled = scaler(x_train)

x_train_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [25]:
# Applying the function on x_test set
x_test_scaled = scaler(x_test)

x_test_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.592163,-0.420565,1.472472,1.093036,0.426786,-1.504594,-0.792677,1.600201,-0.925703,1.175287,-1.492644,1.086291
1,0.199183,0.364543,-0.190076,-0.518473,-0.229402,-1.071766,0.427103,1.052337,-1.65591,-0.395949,1.412703,1.227535
2,-1.086035,-0.321834,-0.873505,0.011761,-0.977094,0.094896,0.813041,0.751381,1.450284,-1.44437,0.654216,-1.679799
3,-0.087014,-1.113357,0.361518,-1.684316,0.79228,-1.649041,0.410662,-0.084473,0.066085,-1.67945,-0.349573,1.057439
4,0.873004,1.425833,0.080476,1.681022,-0.154247,-0.024315,-0.197525,0.485988,0.119716,-1.475773,0.957057,-0.817608


In [26]:
# Applying LabelEncoder on the target sets
encoder = LabelEncoder()

In [27]:
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [28]:
y_train

array([1, 1, 1, ..., 1, 1, 0])

In [29]:
pd.DataFrame(y_train, columns=['stabf']).head()

Unnamed: 0,stabf
0,1
1,1
2,1
3,1
4,1


In [30]:
y_test

array([1, 1, 0, ..., 0, 1, 1])

In [31]:
pd.DataFrame(y_test, columns=['stabf']).head()

Unnamed: 0,stabf
0,1
1,1
2,0
3,0
4,1


In [32]:
# Defining a function for evaluating f1_score

def f1Score(TP:int, FP:int, FN:int, TN:int):
  precision = TP/(TP + FP)
  recall = TP/(TP + FN)
  f_score = 2*(precision*recall)/(precision + recall)
  f_score = round(f_score, 4)
  return f_score

In [33]:
# Applying the function with the parameters given in the quiz
f1Score(TP=255, FP=1380, FN=45, TN=20)

0.2636

In [34]:
# Define a function for determining the option that meets the conditions specified in the quiz question
def value_calc(option, TP, FP, FN, TN):
  recall = TP/(TP + FN)
  FP_rate = FP/(FP + TN)
  cost = 5*FP + FN

  values_dict = {f'Option {option}':[recall, FP_rate, cost]}

  return values_dict


In [35]:
dict_A = value_calc('A', 82, 2, 18, 98)

dict_A

{'Option A': [0.82, 0.02, 28]}

In [36]:
dict_B = value_calc('B', 79, 1, 21, 99)
dict_B

{'Option B': [0.79, 0.01, 26]}

In [37]:
dict_C = value_calc('C', 78, 9, 22, 91)
dict_C

{'Option C': [0.78, 0.09, 67]}

In [38]:
dict_D = value_calc('D', 90, 4, 10, 96)
dict_D

{'Option D': [0.9, 0.04, 30]}

In [39]:
values_dict = {**dict_A, **dict_B, **dict_C, **dict_D}
values_dict

{'Option A': [0.82, 0.02, 28],
 'Option B': [0.79, 0.01, 26],
 'Option C': [0.78, 0.09, 67],
 'Option D': [0.9, 0.04, 30]}

In [40]:
values_df = pd.DataFrame(values_dict, index=['Recall', 'False_Positive_Rate', 'Cost'])
values_df

Unnamed: 0,Option A,Option B,Option C,Option D
Recall,0.82,0.79,0.78,0.9
False_Positive_Rate,0.02,0.01,0.09,0.04
Cost,28.0,26.0,67.0,30.0


In [41]:
# Defining a function for model accuracy

def model_accuracy(model):
  model_ = model(random_state = 1)
  model_.fit(x_train_scaled, y_train)
  model_pred = model_.predict(x_test_scaled)
  accuracy = round(accuracy_score(y_test, model_pred), 4)
  return accuracy


In [42]:
# Applying the function to RandomForestClassifier
rand_forest_accuracy = model_accuracy(RandomForestClassifier)
rand_forest_accuracy

0.928

In [43]:
# Applying the model accuracy function to XGBClassifier
xgb_accuracy = model_accuracy(XGBClassifier)
xgb_accuracy

0.919

In [44]:
# Applying the model accuracy function to LGBMClassifier
lgbm_accuracy = model_accuracy(LGBMClassifier)
lgbm_accuracy

0.9355

In [45]:
# Setting up the parameters for hyperparameter grid search
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

In [46]:
hyperparam_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features':max_features}

In [47]:
ext_trees_cl = ExtraTreesClassifier(random_state=1)

In [48]:
rand_grid_search = RandomizedSearchCV(ext_trees_cl, param_distributions=hyperparam_grid,
                                      random_state=1,
                                      scoring='accuracy',
                                      n_iter=10,
                                      cv=5,
                                      n_jobs=-1,
                                      verbose=1)

In [49]:
rand_grid_search.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=ExtraTreesClassifier(random_state=1),
                   n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 3, 5, 7, 9],
                                        'n_estimators': [50, 100, 300, 500,
                                                         1000]},
                   random_state=1, scoring='accuracy', verbose=1)

In [50]:
rand_grid_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [51]:
# For ExtraTreesClassifier with no hyperparameter tuning

ext_trees_cl.fit(x_train_scaled, y_train)

ext_trees_pred = ext_trees_cl.predict(x_test_scaled)

untuned_acc_score = round(accuracy_score(y_test, ext_trees_pred), 4)

untuned_acc_score

0.926

In [52]:
# For ExtraTreeClassifier with the selected hyperparameter values
ext_trees_tuned = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2,
                                       min_samples_leaf=8,
                                       max_features=None, random_state=1)

ext_trees_tuned.fit(x_train_scaled, y_train)

new_pred = ext_trees_tuned.predict(x_test_scaled)

tuned_acc_score = accuracy_score(y_test, ext_trees_pred)

tuned_acc_score

0.926

In [53]:
# Feature importance for the features_df
feature_imp = ext_trees_tuned.feature_importances_

feature_imp

array([0.13723975, 0.1405075 , 0.13468029, 0.13541676, 0.00368342,
       0.00533686, 0.00542927, 0.00496249, 0.10256244, 0.10757765,
       0.11306268, 0.10954089])

In [54]:
sorted(zip(feature_imp, features_df), reverse=True)

[(0.14050750384993677, 'tau2'),
 (0.13723974766109256, 'tau1'),
 (0.1354167630909727, 'tau4'),
 (0.13468028520386593, 'tau3'),
 (0.11306267999167334, 'g3'),
 (0.10954089174337298, 'g4'),
 (0.10757764577478764, 'g2'),
 (0.10256244080927947, 'g1'),
 (0.005429268421191957, 'p3'),
 (0.005336864710946151, 'p2'),
 (0.004962486591192238, 'p4'),
 (0.003683422151688322, 'p1')]