### MACHINE LEARNING: CLASSIFICATION QUIZ

In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np

In [2]:
# Reading in the data
data = pd.read_csv("grid_stab.csv")
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
# Exploring the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [4]:
data.shape

(10000, 14)

In [5]:
data = data.drop(columns=["stab"])

In [6]:
data.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')

In [7]:
data.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [8]:
# Splitting the features and labels
X = data.drop(columns=["stabf"])
y = data.stabf

In [9]:
X.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4'],
      dtype='object')

In [10]:
y.head()

0    unstable
1      stable
2    unstable
3    unstable
4    unstable
Name: stabf, dtype: object

In [11]:
# Splitting into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [12]:
# Scaling the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [13]:
y_train.head()

2694    unstable
5140    unstable
2568    unstable
3671    unstable
7427    unstable
Name: stabf, dtype: object

In [14]:
def encode_labels(y, encoder1, encoder2):
    
    labels = encoder1.fit_transform(y)
    labels = encoder2.fit_transform(labels.reshape(-1, 1)).toarray()[:, 0]
                                    
    return labels

In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = LabelEncoder()
ohe = OneHotEncoder()

In [16]:
# Encoding the labels
y_train = encode_labels(y_train, encoder, ohe)
y_test = encode_labels(y_test, encoder, ohe)

In [17]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_pred1 = rf.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred1)

0.926

The accuracy of the RandomForest classifier is = 0.926

In [19]:
# Extra Trees Classifier

et = ExtraTreesClassifier(random_state=1)
et.fit(X_train, y_train)
y_pred2 = et.predict(X_test)

In [20]:
accuracy_score(y_test, y_pred2)

0.922

The accuracy of the Extra Trees classifier is = 0.922

In [21]:
# XGBoost classifier
import xgboost as xgb

model = xgb.XGBClassifier(random_state=1)
model.fit(X_train, y_train)
y_pred3 = model.predict(X_test)





In [22]:
accuracy_score(y_test, y_pred3)

0.946

The accuracy of the XGBoost classifier is = 0.946

In [23]:
# Lightgbm classifier
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(random_state=1)
lgbm.fit(X_train, y_train)
y_pred4 = lgbm.predict(X_test)

In [24]:
accuracy_score(y_test, y_pred4)

0.9365

The accuracy of the Lightgbm classifier is = 0.9365

In [25]:
# Improving the Extra Trees Classifier
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ["auto", "sqrt", "log2", None]

param_grid = {"n_estimators": n_estimators,
              "min_samples_leaf": min_samples_leaf,
              "min_samples_split": min_samples_split,
              "max_features": max_features}

rand_search = RandomizedSearchCV(et, 
                        param_grid, 
                        n_iter=10, 
                        scoring="accuracy",
                        n_jobs=-1,
                        verbose=1,
                       random_state=1)

In [26]:
rand_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=ExtraTreesClassifier(random_state=1), n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 3, 5, 7, 9],
                                        'n_estimators': [50, 100, 300, 500,
                                                         1000]},
                   random_state=1, scoring='accuracy', verbose=1)

In [27]:
rand_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [28]:
model = rand_search.best_estimator_
y_pred5 = model.predict(X_test)
accuracy_score(y_test, y_pred5)

0.9285

The accuracy of the improved Extra Trees classifier is = 0.9285

In [29]:
X.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4'],
      dtype='object')

In [30]:
model.feature_importances_

array([0.13723975, 0.1405075 , 0.13468029, 0.13541676, 0.00368342,
       0.00533686, 0.00542927, 0.00496249, 0.10256244, 0.10757765,
       0.11306268, 0.10954089])

In [31]:
features = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["importance"])

In [32]:
features.sort_values(by="importance", ascending=False)

Unnamed: 0,importance
tau2,0.140508
tau1,0.13724
tau4,0.135417
tau3,0.13468
g3,0.113063
g4,0.109541
g2,0.107578
g1,0.102562
p3,0.005429
p2,0.005337


* tau2 is the most important 
* p1 is the least important