# NNONA Ebuka John
@ebukajohnnn

# Preparing the Data for the Quiz

In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#Loading (reading) as dataframe from csv
df = pd.read_csv('Data_for_UCI_named.csv')

In [3]:
# checking the head of the data
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
# checking for missing values
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [5]:
# from the Instructions, we are told to drop the 'stab' columns
df = df.drop(columns = 'stab')

In [6]:
# before we split the data
df.stabf.value_counts() 

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [7]:
# Splitting the data into the Predictors(Features) and Labels(Response)
X = df.drop(['stabf'],axis = 1)
y = df['stabf']

In [8]:
# Assigning/Splitting the data into testing and training sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=1)

In [9]:
#after split
y_train.value_counts() 

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [10]:
#after split
y_test.value_counts() 

unstable    1288
stable       712
Name: stabf, dtype: int64

In [11]:
#checking the Xtrain dataframe before transformation is done to it
X_train.head(3) 

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,6.255995,2.542401,7.024714,9.476518,3.529888,-1.224881,-0.688228,-1.61678,0.568221,0.618403,0.685739,0.660088
5140,5.070581,5.490253,8.075688,0.761075,4.220888,-1.280596,-1.902185,-1.038107,0.443515,0.097244,0.916955,0.129254
2568,1.220072,8.804028,3.874283,8.433949,3.614027,-1.039236,-0.953566,-1.621224,0.908353,0.923594,0.238881,0.660156


In [12]:
#checking the Xtest dataframe before transformation is done to it
X_test.head(3) 

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
9953,6.877876,4.11382,9.356768,8.299753,4.056779,-1.89747,-1.590581,-0.568728,0.276567,0.845536,0.11244,0.822562
3850,5.802841,6.271371,4.73154,3.819867,3.579569,-1.70948,-1.067511,-0.802579,0.077527,0.416478,0.912846,0.861306
4962,2.286998,4.385142,2.830232,5.29388,3.035814,-1.202764,-0.902011,-0.931039,0.924216,0.130186,0.703887,0.063811


In [13]:
#As instructed we are told to carry out Standard Scaling.
#Here we use the Standard Scaler transformation technique
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [14]:
#Transforming the X_train (feature)data
transformed_X_train = scaler.fit_transform(X_train)
transformed_X_train = pd.DataFrame(transformed_X_train, columns = X_train.columns)

In [15]:
#after applying transformation on the data
transformed_X_train.head() 

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [16]:
#Transforming the X_test (feature)data
transformed_X_test = scaler.transform(X_test)
transformed_X_test = pd.DataFrame(transformed_X_test, columns = X_test.columns)

In [17]:
#after applying transformation on the data
transformed_X_test.head() 

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.593951,-0.412733,1.503924,1.116943,0.403423,-1.492971,-0.785033,1.566781,-0.901007,1.167203,-1.50733,1.084726
1,0.20219,0.374416,-0.1888,-0.522268,-0.225967,-1.058483,0.420047,1.028627,-1.625721,-0.39566,1.414651,1.226011
2,-1.079044,-0.313745,-0.884634,0.01708,-0.943122,0.112653,0.801335,0.733004,1.457108,-1.438495,0.651821,-1.682168
3,-0.08312,-1.107327,0.372805,-1.708152,0.75399,-1.637972,0.403805,-0.088036,0.083322,-1.672322,-0.357714,1.055865
4,0.873921,1.438466,0.086662,1.715037,-0.15388,-0.007015,-0.197053,0.472315,0.136549,-1.469731,0.956396,-0.819727


# Data is ready for the Quiz

# Question 1

A classifier predict if insurance claims are fraudulent or not.
The cost of paying a fraudulent claim is higher than the cost of investigating a claim that is suspected to be fraudulent. Which metrics should we use to evaluate this classifier?

In [18]:
#Ans: Precision and F1 Score

# Question 2

Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [19]:
#Ans: tau2, p1

# Question 3

A medical company is building a model to predict the occurence of thyroid cancer. The training data contains 900 negative instances (people who don't have cancer) and 100 positive instances. The resulting model has 90% accuracy, but extremely poor recall. What steps can be used to improve the model's performance? (SELECT TWO)

In [20]:
#Ans: Collect more data for the positive case.

In [21]:
#Ans: Over-sample instances from the negative (no cancer)class.

# Question 4

Jack is working on classification modelling. While evaluating the model, he saw that the distance between test and training error is a big positive number with a low training error. Which of the following is he currently facing?

In [22]:
#Ans: Overfitting

# Question 5

A random forest classifier was used to classify handwritten digits 0-9 into the numbers they were intended to represent. The confusion matrix below was generated from the results. Based on the matrix, which number was predicted with the least accuracy?

In [23]:
#Ans: 9

# Question 6

Which of these is not a good metric for evaluating classification algorithms for data with imbalanced class problems?

In [24]:
#Ans: Accuracy

# Question 7

The ROC curve above was generated from a classification algorithm. What can we say about this classifier?

In [25]:
#Ans: The model has no discrimination capacity to differentiate between the positive and the negative class

# Question 8

What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.

In [34]:
#xgboost
from xgboost import XGBClassifier
extreme = XGBClassifier(random_state =1)
extreme.fit(transformed_X_train, y_train)
extreme_pred = extreme.predict(transformed_X_test)



In [27]:
#classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, extreme_pred))

              precision    recall  f1-score   support

      stable       0.94      0.91      0.92       712
    unstable       0.95      0.97      0.96      1288

    accuracy                           0.95      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.95      0.95      0.95      2000



In [29]:
from sklearn.metrics import accuracy_score

In [30]:
round(accuracy_score(y_test,extreme_pred),4) # Giving our accuracy using Xgboost classifier in 4DP

0.9455

# Question 9

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = 1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [None]:
#Ans: N_estimators = 1000, min_samples_split = 2, min_samples_leaf = 8, max_features = None

In [37]:
from sklearn.ensemble import ExtraTreesClassifier

Tree_CLass = ExtraTreesClassifier (random_state = 1)   # recall random state of 1 was used throughout the quiz

In [38]:
n_estimators = [50, 100, 300, 500, 1000]    # Given we are to use this parameters to answer the question

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [39]:
# importing RandomizedSearchCV as instructed for the quiz
from sklearn.model_selection import RandomizedSearchCV   

In [40]:
# According to the parameters given to instantiate
Rand_search = RandomizedSearchCV(estimator = Tree_CLass, param_distributions= hyperparameter_grid, random_state=1,cv = 5, n_iter=10,scoring='accuracy',n_jobs=1, verbose=1)

In [41]:
search = Rand_search.fit(transformed_X_train,y_train)     # fitting in the parameters into the training data

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [42]:
#checking for the best parameter for the model
search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

# Question 10

Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreeClassifier model with no hyperparameter tuning?

In [None]:
#Ans: Lower

In [43]:
#experimenting with this generated parameter to test the model's performance
best_Tree_Class = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, 
                                 min_samples_leaf=8, max_features=None)
best_Tree_Class.fit(transformed_X_train, y_train)
best_Tree_Class = best_Tree_Class.predict(transformed_X_test)

In [44]:
print(classification_report(y_test,best_Tree_Class, digits=4)) #  adding digits = 4 is to get my answer in 4dp.
print('\n')
print("Accuracy score {}".format(accuracy_score(y_test, best_Tree_Class)))

              precision    recall  f1-score   support

      stable     0.9241    0.8722    0.8974       712
    unstable     0.9315    0.9604    0.9457      1288

    accuracy                         0.9290      2000
   macro avg     0.9278    0.9163    0.9216      2000
weighted avg     0.9289    0.9290    0.9285      2000



Accuracy score 0.929


In [45]:
# COMPARING THIS RESULT WITH THE ORIGINAL EXTRA-TREE CLASSIFER WITHOUT TUNING

Tree_CLass.fit(transformed_X_train,y_train)
Tree_predict = Tree_CLass.predict(transformed_X_test)

# We can see here that the accuracy of the previous (New Optimal Model) is Lower than that without Hyperparameter Tuning
print(classification_report(y_test,Tree_predict))   


              precision    recall  f1-score   support

      stable       0.94      0.85      0.89       712
    unstable       0.92      0.97      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



# Question 11

What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [None]:
#Ans: 0.9375

In [46]:
import lightgbm as lgbm

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
lgbm = lgbm.LGBMClassifier(random_state=1)
lgbm.fit(transformed_X_train,y_train)
lgbm_predict  = lgbm.predict(transformed_X_test)

In [None]:
# giving our value of accuracy in 4DP
round(accuracy_score(y_test, lgbm_predict),4) 

# Question 12

Why do we use weak learners in boosting, instead of strong learners?

In [None]:
#Ans: To prevent overfitting

# Question 13

You are building a classifier and the accuracy is poor on both the training and test sets. Which would you use to try to improve the performance?

In [None]:
#Ans: Boosting

# Question 14

You are developing a machine learning classification algorithm that categorizes handwritten digits 0-9 into the numbers they represent. How should you pre-process the label data?

In [None]:
#Ans: One-hot encoding

# Question 15

You are working on a spam classification system using regularized logistic regression. "Spam" is a positive class (y=1) and "not spam" is the negative class (y=0).You have trained your classifier and there are n = 1700 examples in the test set. The confusion matrix of predicted class vs. actual class is:

In [None]:
#Ans: 0.3177

In [36]:
# Given total instances (n) = 1700
# F1 is given as 2 * (Precision*Recall)/(Precision + Recall)
# From the confusion matrix we have;

Precision = (355/ ( 355+1480)) 
Recall =  (355 /(355+45)) 
F1_Score = 2 * (Precision*Recall)/(Precision + Recall)
print(round(F1_Score,4))

0.3177


# Question 16

Which of the following metric is generally NOT useful for a classification problem?

In [None]:
#Ans:Accuracy

# Question 17

What other hyperparameter optimization method can you try apart from Random Search?

In [None]:
#Ans: Bayesian Optimization and Grid Search 

# Question 18

What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [21]:
#importing our classifier and fitting in the training data
from sklearn.ensemble import RandomForestClassifier  
Random_C = RandomForestClassifier(random_state=1)
Random_C.fit(transformed_X_train,y_train)

RandomForestClassifier(random_state=1)

In [23]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

RandomForestClassifier(random_state=1)

In [25]:
predict = Random_C.predict(transformed_X_test)

In [26]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

print("Accuracy score {}".format(round(accuracy_score(y_test, predict), 4)))

Accuracy score 0.929


# Question 19

What is the entropy of the target variable if its actual values are given as:

In [None]:
#Ans: -3/7 log(3/7)-4/7 log(4/7)

# Question 20

According to a use-case, in a certain ML task, a false positive is six times costlier than a false negative. You, as a Data Scientist, trained 4 models, to solve the use case.
Keep the following evaluation criteria in mind.
1) Must have a recall rate of at least 80%
2) Must have a false positive rate of 8% or less
3) Must minimize business costs
After creating each binary classification model,you generated the corresponding confusion matrix. Which confusion matrix represents the model that satisfies the requirements?

In [None]:
#Ans: TN = 98%, FP = 2%, FN = 18%, TP = 82%

In [None]:
# recall that Recall is given as TP / TP+FN

# for condtion 1, TN = 98%, FP = 2%, FN = 18%, TP = 82%
#Recall = TP/TP+FN = 82/82+18 = 0.82, this is above 80%....

# Condition #2, The FP here is 2% which meets this condition.....

# Condition #3, to confirm if it minimizes business cost, we were told FP is 6*FN
# FP*5 = 2*6 = 12% which gives the smallest compared to the other options.