<a href="https://colab.research.google.com/github/froschi95/Hamoye_data_science/blob/master/Stage_C_Quiz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MACHINE LEARNING - CLASSIFICATION: MANAGING THE QUALITY METRIC OF THE  GLOBAL ECOLOGICAL FOOTRINT

The dataset was obtained from National Footprints and Biocapacity Accounts

In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, LeaveOneOut, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC


%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 144

  import pandas.util.testing as tm


In [3]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv')
pd.set_option('display.max_columns', None)

df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
df = df.drop(columns='stab')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stabf   10000 non-null  object 
dtypes: float64(12), object(1)
memory usage: 1015.8+ KB


To confirm that there are no missing values in our data set

In [5]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stabf    0
dtype: int64

In [6]:
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [7]:
X = df.drop(columns='stabf')
y = df['stabf']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [8]:
y_test.value_counts()

unstable    1288
stable       712
Name: stabf, dtype: int64

In [9]:
x_train.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,6.255995,2.542401,7.024714,9.476518,3.529888,-1.224881,-0.688228,-1.61678,0.568221,0.618403,0.685739,0.660088
5140,5.070581,5.490253,8.075688,0.761075,4.220888,-1.280596,-1.902185,-1.038107,0.443515,0.097244,0.916955,0.129254
2568,1.220072,8.804028,3.874283,8.433949,3.614027,-1.039236,-0.953566,-1.621224,0.908353,0.923594,0.238881,0.660156
3671,7.498402,6.697603,8.798626,2.126236,3.134585,-1.581906,-0.589386,-0.963293,0.260826,0.899003,0.964752,0.600598
7427,7.074006,1.337511,6.100756,7.759156,2.526922,-0.92254,-0.6326,-0.971782,0.98458,0.716082,0.836928,0.165162


In [10]:
# Transforming/normalising the features
scaler = StandardScaler()
x_train_norm = scaler.fit_transform(x_train, y_train)
x_test_norm = scaler.transform(x_test)

In [11]:
x_train_norm = pd.DataFrame(x_train_norm, columns=x_train.columns)
x_test_norm = pd.DataFrame(x_test_norm, columns=x_test.columns)
x_train_norm.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


## ExtraTreesClassifier and Optimized ExtraTreesClassifier(using RandomizedSearchCV

In [12]:
ext_clf = ExtraTreesClassifier(random_state=1, verbose=1)

n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 

hyperparam_grid = {'n_estimators': n_estimators,
                   'min_samples_leaf': min_samples_leaf,
                   'min_samples_split': min_samples_split,
                   'max_features': max_features}

rnd_scv = RandomizedSearchCV(ext_clf, hyperparam_grid, verbose=1, random_state=1)

In [13]:
ext_clf.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [14]:
ext_optimized = rnd_scv.fit(x_train_norm, y_train)
# Fitting/training the optimized model

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[

In [15]:
ext_optimized.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [16]:
ypred1 = ext_optimized.predict(x_test_norm)
clf_report = classification_report(y_test, ypred1, digits=4)
print(clf_report)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000



[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.3s finished


In [17]:
ext_clf.fit(x_train_norm, y_train)
ypred = ext_clf.predict(x_test_norm)

clf_report1 = classification_report(y_test, ypred, digits=4)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [18]:
print(clf_report1)

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



## RandomForestClassifier

In [19]:
rndf_clf = RandomForestClassifier(random_state=1, verbose=1)
rndf_clf.fit(x_train_norm, y_train)

ypred2 = rndf_clf.predict(x_test_norm)

clf_report2 = classification_report(y_test, ypred2, digits=4)
print(clf_report2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000



[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


##XGBoost Classifier

In [20]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(max_depth=3, learning_rate=0.1, random_state=1)
xgb_model.fit(x_train_norm, y_train)
ypred3 = xgb_model.predict(x_test_norm)

In [21]:
clf_report3 = classification_report(y_test, ypred3, digits=4)
print(clf_report3)

              precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000



## LightGBM Classifier

In [22]:
from lightgbm import LGBMClassifier
lgb_model = LGBMClassifier(random_state=1)
lgb_model.fit(x_train_norm, y_train)
ypred4 = lgb_model.predict(x_test_norm)

clf_report4 = classification_report(y_test, ypred4, digits=4)
print(clf_report4)

              precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000



In [23]:
acc4 = accuracy_score(y_true=y_test, y_pred=ypred4)
recall4 = recall_score(y_true=y_test, y_pred=ypred4, pos_label='unstable')
precision4 = precision_score(y_true=y_test, y_pred=ypred4, pos_label='unstable')
print(acc4, recall4, precision4)

0.9375 0.9627329192546584 0.9415337889141989


## COMBINING ALL MODEL PERFORMANCES IN A DATAFRAME

In [24]:
models = ['ExtraTreesClf', 'Optimized_ExtraTreesClf', 'RandomForest', 'XGBoost', 'LightGBM']
predictions = [ypred, ypred1, ypred2, ypred3, ypred4]
results = []
for model in models:
    acc = accuracy_score(y_true=y_test, y_pred=predictions[models.index(model)])
    prec = precision_score(y_true=y_test, y_pred=predictions[models.index(model)], pos_label='stable')
    rec = recall_score(y_true=y_test, y_pred=predictions[models.index(model)], pos_label='stable')
    f1 = f1_score(y_true=y_test, y_pred=predictions[models.index(model)], pos_label='stable')
    results.append([model, acc, prec, rec, f1])
#     model_performance = model_performance.append()
#     model_performance.head()
results

[['ExtraTreesClf',
  0.928,
  0.9409937888198758,
  0.851123595505618,
  0.8938053097345133],
 ['Optimized_ExtraTreesClf',
  0.927,
  0.9211309523809523,
  0.8693820224719101,
  0.8945086705202311],
 ['RandomForest',
  0.929,
  0.9191176470588235,
  0.8778089887640449,
  0.8979885057471264],
 ['XGBoost',
  0.9195,
  0.9206106870229007,
  0.8469101123595506,
  0.8822238478419898],
 ['LightGBM',
  0.9375,
  0.9297218155197657,
  0.8918539325842697,
  0.910394265232975]]

In [25]:
model_performance = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall','F1-score'])
model_performance

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,ExtraTreesClf,0.928,0.940994,0.851124,0.893805
1,Optimized_ExtraTreesClf,0.927,0.921131,0.869382,0.894509
2,RandomForest,0.929,0.919118,0.877809,0.897989
3,XGBoost,0.9195,0.920611,0.84691,0.882224
4,LightGBM,0.9375,0.929722,0.891854,0.910394


## Feature Importances

---



In [27]:
feat_import = ext_clf.feature_importances_
sorted(zip(feat_import, x_train.columns), reverse=True)

[(0.11844468079199041, 'tau2'),
 (0.11739736493320078, 'tau1'),
 (0.11546569217199552, 'tau4'),
 (0.11316851246674982, 'tau3'),
 (0.09688268324346265, 'g3'),
 (0.09401881529815702, 'g4'),
 (0.09367635844551439, 'g2'),
 (0.08978290601483987, 'g1'),
 (0.040706278296424536, 'p3'),
 (0.040578638540044426, 'p4'),
 (0.04037131556379323, 'p2'),
 (0.039506754233827476, 'p1')]

## Confusion Matrix for all The Models in the following order:

- ExtraTreesClassifier
- Optimized ExtraTreesClassifier
- RandomForestClassifier
- XGBClassifier
- LGBMClassifier

In [28]:
cnf = confusion_matrix(y_true=y_test, y_pred=ypred, labels=['stable', 'unstable'])
cnf

array([[ 606,  106],
       [  38, 1250]])

In [29]:
cnf1 = confusion_matrix(y_true=y_test, y_pred=ypred1, labels=['stable', 'unstable'])
cnf1

array([[ 619,   93],
       [  53, 1235]])

In [30]:
cnf2 = confusion_matrix(y_true=y_test, y_pred=ypred2, labels=['stable', 'unstable'])
cnf2

array([[ 625,   87],
       [  55, 1233]])

In [31]:
cnf3 = confusion_matrix(y_true=y_test, y_pred=ypred3, labels=['stable', 'unstable'])
cnf3

array([[ 603,  109],
       [  52, 1236]])

In [32]:
cnf4 = confusion_matrix(y_true=y_test, y_pred=ypred4, labels=['stable', 'unstable'])
cnf4

array([[ 635,   77],
       [  48, 1240]])