In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('saveris_alarms_weather.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,date,location,status,alarm_type,limit,value,platform_1,platform_2,deepfreezer_1,...,freezer_2,freezer_3,freezer_4,cooling_1,cooling_2,sensor_alarm,temp,temp_min,temp_max,humidity
0,2,2021-06-24 17:00:00,Freezer,0,recovery,-19,-20.0,2.3,0.1,-23.0,...,-19.9,-20.0,-21.6,3.4,3.5,freezer_3,19.68,18.79,24.18,62.0
1,3,2021-06-24 17:30:00,Deepfreezer,0,recovery,-25,-25.9,2.4,0.6,-25.9,...,-20.8,-21.2,-22.1,3.1,3.2,deepfreezer_1,19.68,18.79,24.18,62.0
2,4,2021-06-24 17:30:00,Deepfreezer,0,recovery,-25,-25.0,2.4,0.6,-25.9,...,-20.8,-21.2,-22.1,3.1,3.2,deepfreezer_2,19.68,18.79,24.18,62.0
3,5,2021-06-24 17:45:00,Platform,1,deficit,1,0.6,2.4,0.6,-26.4,...,-21.0,-21.5,-22.2,3.1,3.2,platform_2,19.68,18.79,24.18,62.0
4,6,2021-06-24 19:15:00,Platform,1,deficit,1,0.8,0.8,-0.9,-27.2,...,-21.0,-22.2,-22.4,3.0,3.1,platform_1,19.68,18.79,24.18,62.0


In [5]:

columns = ["Unnamed: 0", "date", "location", "status", "alarm_type", "limit", "value", "platform_1" "platform_2", "deepfreezer_1", 
          "deepfreezer_2", "freezer_1", "freezer_2", "freezer_3", "freezer_4", "cooling_1", "cooling_2", "sensor_alarm", "temp", 
           "temp_min", "temp_max", "humidity"]

target = ["status"]

In [6]:

df = df.drop(["Unnamed: 0", "date", "location", "alarm_type", "sensor_alarm"], axis=1)
df.head()


Unnamed: 0,status,limit,value,platform_1,platform_2,deepfreezer_1,deepfreezer_2,freezer_1,freezer_2,freezer_3,freezer_4,cooling_1,cooling_2,temp,temp_min,temp_max,humidity
0,0,-19,-20.0,2.3,0.1,-23.0,-24.0,-20.2,-19.9,-20.0,-21.6,3.4,3.5,19.68,18.79,24.18,62.0
1,0,-25,-25.9,2.4,0.6,-25.9,-25.0,-20.9,-20.8,-21.2,-22.1,3.1,3.2,19.68,18.79,24.18,62.0
2,0,-25,-25.0,2.4,0.6,-25.9,-25.0,-20.9,-20.8,-21.2,-22.1,3.1,3.2,19.68,18.79,24.18,62.0
3,1,1,0.6,2.4,0.6,-26.4,-25.4,-21.0,-21.0,-21.5,-22.2,3.1,3.2,19.68,18.79,24.18,62.0
4,1,1,0.8,0.8,-0.9,-27.2,-26.2,-21.1,-21.0,-22.2,-22.4,3.0,3.1,19.68,18.79,24.18,62.0


# Split the Data into Training and Testing

In [7]:
#"limit", "value", "platform_1", "platform_2", "deepfreezer_1", "deepfreezer_2", "freezer_1", "freezer_2", "freezer_3", "freezer_4", "cooling_1", "cooling_2",
# Create our features
X_df = df.drop(columns=['status'])
X = pd.get_dummies(X_df, columns=[ "temp", 
           "temp_min", "temp_max", "humidity"], drop_first = True)

# Create our target
y = df["status"].to_frame()

In [8]:
X.describe()

Unnamed: 0,limit,value,platform_1,platform_2,deepfreezer_1,deepfreezer_2,freezer_1,freezer_2,freezer_3,freezer_4,...,humidity_91.0,humidity_92.0,humidity_93.0,humidity_94.0,humidity_95.0,humidity_96.0,humidity_97.0,humidity_98.0,humidity_99.0,humidity_100.0
count,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,...,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0
mean,-23.879478,-23.744311,2.314645,1.40072,-30.694064,-30.467606,-22.951252,-23.075161,-23.771144,-23.670874,...,0.024434,0.027432,0.019487,0.028631,0.008544,0.018888,0.009144,0.004197,0.005996,0.000899
std,9.085741,8.967414,1.161922,1.27632,2.471188,2.399975,1.466239,1.506519,1.391468,1.357735,...,0.154404,0.163351,0.138241,0.166781,0.092047,0.136139,0.095193,0.064655,0.077208,0.029979
min,-38.0,-36.0,-3.6,-4.6,-36.1,-35.7,-27.5,-28.3,-30.4,-29.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-32.0,-30.8,1.7,0.7,-32.3,-32.0,-24.1,-24.2,-24.6,-24.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-24.0,-23.9,2.3,1.4,-31.1,-30.8,-22.8,-23.0,-23.8,-23.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,-22.0,-22.0,2.9,2.1,-29.5,-29.4,-21.9,-21.9,-22.7,-22.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.0,7.7,7.8,7.8,-15.7,-2.8,-7.2,-17.1,-18.8,-18.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y['status'].value_counts()

1    3367
0    3304
Name: status, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [11]:
# Scale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators =1000, random_state=1)
model = brfc.fit(X_train_scaled, y_train)
BalancedRandomForestClassifier()

BalancedRandomForestClassifier()

In [12]:
# Calculate the balanced accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

predictions = model.predict(X_test_scaled)
accuracy_score(y_test, predictions)

0.920863309352518

In [13]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[755,  71],
       [ 61, 781]], dtype=int64)

In [14]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       826
           1       0.92      0.93      0.92       842

    accuracy                           0.92      1668
   macro avg       0.92      0.92      0.92      1668
weighted avg       0.92      0.92      0.92      1668



In [15]:
# List the features sorted in descending order by feature importance
sorted(zip(model.feature_importances_, X.columns), reverse=True)[:50]

[(0.23938612079610394, 'value'),
 (0.07514264304338042, 'freezer_1'),
 (0.07157315920535583, 'limit'),
 (0.06573005173830687, 'freezer_2'),
 (0.06296348919109006, 'deepfreezer_1'),
 (0.055836455279536276, 'deepfreezer_2'),
 (0.04075486324244635, 'freezer_3'),
 (0.03425512211786859, 'freezer_4'),
 (0.02949505142855379, 'platform_2'),
 (0.028232332472928726, 'platform_1'),
 (0.023515414823496464, 'cooling_1'),
 (0.021436628765905505, 'cooling_2'),
 (0.0019247408075000938, 'temp_min_15.18'),
 (0.001864992951477031, 'temp_min_14.18'),
 (0.0017621752469713472, 'temp_min_12.18'),
 (0.0017607932728474867, 'temp_min_13.18'),
 (0.0016286179509258246, 'temp_max_14.99'),
 (0.0015661808702040407, 'humidity_92.0'),
 (0.0015409242640036318, 'temp_max_17.89'),
 (0.001488316854341851, 'temp_min_11.18'),
 (0.0014840249807608766, 'temp_max_15.55'),
 (0.0014303438761435162, 'temp_max_16.1'),
 (0.0013796931733580835, 'temp_min_14.89'),
 (0.0013516489294497097, 'temp_min_10.18'),
 (0.0013092998132099444, '

### Easy Ensemble AdaBoost Classifier

In [16]:
# Train the EasyEnsembleClassifier

from imblearn.ensemble import EasyEnsembleClassifier

model = EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=1, random_state=1, 
                                   replacement=False, sampling_strategy='auto', verbose=0, 
                                   warm_start=False)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

EasyEnsembleClassifier()

EasyEnsembleClassifier()

In [17]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
predictions = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, predictions)

0.9428965394282034

In [18]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[766,  60],
       [ 35, 807]], dtype=int64)

In [19]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       826
           1       0.93      0.96      0.94       842

    accuracy                           0.94      1668
   macro avg       0.94      0.94      0.94      1668
weighted avg       0.94      0.94      0.94      1668

